1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from bs4 import BeautifulSoup
import bs4
import re

# Read the whole source page into memory, decoding it as UTF-8.
with open('index.html', 'r', encoding='utf8') as f:
    htmlfile = f.read()

# Parse the markup using the lxml parser backend.
soup = BeautifulSoup(htmlfile, 'lxml')

# Accumulator for the tags kept by visit_c(), in the order they are visited.
ls = []

# Tag names of the HTML heading elements (h1 through h6).
_HEADING_NAME = re.compile(r'h[1-6]')


def visit_c(p):
    """Collect the tags worth keeping under ``p`` into the module-level ``ls``.

    The tag is handled according to its name:

    * ``p``, ``strong``, ``table`` and headings ``h1``-``h6`` are appended
      to ``ls`` unchanged and are not descended into.
    * ``a`` tags that carry an ``href`` are replaced by a newly built ``a``
      tag holding only ``href``, ``target`` (when present) and the
      anchor's text; every other attribute is dropped.
    * Any other tag is not kept itself; its child tags are visited
      recursively.

    Args:
        p: The tag to inspect. It must expose ``name``, ``attrs`` and
            ``children``.

    Returns:
        None. Results are appended to ``ls``.
    """
    name = p.name
    if name in ('p', 'strong', 'table') or _HEADING_NAME.fullmatch(name):
        ls.append(p)
    elif name == 'a' and 'href' in p.attrs:
        # Rebuild the anchor so that only the link-related attributes survive.
        attrs_d = {'href': p.attrs['href']}
        if 'target' in p.attrs:
            attrs_d['target'] = p.attrs['target']
        na = bs4.element.Tag(name='a', attrs=attrs_d)
        na.append(p.text)
        ls.append(na)
    else:
        # Container or uninteresting tag: look for keepers further down.
        for c in p.children:
            # Only Tag children are visited; other node types are skipped.
            if isinstance(c, bs4.element.Tag):
                visit_c(c)

# Debug aid: dump the attribute names available on the parsed document.
print(dir(soup))

# Visit each direct child of the document; children that are not Tag
# instances are skipped.
for c in soup.children:
    if isinstance(c, bs4.element.Tag):
        visit_c(c)

# Echo every collected tag to stdout and write it to the output file,
# one tag per line.
with open('test2.html', 'w', encoding='utf8') as f:
    for tag in ls:
        print(tag)
        f.write(str(tag) + '\n')

如上代码提取 index.html 文件中的 p 标签、strong 标签、h 标题标签、table 表格标签,以及带 href 属性的 a 锚标签内容,组成新的 html 文件 test2.html,去掉原文件里复杂冗余的信息,形成一个较为干净的 html 文件。

2310071821