提取html文件中的部分组件元素

from bs4 import BeautifulSoup
import bs4
import re

# Read the whole source page into memory before parsing.
with open('index.html', mode='r', encoding='utf8') as f:
	htmlfile = f.read()

# Build the parse tree using the lxml backend.
soup = BeautifulSoup(htmlfile, features='lxml')

# Collected output tags; visit_c appends to this list as it walks the tree.
ls = list()

# Matches exactly the six HTML heading tag names (h1-h6); compiled once.
_HEADING_RE = re.compile(r'h[1-6]')


def visit_c(p, out=None):
	"""Collect the tags worth keeping from ``p`` and its descendants.

	``p``, ``strong``, ``h1``-``h6`` and ``table`` tags are kept unchanged,
	together with their whole subtree. An ``a`` tag that has an ``href`` is
	replaced by a fresh ``a`` tag carrying only ``href``, the optional
	``target`` attribute and the link text. Any other tag is not kept itself;
	its child tags are visited recursively instead.

	Args:
		p: The tag to inspect.
		out: List that receives the collected tags. When omitted, the
			module-level ``ls`` list is used.
	"""
	if out is None:
		# Looked up at call time so plain ``visit_c(tag)`` calls keep
		# filling the shared module-level list.
		out = ls

	if p.name in ('p', 'strong', 'table') or _HEADING_RE.fullmatch(p.name):
		out.append(p)
	elif p.name == 'a' and 'href' in p.attrs:
		attrs_d = {'href': p.attrs['href']}
		if 'target' in p.attrs:
			attrs_d['target'] = p.attrs['target']
		# Rebuild the anchor so every other attribute and any nested markup
		# is dropped; only the link text is carried over.
		na = bs4.element.Tag(name='a', attrs=attrs_d)
		na.append(p.text)
		out.append(na)
	else:
		# Not a kept tag: descend into child tags only (non-Tag children
		# such as bare text are ignored).
		for c in p.children:
			if isinstance(c, bs4.element.Tag):
				visit_c(c, out)

# Walk the top-level nodes of the parsed document. Only Tag nodes are handed
# to visit_c; other node types (e.g. bare text) are skipped.
for c in soup.children:
	if isinstance(c, bs4.element.Tag):
		visit_c(c)

# Write one collected element per line, echoing each one to stdout as well.
with open('test2.html', 'w', encoding='utf8') as f:
	for element in ls:
		print(element)
		f.write(str(element) + '\n')

以上代码从 index.html 文件中提取 p 标签、strong 标签、h 标题标签、表格（table）标签和锚（a）标签，并将它们依次写入新的 html 文件 test2.html。其中锚标签只保留 href、target 属性和链接文本。这样可以去掉原文件中复杂冗余的信息，得到一个较为干净的 html 文件。

2310071821