from bs4 import BeautifulSoup import bs4 import re
# Read the entire input page into memory as UTF-8 text.
with open('index.html', mode='r', encoding='utf8') as f:
    htmlfile = f.read()

# Parse the markup using the lxml parser backend.
soup = BeautifulSoup(htmlfile, features='lxml')

# Module-level accumulator; visit_c appends the elements it collects here.
ls = list()
def visit_c(p):
    """Collect selected elements from the subtree rooted at ``p`` into ``ls``.

    The element is handled according to its tag name:

    * ``p``, ``strong``, ``table`` and any name matching ``^h\\d$`` are
      appended to the module-level list ``ls`` unchanged; their children
      are not visited.
    * ``a`` elements that carry an ``href`` attribute are not appended
      themselves. A new ``a`` tag is built that keeps only ``href`` (and
      ``target`` when present) and contains the anchor's text, and that
      new tag is appended instead.
    * Anything else (including ``a`` without ``href``) is not collected;
      each child that is a ``bs4.element.Tag`` is visited recursively.

    Args:
        p: The element to inspect; it must expose ``name``, ``attrs``,
            ``text`` and ``children``.

    Returns:
        None. Results are accumulated in the module-level ``ls``.
    """
    tag_name = p.name

    # Elements that are kept wholesale.
    if tag_name in ('p', 'strong', 'table') or re.match(r'^h\d$', tag_name):
        ls.append(p)
        return

    # Links: rebuild a minimal anchor so only href/target survive.
    if tag_name == 'a' and 'href' in p.attrs:
        kept_attrs = {
            key: p.attrs[key]
            for key in ('href', 'target')
            if key in p.attrs
        }
        clean_anchor = bs4.element.Tag(name='a', attrs=kept_attrs)
        clean_anchor.append(p.text)
        ls.append(clean_anchor)
        return

    # Any other element: descend into child tags, skipping non-Tag children.
    for child in p.children:
        if not isinstance(child, bs4.element.Tag):
            continue
        visit_c(child)
# Print the attribute names available on the parsed document object.
print(dir(soup))

# Visit every top-level element of the document; children that are not
# Tag instances are skipped.
for c in soup.children:
    if not isinstance(c, bs4.element.Tag):
        continue
    visit_c(c)

# Echo each collected element to stdout and write it, one per line,
# to the output file.
with open('test2.html', mode='w', encoding='utf8') as f:
    for l in ls:
        print(l)
        f.write(str(l) + '\n')