epub转txt

下面的脚本用 BeautifulSoup 读取 epub 解压后目录中的各个 xhtml 文件，提取其中所有 <p> 标签的文本，并依次写入同一个 txt 文件：
from bs4 import BeautifulSoup

def exact_p_tag(path, f):
    """Append the text of every ``<p>`` element in one XHTML file to *f*.

    Args:
        path: Location of the XHTML file to read; it is opened as UTF-8 text.
        f: An already-open, writable text stream. It is written to but never
            closed here, so the caller stays in charge of its lifetime.

    Each paragraph's text is written followed by a single newline, in the
    order ``find_all('p')`` returns the elements.
    """
    # The context manager closes the input even if reading raises; the
    # markup is fully loaded into memory before the handle is released.
    with open(path, 'r', encoding='utf-8') as xhtml_file:
        xhtml_handle = xhtml_file.read()

    soup = BeautifulSoup(xhtml_handle, 'lxml')

    for p in soup.find_all('p'):
        f.write(p.text + '\n')

import os
from pathlib import Path

# Relative paths below are resolved against this directory
# (presumably the folder the EPUB archive was unpacked into).
os.chdir('C:/Users/tellw/Downloads/test')

# Path.glob() documents no ordering, so sort the matches to make the
# concatenation order reproducible across runs and filesystems.
# NOTE(review): file-name order is assumed to match reading order; the
# EPUB's own spine is not consulted here - confirm for irregular names.
xhtml_file_paths = sorted(Path('EPUB/xhtml').glob('*.xhtml'))

# "with" guarantees the output is flushed and closed even if extracting
# one of the files raises part-way through.
with open('C:/Users/tellw/test/test.txt', 'w', encoding='utf8') as f:
    for xfp in xhtml_file_paths:
        exact_p_tag(xfp, f)

参考：《使用 Python 提取 epub 中的文本》 https://fanlumaster.github.io/2021/07/08/%E4%BD%BF%E7%94%A8-Python-%E6%8F%90%E5%8F%96-epub-%E4%B8%AD%E7%9A%84%E6%96%87%E6%9C%AC/

创建于2409071243,修改于2409071243