1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
import fitz  # PyMuPDF
import re
from pathlib import Path
from colorama import Fore
import sys
import os


def _ensure_text_cache(source_path, format, cache_path):
    """Create the normalised text cache for *source_path* if it is missing."""
    if os.path.exists(cache_path):
        return
    # Extract everything BEFORE touching the cache file, so a failed
    # extraction never leaves a truncated cache that later runs would trust.
    if format == 'pdf':
        # Open the PDF; the context manager closes it even if a page fails.
        with fitz.open(source_path) as document:
            raw = ''.join(page.get_text() for page in document)
    elif format == 'txt':
        with open(source_path, 'r', encoding='utf8') as f:
            raw = f.read()
    else:
        raise ValueError(f"unsupported format {format!r}; expected 'pdf' or 'txt'")
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:  # empty when the cache file sits directly in the cwd
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'w', encoding='utf8') as f:
        # \s matches any whitespace character: spaces, tabs, newlines,
        # full-width (CJK) spaces, etc.
        f.write(re.sub(r'\s', '', raw).lower())


def _colorize(match):
    """Wrap a keyword match in red; leave zero-width matches untouched."""
    hit = match.group(0)
    # Fore.RESET restores the terminal's default colour (Fore.BLACK would
    # force black text, which is unreadable on dark backgrounds).
    return f'{Fore.RED}{hit}{Fore.RESET}' if hit else hit


def search_pdf(pdf_path, format, keywords, context_len=10, base_dir='C:/Users/tellw'):
    """Search one document for keywords that occur close together, in order.

    The document's text is cached (lower-cased, all whitespace removed) in a
    file whose path, relative to the current working directory, mirrors
    ``pdf_path`` relative to ``base_dir``. The cache is created on first use
    and reused afterwards. Each hit is printed with the keywords highlighted
    in red, followed by the source path.

    Args:
        pdf_path: Path of the source document (PDF or plain text).
        format: ``'pdf'`` to extract text with PyMuPDF, ``'txt'`` to read the
            file as UTF-8 text. Only consulted when the cache is missing.
        keywords: Regex fragments that must appear in this order, each at
            most 20 characters after the previous one. Matching ignores case
            and whitespace, because the cached text has neither.
        context_len: Maximum number of characters of context shown on each
            side of a hit.
        base_dir: Directory that ``pdf_path`` is made relative to when
            deriving the cache file location.

    Raises:
        ValueError: If the cache is missing and ``format`` is not supported.
        re.error: If a keyword is not a valid regular expression.
    """
    cache_path = os.path.relpath(pdf_path, base_dir)
    _ensure_text_cache(pdf_path, format, cache_path)

    # The cached text contains no whitespace, so drop it from the keywords
    # too; keywords that end up empty would match everywhere and are skipped.
    terms = [term for term in (re.sub(r'\s', '', kw) for kw in keywords) if term]
    if not terms:
        return

    with open(cache_path, 'r', encoding='utf8') as f:
        text = f.read()

    # Each keyword is wrapped in a non-capturing group so alternations inside
    # one keyword cannot leak into its neighbours.
    window = f'.{{0,{context_len}}}'
    sequence = '.{0,20}'.join(f'(?:{term})' for term in terms)
    hit_re = re.compile(window + sequence + window, re.IGNORECASE)
    # One alternation, applied in a single pass, so a later keyword can never
    # match inside the colour codes inserted for an earlier one.
    highlight_re = re.compile('|'.join(f'(?:{term})' for term in terms), re.IGNORECASE)

    # finditer + group(0) yields the whole hit even if a keyword has groups.
    for hit in hit_re.finditer(text):
        snippet = highlight_re.sub(_colorize, hit.group(0))
        print(snippet + '\t\t\t\t' + str(pdf_path) + '\n')
# Root of the document collection and the sub-directories that are searched.
PAPER_ROOT = Path('C:/Users/tellw/open_title')
SOURCE_DIRS = [PAPER_ROOT / 'file_updates'] + [
    PAPER_ROOT / 'papers' / name
    for name in (
        'benchmark',
        'edge_computing',
        'guidance',
        'methodology',
        'misc',
        'other-themes-benchmark',
        'speech_recognition',
        'test',
        'to_c',
        'books',
    )
]
# search_pdf() resolves its cache files relative to the working directory,
# so the script switches into this directory before searching.
SEARCH_SPACE = PAPER_ROOT / 'paper_search_space'
# Characters of context printed on each side of a hit.
CONTEXT_LEN = 30


def main():
    """Search every PDF and TXT file in SOURCE_DIRS for the CLI keywords.

    Keywords are taken from the command line. With no keywords the script
    prints a usage line to stderr and exits with status 1. All PDFs are
    searched first, then all TXT files.
    """
    keywords = sys.argv[1:]
    if not keywords:
        print(f'usage: {sys.argv[0]} KEYWORD [KEYWORD ...]', file=sys.stderr)
        sys.exit(1)

    # Collect both file kinds in a single pass over the directories.
    pdf_file_paths = []
    txt_file_paths = []
    for directory in SOURCE_DIRS:
        pdf_file_paths.extend(directory.glob('*.pdf'))
        txt_file_paths.extend(directory.glob('*.txt'))

    # Create the cache directory on first run instead of failing in chdir.
    os.makedirs(SEARCH_SPACE, exist_ok=True)
    os.chdir(SEARCH_SPACE)

    for pdf_file_path in pdf_file_paths:
        search_pdf(pdf_file_path, 'pdf', keywords, CONTEXT_LEN)
    for txt_file_path in txt_file_paths:
        search_pdf(txt_file_path, 'txt', keywords, CONTEXT_LEN)


if __name__ == '__main__':
    main()
|