1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
import json
import re
import sys
from collections import Counter, defaultdict
with open('/home/tellw/dataset/asr/gram1.json','r',encoding='utf8') as f: gram1=json.load(f) with open('/home/tellw/dataset/asr/gram2.json','r',encoding='utf8') as f: gram2=json.load(f) with open('/home/tellw/dataset/text/假面山庄杀人事件.txt','r',encoding='utf8') as f: contents=f.read() contents=''.join(contents.split('\n')) import re import sys contents=re.sub(r'[0-9a-zA-Z_]|\W','',contents) print(contents) g=contents for i in range(len(g)-1): if not g[i] in gram1: gram1[g[i]]=0 gram1[g[i]]+=1 if not g[i:i+2] in gram2: gram2[g[i:i+2]]=0 gram2[g[i:i+2]]+=1 if not g[-1] in gram1: gram1[g[-1]]=0 gram1[g[-1]]+=1 with open('/home/tellw/dataset/asr/gram1.json','w',encoding='utf8') as f: json.dump(gram1,f,ensure_ascii=False) with open('/home/tellw/dataset/asr/gram2.json','w',encoding='utf8') as f: json.dump(gram2,f,ensure_ascii=False)
|