1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import json
from collections import defaultdict

def update_ngram_counts(text, gram1, gram2):
    """Accumulate character unigram counts into *gram1* and bigram counts
    into *gram2* (both modified in place) for every character of *text*.

    Works with any int-valued mapping that supports ``+=`` on missing keys
    (e.g. ``defaultdict(int)``).
    """
    for i in range(len(text) - 1):
        gram1[text[i]] += 1
        gram2[text[i:i + 2]] += 1
    # Count the final character's unigram; guarded so an empty string no
    # longer raises IndexError (the original indexed text[-1] blindly).
    if text:
        gram1[text[-1]] += 1


if __name__ == '__main__':
    # Build unigram/bigram frequency tables over the grapheme keys of the
    # grapheme-to-phoneme lexicon and persist them as JSON.
    with open('/home/tellw/dataset/asr/g2p.json', 'r', encoding='utf8') as f:
        g2p_json = json.load(f)
    gram1 = defaultdict(int)
    gram2 = defaultdict(int)
    for g in g2p_json:
        update_ngram_counts(g, gram1, gram2)
    with open('/home/tellw/dataset/asr/gram1.json', 'w', encoding='utf8') as f:
        json.dump(gram1, f, ensure_ascii=False)
    with open('/home/tellw/dataset/asr/gram2.json', 'w', encoding='utf8') as f:
        json.dump(gram2, f, ensure_ascii=False)

统计后来新加入的语料的文本特征,不断加强学习更多的语料,增强自己的语言知识

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import json
from collections import defaultdict


import re
import sys  # NOTE(review): unused in this snippet; kept because the original imported it


def ingest_text_counts(text, gram1, gram2):
    """Fold character unigram counts into *gram1* and bigram counts into
    *gram2* (plain int-valued dicts, modified in place) for *text*.

    Unlike the defaultdict-based builder script, the tables here come from
    ``json.load`` as plain dicts, so missing keys are handled via ``dict.get``.
    """
    for i in range(len(text) - 1):
        ch = text[i]
        gram1[ch] = gram1.get(ch, 0) + 1
        pair = text[i:i + 2]
        gram2[pair] = gram2.get(pair, 0) + 1
    # Final character's unigram; guarded so empty input no longer raises
    # IndexError (the original indexed text[-1] blindly).
    if text:
        gram1[text[-1]] = gram1.get(text[-1], 0) + 1


if __name__ == '__main__':
    # Load the existing n-gram tables, fold in one new corpus file, save back.
    with open('/home/tellw/dataset/asr/gram1.json', 'r', encoding='utf8') as f:
        gram1 = json.load(f)
    with open('/home/tellw/dataset/asr/gram2.json', 'r', encoding='utf8') as f:
        gram2 = json.load(f)
    with open('/home/tellw/dataset/text/假面山庄杀人事件.txt', 'r', encoding='utf8') as f:
        contents = f.read()
    contents = ''.join(contents.split('\n'))
    # Drop ASCII word characters ([0-9a-zA-Z_]) and all non-word characters
    # (\W): what survives is non-ASCII word characters, i.e. the CJK text.
    contents = re.sub(r'[0-9a-zA-Z_]|\W', '', contents)
    print(contents)
    ingest_text_counts(contents, gram1, gram2)
    with open('/home/tellw/dataset/asr/gram1.json', 'w', encoding='utf8') as f:
        json.dump(gram1, f, ensure_ascii=False)
    with open('/home/tellw/dataset/asr/gram2.json', 'w', encoding='utf8') as f:
        json.dump(gram2, f, ensure_ascii=False)

2310231816