Speech recognition with a vosk model
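
The script below reads a mono, 16-bit PCM WAV file whose path is passed on the command line and prints the recognized text. The vosk Python package can be installed with pip install vosk; the model path in the code points at the small Chinese model shared at the end of this post.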

import wave
import sys
import json

from vosk import Model, KaldiRecognizer, SetLogLevel

# You can set log level to -1 to disable debug messages
SetLogLevel(-1)

wf = wave.open(sys.argv[1], "rb")
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
print("Audio file must be WAV format mono PCM.")

sys.exit(1)

# model = Model(lang="en-us")
# You can also init model by name or with a folder path
# model = Model(model_name="vosk-model-en-us-0.21")
# Set this to the model path, i.e. the folder extracted in step 4.1  <<<<
# model = Model("model")
model = Model("../Downloads/vosk-model-small-cn-0.22")

rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)
# rec.SetPartialWords(True)  # comment out this line  <<<<

str_ret = ""

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        result = rec.Result()
        # print(result)

        result = json.loads(result)
        if 'text' in result:
            str_ret += result['text'] + ' '
    # else:
    #     print(rec.PartialResult())

result = json.loads(rec.FinalResult())
if 'text' in result:
    str_ret += result['text']

print(str_ret)

wf.close()
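
The script above only accepts mono 16-bit PCM WAV input. If the audio is in another format (mp3, stereo WAV, a different sample rate, etc.), it can be decoded and resampled on the fly by piping it through ffmpeg. The following is a minimal sketch, assuming ffmpeg is installed and on PATH; the 16000 Hz sample rate is an assumption and simply has to match the rate the recognizer is constructed with.

# Sketch: feed arbitrary audio formats to vosk by decoding with ffmpeg
# (assumes ffmpeg is installed and reachable on PATH).
import subprocess
import sys
import json

from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(-1)

SAMPLE_RATE = 16000  # assumed target rate: resample everything to 16 kHz mono 16-bit PCM

model = Model("../Downloads/vosk-model-small-cn-0.22")
rec = KaldiRecognizer(model, SAMPLE_RATE)
rec.SetWords(True)

# ffmpeg decodes sys.argv[1] and writes raw s16le mono PCM to stdout
proc = subprocess.Popen(
    ["ffmpeg", "-loglevel", "quiet", "-i", sys.argv[1],
     "-ar", str(SAMPLE_RATE), "-ac", "1", "-f", "s16le", "-"],
    stdout=subprocess.PIPE)

str_ret = ""
while True:
    data = proc.stdout.read(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        str_ret += json.loads(rec.Result()).get('text', '') + ' '
str_ret += json.loads(rec.FinalResult()).get('text', '')

print(str_ret)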

Evaluating the vosk model's word error rate, latency, and throughput
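
In the evaluation script below, the "WER" is computed at the character level (for Chinese this is effectively a character error rate): the edit distance between the recognized text and the reference transcript, divided by the reference length. Latency is the wall-clock time spent on one file, and throughput is reported as its reciprocal, i.e. files processed per second.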

import os
import subprocess
import wave
import sys
import json
from vosk import Model, KaldiRecognizer, SetLogLevel
import difflib
import time

def get_edit_distance(str1, str2) -> int:
    """
    Compute the edit distance between two sequences; str and list are both supported.
    str1 and str2 are lists whose elements are the strings to compare; the edit
    distance is computed between the strings at corresponding positions.
    """
    leven_cost = 0
    # print(f'--str1-str2-{str1}-{str2}')
    for s1, s2 in zip(str1, str2):
        sequence_match = difflib.SequenceMatcher(None, s1, s2)
        for tag, index_1, index_2, index_j1, index_j2 in sequence_match.get_opcodes():
            if tag == 'replace':
                leven_cost += max(index_2 - index_1, index_j2 - index_j1)
            elif tag == 'insert':
                leven_cost += (index_j2 - index_j1)
            elif tag == 'delete':
                leven_cost += (index_2 - index_1)
    return leven_cost

SetLogLevel(-1)

model = Model("../Downloads/vosk-model-small-cn-0.22")

fr = 48000  # expected sample rate of the test WAV files; one recognizer is created and reused for every file
rec = KaldiRecognizer(model, fr)
rec.SetWords(True)

def recognize(file, trans):
    wf = wave.open(file, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        sys.exit(1)

    str_ret = ""

    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = rec.Result()

            result = json.loads(result)
            if 'text' in result:
                str_ret += result['text'] + ' '

    result = json.loads(rec.FinalResult())
    if 'text' in result:
        str_ret += result['text']
    str_ret = str_ret.replace(' ', '')

    # Wrap both strings in lists so the whole hypothesis is compared against the
    # whole reference, as the docstring of get_edit_distance expects.
    wer = get_edit_distance([str_ret], [trans]) / len(trans)
    print(str_ret, trans, wer)
    wf.close()
    return wer

wers=[]

os.chdir('../dataset/chs')
for file in os.listdir():
    fn, _ = os.path.splitext(file)  # the file name (without extension) is the reference transcript
    st = time.time()
    wer = recognize(file, fn)
    et = time.time()
    print(f'latency:{et-st}, throughput:{1/(et-st)}')
    wers.append(wer)

print(f'average wer:{sum(wers)/len(wers)}')
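
The throughput above is reported as files per second, which depends on how long each clip happens to be. A metric that is often easier to compare across datasets is the real-time factor (RTF): processing time divided by audio duration. A minimal sketch, assuming the same mono PCM WAV files as above:

import wave

def real_time_factor(wav_path, processing_seconds):
    """RTF = processing time / audio duration; values below 1.0 mean
    the recognizer runs faster than real time."""
    with wave.open(wav_path, "rb") as wf:
        audio_seconds = wf.getnframes() / wf.getframerate()
    return processing_seconds / audio_seconds

# e.g. inside the loop above one could add:
#     print(f'rtf:{real_time_factor(file, et - st)}')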

Model downloads:

Shared via Baidu Netdisk: vosk-model-small-cn-0.22.zip
Link: https://pan.baidu.com/s/1FEH1xwDucdC3cEZSAyDOwQ?pwd=k8p5  Extraction code: k8p5

Shared via Baidu Netdisk: vosk-model-cn-0.22.zip
Link: https://pan.baidu.com/s/1dISCahVsWppnS-bbvyLWEA?pwd=ymgp  Extraction code: ymgp

Shared via Baidu Netdisk: vosk-model-en-us-0.22.zip
Link: https://pan.baidu.com/s/1z-d1A8wHvBs7m2dpMjYjXQ?pwd=8hg9  Extraction code: 8hg9

Shared via Baidu Netdisk: vosk-model-ja-0.22.zip
Link: https://pan.baidu.com/s/1eQCEzNvdjnbyLILdu56RDw?pwd=8nhv  Extraction code: 8nhv

The models were downloaded from https://alphacephei.com/vosk/models

Created at 2412261646, modified at 2412261646