# NOTE(review): stray line-number gutter from text extraction — not source code.
# Standard library
import difflib
import json
import os
import random
import re
import sys
import time
import wave
from pathlib import Path

# Third-party
import numpy as np
import tensorflow as tf
from scipy.fftpack import fft
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Input, Reshape, BatchNormalization, Lambda, Activation, Conv2D, MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam
# Root directory of the ASR dataset (hard-coded local path).
dataset_dir='C:/Users/tellw/dataset/asr'

# Phoneme-to-grapheme and grapheme-to-phoneme lookup tables.
with open(f'{dataset_dir}/p2g.json','r',encoding='utf8') as f:
    p2g_json=json.load(f)
with open(f'{dataset_dir}/g2p.json','r',encoding='utf8') as f:
    g2p_json=json.load(f)

# Build a stable phoneme -> integer-id mapping by sorting the phoneme list.
pl=list(p2g_json.keys())
pl.sort()
pd={}
for i in range(len(pl)):
    pd[pl[i]]=i
# NOTE: the ctc_batch_cost op requires labels encoded as 0..num_classes-2;
# index num_classes-1 is reserved for the CTC blank.  The network therefore
# outputs 13 classes even though only 12 phonemes need to be recognized
# here.  Otherwise TF raises errors like:
#   2 root error(s) found.
#   (0) INVALID_ARGUMENT: Saw a non-null label (index >= num_classes - 1)
#       following a null label, batch: 0 num_classes: 13 labels: 12,6,8
#       [[{{node model_1/ctc/CTCLoss}}]]
#       [[model_1/ctc/CTCLoss/_84]]

label_max_string_length=13  # maximum transcript length, in phoneme labels
wav_max_window_length=480   # maximum analysis windows per utterance (~4.8 s at 10 ms hop)
class SpeechFeatureMeta():
    """Abstract base class for acoustic feature extractors."""

    def __init__(self, framerate=16000):
        # Sampling rate (Hz) of the audio this extractor processes.
        self.framerate = framerate

    def run(self, wavsignal, fs=16000):
        """Compute features for *wavsignal*; subclasses must override."""
        raise NotImplementedError('run() method is not implemented')
class SpecAugment(SpeechFeatureMeta):
    """Spectrogram features with Google's SpecAugment-style random masking.

    Computes a Hamming-windowed |FFT| log-spectrogram, then randomly applies
    a time mask, a frequency mask, both, or neither.
    """

    def __init__(self, framerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow  # analysis window length, ms
        self.window_length = int(framerate / 1000 * self.time_window)  # samples
        self.timeshift = timeshift  # hop between windows, ms
        # Hamming window coefficients over window_length samples.
        self.x = np.linspace(0, self.window_length - 1, self.window_length, dtype=np.int16)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * self.x / (self.window_length - 1))
        super().__init__(framerate)

    def run(self, wavsignal, samplerate=16000):
        """Return a (frames, window_length//2) log-magnitude spectrogram of
        *wavsignal* with one randomly chosen SpecAugment mask applied."""
        self.framerate = samplerate
        frame_count = int(len(wavsignal) / self.framerate * 1000 - self.time_window) // self.timeshift + 1
        spec = np.zeros((frame_count, self.window_length // 2), dtype=np.float32)
        hop_samples = int(self.framerate / 1000 * self.timeshift)
        for frame in range(frame_count):
            start = frame * hop_samples
            windowed = wavsignal[start:start + self.window_length] * self.w
            # Keep only the first half of the symmetric magnitude spectrum.
            spec[frame] = np.abs(fft(windowed))[:self.window_length // 2]
        spec = np.log(spec + 1)
        # Draw all mask parameters up-front (RNG call order preserved).
        mode = random.randint(1, 100)
        t0 = random.randint(1, spec.shape[0])
        t_len = random.randint(1, 100)
        f0 = random.randint(1, spec.shape[1])
        f_len = random.randint(1, 100)
        # 60%: no mask; 15%: time mask; 14%: frequency mask; 11%: both.
        if 60 < mode <= 75:
            spec[t0:t0 + t_len, :] = 0
        elif 75 < mode < 90:
            spec[:, f0:f0 + f_len] = 0
        elif mode > 60:
            spec[t0:t0 + t_len, f0:f0 + f_len] = 0
        return spec
class BaseModel:
    '''
    Interface base class for acoustic models.

    Subclasses are expected to set self.model (training model ending in a
    CTC-loss head) and self.model_base (inference model ending in softmax).
    '''
    def __init__(self):
        # Input/output tensor shapes; filled in by subclasses.
        self.input_shape=None
        self.output_shape=None
    def get_model(self):
        # Return (training model, inference model) as a pair.
        return self.model,self.model_base
    def get_train_model(self):
        # Training model whose output is the CTC loss itself.
        return self.model
    def get_eval_model(self):
        # Inference model producing per-frame class probabilities.
        return self.model_base
    def summary(self):
        # Print the Keras summary of the training model.
        self.model.summary()
    def get_model_name(self):
        return self._model_name
    def load_weights(self,filename):
        self.model.load_weights(filename)
    def save_weights(self,filename):
        # Save both models (inference weights get a '.base' suffix) and
        # record the latest checkpoint path in epoch_<model_name>.txt.
        self.model.save_weights(filename)
        self.model_base.save_weights(filename+'.base')
        with open(os.path.dirname(filename)+'/epoch_'+self._model_name+'.txt','w',encoding='utf8') as f:
            f.write(os.path.abspath(filename))
    def get_loss_function(self):
        # Must be overridden by subclasses.
        raise Exception('method not implemented')
    def forward(self,x):
        # Must be overridden by subclasses.
        raise Exception('method not implemented')
def ctc_lambda_func(args):
    """Keras Lambda wrapper around the CTC batch loss.

    Args:
        args: tuple (y_pred, labels, input_length, label_length) of tensors
            wired up by the training model.

    Returns:
        Per-sample CTC loss tensor of shape (batch, 1).
    """
    y_pred, labels, input_length, label_length = args
    # FIX: removed the no-op slice y_pred[:, :, :] (it copied the full tensor
    # without changing anything).
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
class SpeechModel251BN(BaseModel):
    '''
    CNN + CTC acoustic model built with the Keras functional API.

    Input layer: sequence of 275-dim feature vectors (half the FFT window
        size); at most 480 windows per utterance (~4.8 s), since windows are
        25 ms long with a 10 ms hop.
    Hidden layers: two (Conv2D 3x3 + BN + ReLU) pairs followed by 2x max
        pooling, twice — overall 4x downsampling along time — then a dense
        layer.
    Output layer: dense + softmax over self.output_shape[1] classes.
    CTC layer: the training model ends in a Lambda computing the CTC loss.

    Args:
        input_shape: tuple (window count, feature length, channels)
        output_size: number of output classes (phonemes + 1 CTC blank)
    '''
    def __init__(self,input_shape=(wav_max_window_length,275,1),output_size=13):
        super().__init__()
        self.input_shape=input_shape
        self._pool_size=4  # overall time-axis downsampling (two 2x poolings)
        self.output_shape=(input_shape[0]//self._pool_size,output_size)
        self._model_name='little_sm251bn'
        self.model,self.model_base=self._define_model(self.input_shape,self.output_shape[1])
    def _define_model(self,input_shape,output_size):
        """Build and return (training model with CTC loss head, inference model)."""
        input_data=Input(name='the_input',shape=input_shape)
        layer_h=Conv2D(32,(3,3),use_bias=True,padding='same',kernel_initializer='he_normal',name='Conv0')(input_data)
        layer_h=BatchNormalization(epsilon=0.0002,name='BN0')(layer_h)
        layer_h=Activation('relu',name='Act0')(layer_h)
        layer_h=Conv2D(32,(3,3),use_bias=True,padding='same',kernel_initializer='he_normal',name='Conv1')(layer_h)
        layer_h=BatchNormalization(epsilon=0.0002,name='BN1')(layer_h)
        layer_h=Activation('relu',name='Act1')(layer_h)
        layer_h=MaxPooling2D(pool_size=2,strides=None,padding='valid')(layer_h)
        layer_h=Conv2D(64,(3,3),use_bias=True,padding='same',kernel_initializer='he_normal',name='Conv2')(layer_h)
        layer_h=BatchNormalization(epsilon=0.0002,name='BN2')(layer_h)
        layer_h=Activation('relu',name='Act2')(layer_h)
        layer_h=Conv2D(64,(3,3),use_bias=True,padding='same',kernel_initializer='he_normal',name='Conv3')(layer_h)
        layer_h=BatchNormalization(epsilon=0.0002,name='BN3')(layer_h)
        layer_h=Activation('relu',name='Act3')(layer_h)
        layer_h=MaxPooling2D(pool_size=2,strides=None,padding='valid')(layer_h)
        # Collapse (time, freq, channels) to (time, features) for dense layers.
        layer_h=Reshape((self.output_shape[0],(input_shape[1]//self._pool_size)*64),name='Reshape0')(layer_h)
        layer_h=Dense(64,activation='relu',use_bias=True,kernel_initializer='he_normal',name='Dense0')(layer_h)
        layer_h=Dense(output_size,use_bias=True,kernel_initializer='he_normal',name='Dense1')(layer_h)
        y_pred=Activation('softmax',name='Activation0')(layer_h)
        # FIX: removed leftover debugger breakpoint (import pdb;pdb.set_trace())
        # that halted every model construction.
        model_base=Model(inputs=input_data,outputs=y_pred)
        labels=Input(name='the_labels',shape=[label_max_string_length],dtype='float32')
        # NOTE(review): int8 caps these auxiliary length inputs at 127; the
        # current maximum pooled length (121) fits, but int32 would be safer —
        # confirm before changing the saved-model input signature.
        input_length=Input(name='input_length',shape=[1],dtype='int8')
        label_length=Input(name='label_length',shape=[1],dtype='int8')
        loss_out=Lambda(ctc_lambda_func,output_shape=(1,),name='ctc')([y_pred,labels,input_length,label_length])
        model=Model(inputs=[input_data,labels,input_length,label_length],outputs=loss_out)
        return model,model_base
    def get_loss_function(self):
        # The 'ctc' Lambda output already IS the loss, so just pass it through.
        return {'ctc':lambda y_true,y_pred:y_pred}
def dataset():
    """Read transcripts.txt and return parallel lists (wav paths, transcripts).

    Each line of the file is '<wav_filename> <transcription>'.
    """
    with open(f'{dataset_dir}/transcripts.txt', 'r', encoding='utf8') as f:
        lines = f.readlines()
    wav_filenames = []
    transcriptions = []
    for line in lines:
        name, text = line.strip().split()
        wav_filenames.append(name)
        transcriptions.append(text)
    return wav_filenames, transcriptions
def process_wav(wav_filenames,transcriptions):
    # Extract SpecAugment spectrogram features for every wav file and convert
    # each transcription to a phoneme-id sequence.
    # Returns (list of (frames, 275) float32 arrays, list of id lists).
    processed_seq=[]
    transcription_seq=[]
    # 22050 Hz audio with 25 ms windows -> 551-sample windows, 275 features.
    wav_processor=SpecAugment(22050)
    for wav_filename,transcription in zip(wav_filenames,transcriptions):
        wav=wave.open(f'{dataset_dir}/{wav_filename}','rb')
        str_data=wav.readframes(wav.getnframes())
        wav.close()
        # assumes 16-bit mono PCM samples — TODO confirm against the dataset
        wave_data=np.frombuffer(str_data,dtype=np.short)
        fed_wav=wav_processor.run(wave_data,22050)
        processed_seq.append(fed_wav)
        # g2p_json maps a transcription to a space-separated phoneme string;
        # pd maps each phoneme to its integer id.
        transcription_seq.append([pd[pe] for pe in g2p_json[transcription].split()])
    return processed_seq,transcription_seq
def ctc_label_len(label):
    """Effective CTC target length for *label*.

    CTC must emit a blank between repeated labels, so the effective length is
    the sequence length plus one for every pair of identical adjacent labels.
    Keeping this below the pooled input length avoids the
    'No valid path found' / 'sequence_length(0)' CTC errors.
    """
    repeats = sum(1 for a, b in zip(label, label[1:]) if a == b)
    return len(label) + repeats
def train(iter_start=0,epoch_size=1000,batch_size=8):
    """Train the CTC acoustic model over the whole in-memory dataset.

    Args:
        iter_start: epoch to resume from; when non-zero, loads srm-<iter_start>.h5.
        epoch_size: number of epochs to run.
        batch_size: mini-batch size.

    Side effects: saves srm-<epoch>.h5 checkpoints (deleting the previous one)
    and appends a timing line to log.txt.
    """
    ts=time.time()
    wav_filenames,transcriptions=dataset()
    processed_seq,transcription_seq=process_wav(wav_filenames,transcriptions)
    model_m=SpeechModel251BN()
    train_model=model_m.model
    train_model.compile(loss=model_m.get_loss_function(),optimizer=Adam(learning_rate=0.0001,beta_1=0.9,beta_2=0.999,decay=0.0,epsilon=1e-7))
    if iter_start!=0:
        train_model.load_weights(f'srm-{iter_start}.h5')
    # Pack variable-length features/labels into fixed-size zero-padded arrays.
    input_data=np.zeros((len(processed_seq),wav_max_window_length,275,1),dtype=np.float32)
    label_data=np.zeros((len(processed_seq),label_max_string_length),dtype=np.float32)
    input_length=[]
    label_length=[]
    for k in range(len(processed_seq)):
        pss=processed_seq[k].shape
        input_data[k,:len(processed_seq[k])]=processed_seq[k].reshape(pss[0],pss[1],1)
        label_data[k,:len(transcription_seq[k])]=transcription_seq[k]
        # Feature length after the model's 4x time pooling.
        # NOTE(review): a full-length (480-window) clip yields 121 here, one
        # more than the model's 120 output steps — confirm against the CTC
        # "sequence_length(0)" constraint.
        input_length.append([len(processed_seq[k])//4+1])
        label_length.append([ctc_label_len(transcription_seq[k])])
    # FIX: np.matrix is deprecated; plain (N, 1) arrays slice identically.
    label_length=np.array(label_length)
    input_length=np.array(input_length)
    for j in range(epoch_size):
        for i in range(0,len(processed_seq),batch_size):
            leng=min(batch_size,len(processed_seq)-i)
            # Targets are dummy zeros: the 'ctc' output already IS the loss.
            train_model.fit([input_data[i:i+leng],label_data[i:i+leng],input_length[i:i+leng],label_length[i:i+leng]],np.zeros((leng,1)))
            print(f'in {i+1}/{len(processed_seq)} batch in {j+1}/{epoch_size} epoch')
        train_model.save_weights(f'srm-{iter_start+j+1}.h5')
        if os.path.exists(f'srm-{iter_start+j}.h5'):
            os.remove(f'srm-{iter_start+j}.h5')
    # FIX: removed a redundant final save_weights — the last loop iteration
    # already wrote srm-<iter_start+epoch_size>.h5.
    te=time.time()
    # FIX: terminate the appended log line with a newline so successive runs
    # do not concatenate onto one line.
    log_str=f'train {ts} {te} {te-ts} {iter_start} {iter_start+epoch_size}\n'
    with open('log.txt','a',encoding='utf8') as f:
        f.write(log_str)
def ctc_decode_delete_tail_blank(ctc_decode_list):
    """Truncate a CTC-decoded label sequence at the first blank (-1) marker."""
    cut = len(ctc_decode_list)
    for idx, token in enumerate(ctc_decode_list):
        if token == -1:
            cut = idx
            break
    return ctc_decode_list[:cut]
class ModelLanguage:
    """Statistical n-gram language model for converting pinyin to Chinese text."""

    def __init__(self, model_path=''):
        self.model_path = model_path
        self.dict_pinyin = dict()  # pinyin syllable -> candidate characters
        self.model1 = dict()       # unigram (single-character) counts
        self.model2 = dict()       # bigram (two-character) counts

    def load_model(self):
        """Load the pinyin dictionary and the 1-/2-gram count tables into memory."""
        self.dict_pinyin = p2g_json
        with open(f'{dataset_dir}/gram1.json', 'r', encoding='utf8') as f:
            self.model1 = json.load(f)
        with open(f'{dataset_dir}/gram2.json', 'r', encoding='utf8') as f:
            self.model2 = json.load(f)

    def pinyin2text(self, list_pinyin, beam_size=100):
        """Decode a whole pinyin sequence into text in a single pass."""
        pieces = list()
        prev_beam = list()
        for syllable in list_pinyin:
            beam = self.pinyin_stream_decode(prev_beam, syllable, beam_size)
            if not beam and prev_beam:
                # Dead end: flush the best prefix so far, then restart the
                # decode from this syllable alone.
                pieces.append(prev_beam[0][0])
                beam = self.pinyin_stream_decode([], syllable, beam_size)
                if beam:
                    pieces.append(beam[0][0])
            prev_beam = beam
        if prev_beam:
            pieces.append(prev_beam[0][0])
        return ''.join(pieces)

    def pinyin_stream_decode(self, temple_result, item_pinyin, beam_size=100):
        """Extend beam-search hypotheses by one pinyin syllable.

        temple_result: current beam, a list of [text, probability] pairs.
        Returns the new beam (at most beam_size entries), or [] when the
        syllable is not in the pinyin dictionary.
        """
        # Unknown syllable: nothing to decode.
        if item_pinyin not in self.dict_pinyin:
            return []
        candidates = self.dict_pinyin[item_pinyin]
        # First syllable: every candidate character starts a hypothesis, P=1.
        if not temple_result:
            return [[ch, 1.0] for ch in candidates]
        scored = list()
        for hypothesis in temple_result:
            for ch in candidates:
                bigram = hypothesis[0][-1] + ch
                if bigram not in self.model2:
                    continue  # unseen bigram: prune this extension
                # Transition probability: count(bigram) / count(first char).
                pair_count = float(self.model2[bigram])
                single_count = float(self.model1[bigram[-2]])
                scored.append([hypothesis[0] + ch, hypothesis[1] * pair_count / single_count])
        scored = sorted(scored, key=lambda x: x[1], reverse=True)
        if len(scored) > beam_size:
            return scored[:beam_size]
        return scored
def get_edit_distance(str1, str2):
    """Sum of Levenshtein edit distances between corresponding elements.

    str1/str2 may be strings (compared character-pairwise) or lists of
    strings (compared element-pairwise); extra elements in the longer input
    are ignored, matching zip() semantics.
    """
    total = 0
    for a, b in zip(str1, str2):
        matcher = difflib.SequenceMatcher(None, a, b)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'replace':
                total += max(i2 - i1, j2 - j1)
            elif tag == 'delete':
                total += i2 - i1
            elif tag == 'insert':
                total += j2 - j1
    return total
def calc_word_error_rate(rrs, labels):
    """Word error rate: total edit distance between predictions (*rrs*) and
    references (*labels*), divided by the total reference length."""
    costs = get_edit_distance(labels, rrs)
    total_len = sum(len(label) for label in labels)
    return costs / total_len
def predict(checkpoint=75,batch_size=8):
    # Run inference over the whole dataset with the checkpointed acoustic
    # model, CTC-decode phoneme sequences, convert them to text with the
    # n-gram language model, and print the word error rate.
    wav_filenames,transcriptions=dataset()
    processed_seq,_=process_wav(wav_filenames,transcriptions)
    model_m=SpeechModel251BN()
    model_m.model.load_weights(f'srm-{checkpoint}.h5')
    infer_model=model_m.model_base
    input_data=np.zeros((len(processed_seq),wav_max_window_length,275,1),dtype=np.float32)
    lm=ModelLanguage()
    lm.load_model()
    # Zero-pad each feature sequence up to the fixed window count.
    for k in range(len(processed_seq)):
        pss=processed_seq[k].shape
        input_data[k,:len(processed_seq[k])]=processed_seq[k].reshape(pss[0],pss[1],1)
    labels=[]
    rrs=[]
    for i in range(0,len(processed_seq),batch_size):
        leng=min(batch_size,len(processed_seq)-i)
        y=infer_model.predict(input_data[i:i+leng])
        # NOTE(review): the decode sequence length is hard-coded to 120 (the
        # model's output step count) instead of each clip's true pooled
        # length — confirm this is intended.
        r=K.ctc_decode(y,np.ones((leng),dtype=np.int8)*120,greedy=True,beam_width=100,top_paths=1)
        r1=r[0][0].numpy()
        srs=[]  # (unused)
        for j in range(leng):
            speech_result=ctc_decode_delete_tail_blank(r1[j])
            # Reference text: the wav filename minus its trailing index digit;
            # assumes names shaped like 'label3.wav' — TODO confirm.
            label=re.match(r'(\w*)\d\.wav',wav_filenames[i+j]).group(1)
            # Map decoded ids back to phonemes, then phonemes to text.
            rr=lm.pinyin2text([pl[sr] for sr in speech_result])
            print(label,rr)
            labels.append(label)
            rrs.append(rr)
    print('词错率',calc_word_error_rate(labels,rrs))
if __name__=='__main__':
    # Resume training from checkpoint epoch 11 for 1 more epoch.
    train(11,1)
    # predict(11)