import random
from collections import Counter

import numpy as np
import scipy.spatial.distance
import torch
import torch.utils.data as tud

# EmbeddingModel, the skip-gram Dataset and evaluate() are assumed to be defined
# earlier in this post.

def train():
    use_cuda = torch.cuda.is_available()
    # fix all random seeds so that runs are reproducible
    random.seed(1234)
    np.random.seed(1234)
    torch.manual_seed(1234)
    if use_cuda:
        torch.cuda.manual_seed(1234)
    # Hyperparameters
    K = 10                  # number of negative samples drawn per context word
    C = 3                   # context window size (words on each side of the center word)
    NUM_EPOCHS = 2
    VOCAB_SIZE = 30000
    BATCH_SIZE = 128
    LEARNING_RATE = 0.2
    EMBEDDING_SIZE = 100
    LOG_FILE = "word_embedding.log"
    # Build the vocabulary: keep the VOCAB_SIZE-1 most frequent words and map
    # everything else to <unk>
    with open("text8.train.txt", "r") as file:
        text = file.read()
    text = [w for w in text.lower().split()]
    vocab = dict(Counter(text).most_common(VOCAB_SIZE - 1))
    vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
    idx_to_word = [word for word in vocab.keys()]
    word_to_idx = {word: i for i, word in enumerate(idx_to_word)}
    # Unigram distribution raised to the 3/4 power, used for negative sampling
    word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
    word_freqs = word_counts / np.sum(word_counts)
    word_freqs = word_freqs ** (3. / 4.)
    word_freqs = word_freqs / np.sum(word_freqs)
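    # Quick numeric illustration of the 3/4 smoothing (hypothetical frequencies):
    # raw unigram probs [0.90, 0.09, 0.01] become roughly [0.83, 0.15, 0.03] after the
    # **0.75 power and renormalisation, i.e. frequent words are picked as negatives a
    # little less often, and rare words a little more often, than their raw frequency.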
    dataset = Dataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
    dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    # print(next(iter(dataloader))[0].shape)  # shape of the center-word batch
    # print(next(iter(dataloader))[1].shape)  # shape of the context-word batch
    # print(next(iter(dataloader))[2].shape)  # shape of the negative-sample batch
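    # The Dataset above is expected to yield, for every position in the corpus, a triple
    # (center word, its 2*C context words, 2*C*K sampled negative words), with the
    # negatives drawn from word_freqs roughly like this (a sketch, not the actual class):
    #   torch.multinomial(torch.tensor(word_freqs), 2 * C * K, replacement=True)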
    model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    if use_cuda:
        model = model.cuda()
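    # EmbeddingModel (defined earlier) is expected to hold two embedding tables -
    # in_embed for center words and out_embed for context/negative words - and to return
    # the per-example skip-gram negative-sampling loss
    #   -log sigmoid(v_center . u_context) - sum_k log sigmoid(-v_center . u_neg_k)
    # which is averaged over the batch by .mean() in the training loop below.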
    # Find the nearest neighbors of a word
    def find_nearest(word):
        '''embedding_weights is a [vocab_size, embedding_size] parameter matrix'''
        index = word_to_idx[word]
        embedding = embedding_weights[index]  # embedding vector of this word
        # cosine distance between each of the 30000 embedding vectors and the
        # embedding of the query word
        cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
        # return the 10 most similar words
        return [idx_to_word[i] for i in cos_dis.argsort()[:10] if i < len(idx_to_word)]
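    # Example: once embedding_weights has been assigned inside the training loop below,
    # find_nearest("good") returns the 10 words whose vectors are closest to that of
    # "good" by cosine distance (the query word itself included, at distance 0).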
    # training
    for e in range(NUM_EPOCHS):
        for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
            input_labels = input_labels.long()
            pos_labels = pos_labels.long()
            neg_labels = neg_labels.long()
            if use_cuda:
                input_labels = input_labels.cuda()
                pos_labels = pos_labels.cuda()
                neg_labels = neg_labels.cuda()

            optimizer.zero_grad()
            loss = model(input_labels, pos_labels, neg_labels).mean()
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                with open(LOG_FILE, "a") as fout:
                    fout.write("epoch: {}, iter: {}, loss: {}\n".format(e, i, loss.item()))
                    print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))

            if i % 2000 == 0:
                embedding_weights = model.input_embeddings()  # current in_embed word vectors
                # evaluate the word vectors on three word-similarity datasets
                sim_simlex = evaluate("simlex-999.txt", embedding_weights)
                sim_men = evaluate("men.txt", embedding_weights)
                sim_353 = evaluate("wordsim353.csv", embedding_weights)
                with open(LOG_FILE, "a") as fout:
                    print("epoch: {}, iter: {}, simlex-999: {}, men: {}, sim353: {}, nearest to often: {}\n".format(
                        e, i, sim_simlex, sim_men, sim_353, find_nearest("often")))
                    fout.write("epoch: {}, iter: {}, simlex-999: {}, men: {}, sim353: {}, nearest to often: {}\n".format(
                        e, i, sim_simlex, sim_men, sim_353, find_nearest("often")))

    embedding_weights = model.input_embeddings()  # final trained word vectors
    torch.save(model.state_dict(), 'model_embedding.th')  # save the model
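# evaluate() above comes from an earlier section of the post. If it is not available, a
# minimal sketch could look like the helper below. The name evaluate_sketch, its extra
# word_to_idx argument, and the simple "word1 word2 score" parsing are assumptions
# (simlex-999, men and wordsim353 use slightly different column layouts); it reports the
# Spearman correlation between human similarity scores and the cosine similarity of the
# learned vectors.
def evaluate_sketch(filename, embedding_weights, word_to_idx):
    from scipy.stats import spearmanr
    human_scores, model_scores = [], []
    with open(filename, "r") as fin:
        for line in fin:
            parts = line.replace(",", " ").split()
            if len(parts) < 3:
                continue
            w1, w2, raw_score = parts[0], parts[1], parts[2]
            try:
                score = float(raw_score)
            except ValueError:
                continue  # skip header lines
            if w1 not in word_to_idx or w2 not in word_to_idx:
                continue  # skip pairs with out-of-vocabulary words
            v1 = embedding_weights[word_to_idx[w1]]
            v2 = embedding_weights[word_to_idx[w2]]
            human_scores.append(score)
            model_scores.append(1 - scipy.spatial.distance.cosine(v1, v2))
    corr, _pvalue = spearmanr(human_scores, model_scores)
    return corr


# Entry point, assuming the script is run directly
if __name__ == "__main__":
    train()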