chinhr committed on
Commit 89362ce · verified · 1 Parent(s): 80e73be

Upload trans_envi.py

Files changed (1)
  1. trans_envi.py +812 -0
trans_envi.py ADDED
@@ -0,0 +1,812 @@
+ # -*- coding: utf-8 -*-
+ """Copy of Transformer.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1Du2LiOZ4ZJ4uzIkGR_kwWgwdJEqLjW2P
+ """
+
+ !pip install spacy==3.7.2
+
+ !pip -q install torchtext==0.6.0
+ !pip -q install pyvi
+ !python -m spacy download en_core_web_sm
+ import nltk
+ nltk.download('wordnet')
+
+ # Vietnamese spaCy model, used for target-side tokenization
+ !pip install https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
+
+ import torch
+ import torch.nn as nn
+ from torch.autograd import Variable
+ import torch.nn.functional as F
+ import numpy as np
+ import os
+ import math
+ import spacy
+
+ """# Embedder
+
+ """
+
+ class Embedder(nn.Module):
+     def __init__(self, vocab_size, d_model):
+         super().__init__()
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+
+         self.embed = nn.Embedding(vocab_size, d_model)
+
+     def forward(self, x):
+         return self.embed(x)
+
+ """# Positional encoding
+
+ """
+
+ class PositionalEncoder(nn.Module):
+     def __init__(self, d_model, max_seq_length=200, dropout=0.1):
+         super().__init__()
+
+         self.d_model = d_model
+         self.dropout = nn.Dropout(dropout)
+
+         pe = torch.zeros(max_seq_length, d_model)
+
+         # alternate sine and cosine across the embedding dimensions
+         for pos in range(max_seq_length):
+             for i in range(0, d_model, 2):
+                 pe[pos, i] = math.sin(pos/(10000**(2*i/d_model)))
+                 pe[pos, i+1] = math.cos(pos/(10000**((2*i+1)/d_model)))
+         pe = pe.unsqueeze(0)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         # scale the embeddings before adding the positional signal
+         x = x*math.sqrt(self.d_model)
+         seq_length = x.size(1)
+
+         pe = Variable(self.pe[:, :seq_length], requires_grad=False)
+
+         if x.is_cuda:
+             pe = pe.cuda()  # .cuda() is not in-place; the result must be reassigned
+         # add the positional encoding to the embedding vectors
+         x = x + pe
+         x = self.dropout(x)
+
+         return x
+
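+ # Quick shape sanity check for the two blocks above (illustrative only;
+ # toy dimensions, not the ones used for training):
+ _emb = Embedder(vocab_size=10, d_model=8)
+ _pe = PositionalEncoder(d_model=8, max_seq_length=20)
+ _tokens = torch.randint(0, 10, (2, 5))          # batch_size=2, seq_length=5
+ assert _pe(_emb(_tokens)).shape == (2, 5, 8)    # batch_size x seq_length x d_model
+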
+ """# Self-Attention and Multi-Head Attention
+
+ """
+
+ def attention(q, k, v, mask=None, dropout=None):
+     """
+     q: batch_size x head x seq_length x d_k
+     k: batch_size x head x seq_length x d_k
+     v: batch_size x head x seq_length x d_k
+     mask: batch_size x 1 x seq_length (or batch_size x seq_length x seq_length for the target)
+     output: batch_size x head x seq_length x d_k
+     """
+
+     # attention scores are the scaled dot products of q and k
+     d_k = q.size(-1)
+     scores = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(d_k)
+
+     if mask is not None:
+         mask = mask.unsqueeze(1)
+         scores = scores.masked_fill(mask==0, -1e9)
+     # normalize with softmax
+     scores = F.softmax(scores, dim=-1)
+
+     if dropout is not None:
+         scores = dropout(scores)
+
+     output = torch.matmul(scores, v)
+     return output, scores
+
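+ # Tiny worked example of scaled dot-product attention (illustrative values):
+ # identical q and k rows give equal scores, so softmax is uniform over positions.
+ _q = torch.ones(1, 1, 3, 4)                     # batch=1, head=1, seq=3, d_k=4
+ _k = torch.ones(1, 1, 3, 4)
+ _v = torch.arange(12.).view(1, 1, 3, 4)
+ _out, _scores = attention(_q, _k, _v)
+ assert torch.allclose(_scores, torch.full((1, 1, 3, 3), 1/3))
+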
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, heads, d_model, dropout=0.1):
+         super().__init__()
+         assert d_model % heads == 0
+
+         self.d_model = d_model
+         self.d_k = d_model//heads
+         self.h = heads
+         self.attn = None
+
+         # the three projection matrices q_linear, k_linear, v_linear
+         self.q_linear = nn.Linear(d_model, d_model)
+         self.k_linear = nn.Linear(d_model, d_model)
+         self.v_linear = nn.Linear(d_model, d_model)
+
+         self.dropout = nn.Dropout(dropout)
+         self.out = nn.Linear(d_model, d_model)
+
+     def forward(self, q, k, v, mask=None):
+         """
+         q: batch_size x seq_length x d_model
+         k: batch_size x seq_length x d_model
+         v: batch_size x seq_length x d_model
+         mask: batch_size x 1 x seq_length
+         output: batch_size x seq_length x d_model
+         """
+         bs = q.size(0)
+         # project the inputs q, k, v and split them into heads
+         q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
+         k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
+         v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
+
+         q = q.transpose(1, 2)
+         k = k.transpose(1, 2)
+         v = v.transpose(1, 2)
+
+         # per-head attention; self.attn keeps the attention weights
+         scores, self.attn = attention(q, k, v, mask, self.dropout)
+
+         # concatenate the heads and apply the final projection
+         concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
+
+         output = self.out(concat)
+         return output
+
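+ # Shape check for multi-head attention (illustrative: 2 heads of d_k=4).
+ _mha = MultiHeadAttention(heads=2, d_model=8)
+ _inp = torch.randn(2, 5, 8)
+ assert _mha(_inp, _inp, _inp).shape == (2, 5, 8)
+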
+ """# Normalization Layer
+
+ """
+
+ class Norm(nn.Module):
+     def __init__(self, d_model, eps=1e-6):
+         super().__init__()
+
+         self.size = d_model
+
+         # learnable gain and bias, as in layer normalization
+         self.alpha = nn.Parameter(torch.ones(self.size))
+         self.bias = nn.Parameter(torch.zeros(self.size))
+
+         self.eps = eps
+
+     def forward(self, x):
+         norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
+             / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
+         return norm
+
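+ # Norm matches nn.LayerNorm over the last dimension, except that x.std()
+ # here is the unbiased estimator, so the two agree only approximately
+ # (an illustrative comparison with toy shapes):
+ _n = Norm(8)
+ _t = torch.randn(2, 5, 8)
+ print((_n(_t) - nn.LayerNorm(8, eps=1e-6)(_t)).abs().max())  # small but nonzero
+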
+ class FeedForward(nn.Module):
+
+     def __init__(self, d_model, d_ff=2048, dropout=0.1):
+         super().__init__()
+
+         self.linear_1 = nn.Linear(d_model, d_ff)
+         self.dropout = nn.Dropout(dropout)
+         self.linear_2 = nn.Linear(d_ff, d_model)
+
+     def forward(self, x):
+         x = self.dropout(F.relu(self.linear_1(x)))
+         x = self.linear_2(x)
+         return x
+
+ class EncoderLayer(nn.Module):
+     def __init__(self, d_model, heads, dropout=0.1):
+         super().__init__()
+         self.norm_1 = Norm(d_model)
+         self.norm_2 = Norm(d_model)
+         self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
+         self.ff = FeedForward(d_model, dropout=dropout)
+         self.dropout_1 = nn.Dropout(dropout)
+         self.dropout_2 = nn.Dropout(dropout)
+
+     def forward(self, x, mask):
+         """
+         x: batch_size x seq_length x d_model
+         mask: batch_size x 1 x seq_length
+         output: batch_size x seq_length x d_model
+         """
+         x2 = self.norm_1(x)
+         # self-attention sublayer with a residual connection (pre-norm)
+         x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
+         x2 = self.norm_2(x)
+         x = x + self.dropout_2(self.ff(x2))
+         return x
+
+ """# Decoder
+ The decoder turns the encoded source sentence into the target sentence.
+
+ ## And Masked Multi-Head Attention
+
+ """
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, d_model, heads, dropout=0.1):
+         super().__init__()
+         self.norm_1 = Norm(d_model)
+         self.norm_2 = Norm(d_model)
+         self.norm_3 = Norm(d_model)
+
+         self.dropout_1 = nn.Dropout(dropout)
+         self.dropout_2 = nn.Dropout(dropout)
+         self.dropout_3 = nn.Dropout(dropout)
+
+         self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
+         self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
+         self.ff = FeedForward(d_model, dropout=dropout)
+
+     def forward(self, x, e_outputs, src_mask, trg_mask):
+         """
+         x: batch_size x seq_length x d_model
+         e_outputs: batch_size x seq_length x d_model
+         src_mask: batch_size x 1 x seq_length
+         trg_mask: batch_size x seq_length x seq_length
+         """
+         x2 = self.norm_1(x)
+         # first, masked multi-head attention: self-attention over the target tokens
+         x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
+         x2 = self.norm_2(x)
+         # second multi-head attention: k and v come from the encoder output
+         x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
+         x2 = self.norm_3(x)
+         x = x + self.dropout_3(self.ff(x2))
+         return x
+
+ """# Encoder implementation
+ consisting of N encoder layers
+ """
+
+ import copy
+
+ def get_clones(module, N):
+     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+ class Encoder(nn.Module):
+     """An encoder is a stack of N encoder layers."""
+     def __init__(self, vocab_size, d_model, N, heads, dropout):
+         super().__init__()
+         self.N = N
+         self.embed = Embedder(vocab_size, d_model)
+         self.pe = PositionalEncoder(d_model, dropout=dropout)
+         self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
+         self.norm = Norm(d_model)
+
+     def forward(self, src, mask):
+         """
+         src: batch_size x seq_length
+         mask: batch_size x 1 x seq_length
+         output: batch_size x seq_length x d_model
+         """
+         x = self.embed(src)
+         x = self.pe(x)
+         for i in range(self.N):
+             x = self.layers[i](x, mask)
+         return self.norm(x)
+
+ """# Decoder implementation
+ consisting of N decoder layers
+ """
+
+ class Decoder(nn.Module):
+     """A decoder is a stack of N decoder layers."""
+     def __init__(self, vocab_size, d_model, N, heads, dropout):
+         super().__init__()
+         self.N = N
+         self.embed = Embedder(vocab_size, d_model)
+         self.pe = PositionalEncoder(d_model, dropout=dropout)
+         self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
+         self.norm = Norm(d_model)
+
+     def forward(self, trg, e_outputs, src_mask, trg_mask):
+         """
+         trg: batch_size x seq_length
+         e_outputs: batch_size x seq_length x d_model
+         src_mask: batch_size x 1 x seq_length
+         trg_mask: batch_size x seq_length x seq_length
+         output: batch_size x seq_length x d_model
+         """
+         x = self.embed(trg)
+         x = self.pe(x)
+         for i in range(self.N):
+             x = self.layers[i](x, e_outputs, src_mask, trg_mask)
+         return self.norm(x)
+
+ """# Transformer implementation
+ combining the encoder and the decoder
+ """
+
+ class Transformer(nn.Module):
+     """The complete Transformer model."""
+     def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
+         super().__init__()
+         self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
+         self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
+         self.out = nn.Linear(d_model, trg_vocab)
+
+     def forward(self, src, trg, src_mask, trg_mask):
+         """
+         src: batch_size x seq_length
+         trg: batch_size x seq_length
+         src_mask: batch_size x 1 x seq_length
+         trg_mask: batch_size x seq_length x seq_length
+         output: batch_size x seq_length x vocab_size
+         """
+         e_outputs = self.encoder(src, src_mask)
+
+         d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
+         output = self.out(d_output)
+         return output
+
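+ # End-to-end shape check with a toy model (illustrative dimensions only;
+ # masks may be None, in which case attention() simply skips masking):
+ _tiny = Transformer(src_vocab=11, trg_vocab=13, d_model=8, N=2, heads=2, dropout=0.1)
+ _src = torch.randint(0, 11, (2, 7))
+ _trg = torch.randint(0, 13, (2, 6))
+ assert _tiny(_src, _trg, None, None).shape == (2, 6, 13)   # logits over trg vocab
+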
+ from torchtext import data
+
+ # torchtext iterator that pools sentences of similar length into batches,
+ # which cuts padding and makes training faster
+ class MyIterator(data.Iterator):
+     def create_batches(self):
+         if self.train:
+             def pool(d, random_shuffler):
+                 for p in data.batch(d, self.batch_size * 100):
+                     p_batch = data.batch(
+                         sorted(p, key=self.sort_key),
+                         self.batch_size, self.batch_size_fn)
+                     for b in random_shuffler(list(p_batch)):
+                         yield b
+             self.batches = pool(self.data(), self.random_shuffler)
+
+         else:
+             self.batches = []
+             for b in data.batch(self.data(), self.batch_size,
+                                 self.batch_size_fn):
+                 self.batches.append(sorted(b, key=self.sort_key))
+
+ global max_src_in_batch, max_tgt_in_batch
+
+ def batch_size_fn(new, count, sofar):
+     """Token count the batch would reach if `new` were added (used by MyIterator)."""
+     global max_src_in_batch, max_tgt_in_batch
+     if count == 1:
+         max_src_in_batch = 0
+         max_tgt_in_batch = 0
+     max_src_in_batch = max(max_src_in_batch, len(new.src))
+     max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <sos>/<eos>
+     src_elements = count * max_src_in_batch
+     tgt_elements = count * max_tgt_in_batch
+     return max(src_elements, tgt_elements)
+
+ def nopeak_mask(size, device):
+     """Upper-triangular mask used in the decoder so that, while training,
+     a position cannot see future tokens when predicting."""
+     np_mask = np.triu(np.ones((1, size, size)),
+                       k=1).astype('uint8')
+     np_mask = Variable(torch.from_numpy(np_mask) == 0)
+     np_mask = np_mask.to(device)
+
+     return np_mask
+
+ def create_masks(src, trg, src_pad, trg_pad, device):
+     """Create the source mask (so attention ignores the PAD tokens we added)
+     and the target mask (padding mask combined with the no-peek mask)."""
+     src_mask = (src != src_pad).unsqueeze(-2)
+
+     if trg is not None:
+         trg_mask = (trg != trg_pad).unsqueeze(-2)
+         size = trg.size(1)
+         np_mask = nopeak_mask(size, device)
+         if trg.is_cuda:
+             np_mask = np_mask.cuda()  # reassign: .cuda() is not in-place
+         trg_mask = trg_mask & np_mask
+
+     else:
+         trg_mask = None
+     return src_mask, trg_mask
+
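+ # Mask shapes on a toy batch (illustrative; token id 1 plays the role of <pad>):
+ _src_toy = torch.tensor([[4, 5, 6, 1, 1]])
+ _trg_toy = torch.tensor([[2, 7, 8, 1]])
+ _sm, _tm = create_masks(_src_toy, _trg_toy, src_pad=1, trg_pad=1, device='cpu')
+ assert _sm.shape == (1, 1, 5)                   # batch_size x 1 x src_len
+ assert _tm.shape == (1, 4, 4)                   # batch_size x trg_len x trg_len
+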
+ from nltk.corpus import wordnet
+ import re
+
+ def get_synonym(word, SRC):
+     """Map an out-of-vocabulary word to a WordNet synonym that is in the vocab."""
+     syns = wordnet.synsets(word)
+     for s in syns:
+         for l in s.lemmas():
+             if SRC.vocab.stoi[l.name()] != 0:
+                 return SRC.vocab.stoi[l.name()]
+
+     return 0
+
+ def multiple_replace(replacements, text):
+     # replacements: mapping of substrings to substitutes
+     # (named to avoid shadowing the builtin dict)
+     regex = re.compile("(%s)" % "|".join(map(re.escape, replacements.keys())))
+
+     return regex.sub(lambda mo: replacements[mo.string[mo.start():mo.end()]], text)
+
+ def init_vars(src, model, SRC, TRG, device, k, max_len):
+     """Set up the tensors needed for beam-search translation once the model is trained."""
+     init_tok = TRG.vocab.stoi['<sos>']
+     src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
+
+     # precompute the encoder output once; it is reused for every beam
+     e_output = model.encoder(src, src_mask)
+
+     outputs = torch.LongTensor([[init_tok]])
+     outputs = outputs.to(device)
+
+     trg_mask = nopeak_mask(1, device)
+     # predict the first token
+     out = model.out(model.decoder(outputs,
+                                   e_output, src_mask, trg_mask))
+     out = F.softmax(out, dim=-1)
+
+     probs, ix = out[:, -1].data.topk(k)
+     log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)
+
+     outputs = torch.zeros(k, max_len).long()
+     outputs = outputs.to(device)
+     outputs[:, 0] = init_tok
+     outputs[:, 1] = ix[0]
+
+     # replicate the encoder output k times, once per beam
+     e_outputs = torch.zeros(k, e_output.size(-2), e_output.size(-1))
+     e_outputs = e_outputs.to(device)
+     e_outputs[:, :] = e_output[0]
+
+     return outputs, e_outputs, log_scores
+
+ def k_best_outputs(outputs, out, log_scores, i, k):
+     """Keep the k highest-scoring partial translations at decoding step i."""
+     probs, ix = out[:, -1].data.topk(k)
+     log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1) + log_scores.transpose(0, 1)
+     k_probs, k_ix = log_probs.view(-1).topk(k)
+
+     row = k_ix // k   # which beam each winner came from
+     col = k_ix % k    # its rank within that beam
+
+     outputs[:, :i] = outputs[row, :i]
+     outputs[:, i] = ix[row, col]
+
+     log_scores = k_probs.unsqueeze(0)
+
+     return outputs, log_scores
+
+ def beam_search(src, model, SRC, TRG, device, k, max_len):
+
+     outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, device, k, max_len)
+     eos_tok = TRG.vocab.stoi['<eos>']
+     src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
+     ind = None
+     for i in range(2, max_len):
+
+         trg_mask = nopeak_mask(i, device)
+
+         out = model.out(model.decoder(outputs[:, :i],
+                                       e_outputs, src_mask, trg_mask))
+
+         out = F.softmax(out, dim=-1)
+
+         outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, k)
+
+         # positions where a beam has produced <eos>
+         ones = (outputs == eos_tok).nonzero()
+         sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=device)
+         for vec in ones:
+             beam_idx = vec[0]  # renamed from i to avoid shadowing the loop variable
+             if sentence_lengths[beam_idx] == 0:  # first <eos> in this beam
+                 sentence_lengths[beam_idx] = vec[1]
+
+         num_finished_sentences = len([s for s in sentence_lengths if s > 0])
+
+         if num_finished_sentences == k:
+             # length-normalize the scores and pick the best finished beam
+             alpha = 0.7
+             div = 1/(sentence_lengths.type_as(log_scores)**alpha)
+             _, ind = torch.max(log_scores * div, 1)
+             ind = ind.data[0]
+             break
+
+     if ind is None:
+         length = (outputs[0] == eos_tok).nonzero()[0] if len((outputs[0] == eos_tok).nonzero()) > 0 else -1
+         return ' '.join([TRG.vocab.itos[tok] for tok in outputs[0][1:length]])
+
+     else:
+         length = (outputs[ind] == eos_tok).nonzero()[0]
+         return ' '.join([TRG.vocab.itos[tok] for tok in outputs[ind][1:length]])
+
+ def translate_sentence(sentence, model, SRC, TRG, device, k, max_len):
+     """Translate one sentence using beam search."""
+     model.eval()
+     indexed = []
+     sentence = SRC.preprocess(sentence)
+
+     for tok in sentence:
+         # SRC has no '<eos>' token, so stoi['<eos>'] is the <unk> index (0);
+         # this check therefore detects out-of-vocabulary tokens
+         if SRC.vocab.stoi[tok] != SRC.vocab.stoi['<eos>']:
+             indexed.append(SRC.vocab.stoi[tok])
+         else:
+             indexed.append(get_synonym(tok, SRC))
+
+     sentence = Variable(torch.LongTensor([indexed]))
+     sentence = sentence.to(device)
+
+     sentence = beam_search(sentence, model, SRC, TRG, device, k, max_len)
+
+     # re-attach punctuation that the tokenizer separated
+     return multiple_replace({' ?': '?', ' !': '!', ' .': '.', '\' ': '\'', ' ,': ','}, sentence)
+
+ class tokenize(object):
+
+     def __init__(self, lang):
+         self.nlp = spacy.load(lang)
+
+     def tokenizer(self, sentence):
+         # strip stray symbols and collapse repeated punctuation before tokenizing
+         sentence = re.sub(
+             r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
+         sentence = re.sub(r"[ ]+", " ", sentence)
+         sentence = re.sub(r"\!+", "!", sentence)
+         sentence = re.sub(r"\,+", ",", sentence)
+         sentence = re.sub(r"\?+", "?", sentence)
+         sentence = sentence.lower()
+         return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]
+
+ """## Data loader
+
+ """
+
+ !pip install dill
+
+ import dill as pickle
+ import pandas as pd
+
+ def read_data(src_file, trg_file):
+     """Read parallel data: one sentence per line, source and target aligned."""
+     with open(src_file, encoding='utf-8') as f:
+         src_data = f.read().strip().split('\n')
+
+     with open(trg_file, encoding='utf-8') as f:
+         trg_data = f.read().strip().split('\n')
+
+     return src_data, trg_data
+
+ def create_fields(src_lang, trg_lang):
+
+     print("loading spacy tokenizers...")
+
+     t_src = tokenize(src_lang)
+     t_trg = tokenize(trg_lang)
+
+     TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
+     SRC = data.Field(lower=True, tokenize=t_src.tokenizer)
+
+     return SRC, TRG
+
+ def create_dataset(src_data, trg_data, max_strlen, batchsize, device, SRC, TRG, istrain=True):
+
+     print("creating dataset and iterator... ")
+
+     raw_data = {'src': [line for line in src_data], 'trg': [line for line in trg_data]}
+     df = pd.DataFrame(raw_data, columns=["src", "trg"])
+
+     # drop sentence pairs longer than max_strlen tokens
+     mask = (df['src'].str.count(' ') < max_strlen) & (df['trg'].str.count(' ') < max_strlen)
+     df = df.loc[mask]
+
+     df.to_csv("translate_transformer_temp.csv", index=False)
+
+     data_fields = [('src', SRC), ('trg', TRG)]
+     train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)
+
+     train_iter = MyIterator(train, batch_size=batchsize, device=device,
+                             repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
+                             batch_size_fn=batch_size_fn, train=istrain, shuffle=True)
+
+     os.remove('translate_transformer_temp.csv')
+
+     if istrain:
+         SRC.build_vocab(train)
+         TRG.build_vocab(train)
+
+     return train_iter
+
+ def step(model, optimizer, batch, criterion):
+     """One model update step (uses the globals opt, src_pad and trg_pad)."""
+     model.train()
+
+     src = batch.src.transpose(0, 1).to(opt['device'])
+     trg = batch.trg.transpose(0, 1).to(opt['device'])
+     trg_input = trg[:, :-1]  # feed the target shifted right
+     src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, opt['device'])
+     preds = model(src, trg_input, src_mask, trg_mask)
+
+     ys = trg[:, 1:].contiguous().view(-1)  # the expected next tokens
+
+     optimizer.zero_grad()
+     loss = criterion(preds.view(-1, preds.size(-1)), ys)
+     loss.backward()
+     optimizer.step_and_update_lr()
+
+     return loss.item()
+
+ def validate(model, valid_iter, criterion):
+     """Compute the average loss on the validation set."""
+     model.eval()
+
+     with torch.no_grad():
+         total_loss = []
+         for batch in valid_iter:
+             src = batch.src.transpose(0, 1).to(opt['device'])
+             trg = batch.trg.transpose(0, 1).to(opt['device'])
+             trg_input = trg[:, :-1]
+             src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, opt['device'])
+             preds = model(src, trg_input, src_mask, trg_mask)
+
+             ys = trg[:, 1:].contiguous().view(-1)
+
+             loss = criterion(preds.view(-1, preds.size(-1)), ys)
+             total_loss.append(loss.item())
+
+     avg_loss = np.mean(total_loss)
+
+     return avg_loss
+
+ """# Optimizer
+
+ """
+
+ class ScheduledOptim():
+     '''A simple wrapper class for learning rate scheduling'''
+
+     def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
+         self._optimizer = optimizer
+         self.init_lr = init_lr
+         self.d_model = d_model
+         self.n_warmup_steps = n_warmup_steps
+         self.n_steps = 0
+
+     def step_and_update_lr(self):
+         "Step with the inner optimizer"
+         self._update_learning_rate()
+         self._optimizer.step()
+
+     def zero_grad(self):
+         "Zero out the gradients with the inner optimizer"
+         self._optimizer.zero_grad()
+
+     def _get_lr_scale(self):
+         # the "Noam" schedule: linear warmup, then decay with 1/sqrt(step)
+         d_model = self.d_model
+         n_steps, n_warmup_steps = self.n_steps, self.n_warmup_steps
+         return (d_model ** -0.5) * min(n_steps ** (-0.5), n_steps * n_warmup_steps ** (-1.5))
+
+     def state_dict(self):
+         optimizer_state_dict = {
+             'init_lr': self.init_lr,
+             'd_model': self.d_model,
+             'n_warmup_steps': self.n_warmup_steps,
+             'n_steps': self.n_steps,
+             '_optimizer': self._optimizer.state_dict(),
+         }
+
+         return optimizer_state_dict
+
+     def load_state_dict(self, state_dict):
+         self.init_lr = state_dict['init_lr']
+         self.d_model = state_dict['d_model']
+         self.n_warmup_steps = state_dict['n_warmup_steps']
+         self.n_steps = state_dict['n_steps']
+
+         self._optimizer.load_state_dict(state_dict['_optimizer'])
+
+     def _update_learning_rate(self):
+         ''' Learning rate scheduling per step '''
+         self.n_steps += 1
+         lr = self.init_lr * self._get_lr_scale()
+
+         for param_group in self._optimizer.param_groups:
+             param_group['lr'] = lr
+
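+ # The warmup schedule in numbers (an illustrative helper mirroring
+ # _get_lr_scale, with init_lr=0.2, d_model=512, n_warmup_steps=4000 as used
+ # below): lr rises linearly to its peak at step 4000, then decays as 1/sqrt(step).
+ def _noam_lr(step, init_lr=0.2, d_model=512, warmup=4000):
+     return init_lr * (d_model ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)
+ for _s in (100, 4000, 16000):
+     print(_s, round(_noam_lr(_s), 6))           # peak lr ≈ 0.2/sqrt(512*4000)
+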
+ """# Label Smoothing
+ mitigates overfitting
+
+ """
+
+ class LabelSmoothingLoss(nn.Module):
+     def __init__(self, classes, padding_idx, smoothing=0.0, dim=-1):
+         super(LabelSmoothingLoss, self).__init__()
+         self.confidence = 1.0 - smoothing
+         self.smoothing = smoothing
+         self.cls = classes
+         self.dim = dim
+         self.padding_idx = padding_idx
+
+     def forward(self, pred, target):
+         pred = pred.log_softmax(dim=self.dim)
+         with torch.no_grad():
+             # build the smoothed target distribution
+             true_dist = torch.zeros_like(pred)
+             true_dist.fill_(self.smoothing / (self.cls - 2))
+             true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
+             true_dist[:, self.padding_idx] = 0
+             # zero out rows whose target is padding
+             mask = torch.nonzero(target.data == self.padding_idx, as_tuple=False)
+             if mask.dim() > 0:
+                 true_dist.index_fill_(0, mask.squeeze(), 0.0)
+
+         return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
+
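+ # What the smoothed target looks like (illustrative): with 5 classes,
+ # smoothing=0.1 and padding_idx=0, a target of class 3 becomes
+ # [0.0, 0.0333, 0.0333, 0.9, 0.0333]: confidence 0.9 on the gold class,
+ # smoothing/(classes-2) elsewhere, and nothing on the padding column.
+ _ls = LabelSmoothingLoss(classes=5, padding_idx=0, smoothing=0.1)
+ print(_ls(torch.zeros(1, 5), torch.tensor([3])))
+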
+ from torchtext.data.metrics import bleu_score
+
+ def bleu(valid_src_data, valid_trg_data, model, SRC, TRG, device, k, max_strlen):
+     """Corpus BLEU of the beam-search translations against the references."""
+     pred_sents = []
+     for sentence in valid_src_data:
+         pred_trg = translate_sentence(sentence, model, SRC, TRG, device, k, max_strlen)
+         pred_sents.append(pred_trg)
+
+     pred_sents = [TRG.preprocess(sent) for sent in pred_sents]
+     trg_sents = [[sent.split()] for sent in valid_trg_data]
+
+     return bleu_score(pred_sents, trg_sents)
+
+ opt = {
+     'train_src_data': './data/train.en',
+     'train_trg_data': './data/train.vi',
+     'valid_src_data': './data/tst2013.en',
+     'valid_trg_data': './data/tst2013.vi',
+     'src_lang': 'en_core_web_sm',
+     'trg_lang': 'vi_core_news_lg',
+     'max_strlen': 160,
+     'batchsize': 1500,
+     'device': 'cuda',
+     'd_model': 512,
+     'n_layers': 6,
+     'heads': 8,
+     'dropout': 0.1,
+     'lr': 0.0001,
+     'epochs': 30,
+     'printevery': 200,
+     'k': 5,
+ }
+
+ # download and unpack the English-Vietnamese parallel data
+ os.makedirs('./data/', exist_ok=True)
+ !gdown --id 1Fuo_ALIFKlUvOPbK5rUA5OfAS2wKn_95
+
+ !unzip -o en_vi.zip
+
+ train_src_data, train_trg_data = read_data(opt['train_src_data'], opt['train_trg_data'])
+ valid_src_data, valid_trg_data = read_data(opt['valid_src_data'], opt['valid_trg_data'])
+
+ SRC, TRG = create_fields(opt['src_lang'], opt['trg_lang'])
+ train_iter = create_dataset(train_src_data, train_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=True)
+ valid_iter = create_dataset(valid_src_data, valid_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=False)
+
+ src_pad = SRC.vocab.stoi['<pad>']
+ trg_pad = TRG.vocab.stoi['<pad>']
+
+ model = Transformer(len(SRC.vocab), len(TRG.vocab), opt['d_model'], opt['n_layers'], opt['heads'], opt['dropout'])
+
+ # Xavier initialization for all weight matrices
+ for p in model.parameters():
+     if p.dim() > 1:
+         nn.init.xavier_uniform_(p)
+
+ model = model.to(opt['device'])
+
+ optimizer = ScheduledOptim(
+     torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
+     0.2, opt['d_model'], 4000)
+
+ criterion = LabelSmoothingLoss(len(TRG.vocab), padding_idx=trg_pad, smoothing=0.1)
+
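+ # step() and validate() above are defined but never called: the notebook only
+ # loads pretrained weights below. A minimal training loop would look roughly
+ # like this (a sketch under that assumption; flip the flag to actually train):
+ TRAIN_FROM_SCRATCH = False
+ if TRAIN_FROM_SCRATCH:
+     for epoch in range(opt['epochs']):
+         for i, batch in enumerate(train_iter):
+             loss = step(model, optimizer, batch, criterion)
+             if i % opt['printevery'] == 0:
+                 print(f"epoch {epoch} iter {i} train loss {loss:.4f}")
+         print(f"epoch {epoch} valid loss {validate(model, valid_iter, criterion):.4f}")
+     torch.save(model.state_dict(), './transformer.pth')
+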
+ # load pretrained weights instead of training from scratch
+ model.load_state_dict(torch.load('./transformer.pth', map_location=opt['device']))
+
+ sentence = 'what is your name'
+
+ trans_sent = translate_sentence(sentence, model, SRC, TRG, opt['device'], opt['k'], opt['max_strlen'])
+ print(trans_sent)