trans_envi / trans_envi.py
chinhr's picture
Upload trans_envi.py
89362ce verified
# -*- coding: utf-8 -*-
"""Bản sao của Transformer.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1Du2LiOZ4ZJ4uzIkGR_kwWgwdJEqLjW2P
"""
!pip install spacy==3.7.2
! pip -q install torchtext==0.6.0
! pip -q install pyvi
!python -m spacy download en_core_web_sm
import nltk
nltk.download('wordnet')
!pip install https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import os
import math
import nltk
import spacy
"""#Embedder
"""
class Embedder(nn.Module):
    """Learnable lookup table turning token indices into ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model):
        """Create an ``nn.Embedding`` with ``vocab_size`` rows of width ``d_model``."""
        super().__init__()
        self.vocab_size, self.d_model = vocab_size, d_model
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding rows selected by the index tensor ``x``."""
        embedded = self.embed(x)
        return embedded
"""#Position encoding
"""
class PositionalEncoder(nn.Module):
    """Add a fixed sinusoidal position table to scaled embeddings, then apply dropout.

    The table is precomputed once for ``max_seq_length`` positions and stored as
    the buffer ``pe`` with shape ``(1, max_seq_length, d_model)``.
    """

    def __init__(self, d_model, max_seq_length=200, dropout=0.1):
        """Precompute the position table.

        Args:
            d_model: width of each position vector.
            max_seq_length: number of positions in the table; ``forward`` slices
                the first ``x.size(1)`` of them.
            dropout: dropout probability applied to the summed output.
        """
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_length, d_model)
        for pos in range(max_seq_length):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** (2 * i / d_model)))
                # Guard the paired cosine slot so an odd d_model does not index
                # one past the end of the row.
                if i + 1 < d_model:
                    # NOTE: the exponents are kept exactly as in the original
                    # table; the buffer is part of the state dict, so existing
                    # checkpoints contain these values.
                    pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * i + 1) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Scale ``x`` by ``sqrt(d_model)``, add the position slice, apply dropout.

        ``x`` is indexed as ``x.size(1)`` for the sequence length, so the
        position axis is dimension 1.
        """
        x = x * math.sqrt(self.d_model)
        seq_length = x.size(1)
        # The original called ``pe.cuda()`` and discarded the result, which did
        # nothing. The buffer already follows the module across devices; ``.to``
        # is a no-op in that case and only copies if the input lives elsewhere.
        pe = self.pe[:, :seq_length].to(x.device)
        # Add the positional encoding to the embedding vectors.
        x = x + pe
        return self.dropout(x)
"""# Self Attention Layer
# And
# Multi Head Attention
"""
def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are ``q @ k^T`` over the last two axes, divided by the square root
    of ``q``'s last dimension, softmax-normalised over the last axis and then
    used to weight ``v``.

    Args:
        q, k, v: tensors whose last two axes are (sequence, features); the
            original notes describe them as batch x head x seq_length x dim.
        mask: optional tensor; an axis is inserted at position 1 (so it
            broadcasts over the head axis) and positions where it equals 0
            receive a score of -1e9 before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        ``(output, weights)``: the weighted values and the attention weights
        (after dropout, if one was given).
    """
    scale = math.sqrt(q.size(-1))
    weights = torch.matmul(q, k.transpose(-2, -1)) / scale

    if mask is not None:
        blocked = mask.unsqueeze(1) == 0
        weights = weights.masked_fill(blocked, -1e9)

    # Normalise with softmax.
    weights = F.softmax(weights, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v), weights
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, merge and project out."""

    def __init__(self, heads, d_model, dropout=0.1):
        """Build the projections; ``d_model`` must be divisible by ``heads``."""
        super().__init__()
        assert d_model % heads == 0

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        # Attention weights from the most recent forward pass.
        self.attn = None

        # Three weight matrices: q_linear, k_linear, v_linear.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, bs):
        """Reshape to (bs, -1, h, d_k) and swap axes 1 and 2 so heads come first."""
        return projected.view(bs, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Run attention over all heads.

        Per the original notes, q/k/v are batch_size x seq_length x d_model and
        mask is batch_size x 1 x seq_length. The result is reshaped to
        ``(batch, -1, d_model)`` and passed through the output projection.
        """
        bs = q.size(0)

        # Apply the q/k/v projections to the inputs and split them into heads.
        q = self._split_heads(self.q_linear(q), bs)
        k = self._split_heads(self.k_linear(k), bs)
        v = self._split_heads(self.v_linear(v), bs)

        # Compute attention; keep the weights on the module.
        context, self.attn = attention(q, k, v, mask, self.dropout)

        merged = context.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(merged)
"""# Normalization Layer
"""
class Norm(nn.Module):
    """Normalise over the last axis with a learnable scale (``alpha``) and shift (``bias``)."""

    def __init__(self, d_model, eps = 1e-6):
        """Create scale/shift parameters of length ``d_model``; ``eps`` is added to the std."""
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last axis.

        ``std`` is ``x.std`` called with its default settings.
        """
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias
class FeedForward(nn.Module):
    """Two-layer MLP: ``d_model -> d_ff -> d_model`` with ReLU and dropout in between."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        """Create both linear layers and the dropout applied after the ReLU."""
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply linear_1, ReLU, dropout, then linear_2."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each normalised first and added back residually."""

    def __init__(self, d_model, heads, dropout=0.1):
        """Create the two norms, the attention and feed-forward sublayers, and their dropouts."""
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``.

        Per the original notes, x is batch_size x seq_length x d_model and mask
        is batch_size x 1 x seq_length; the output has the shape of ``x``.
        """
        # Self-attention sublayer: normalise, attend, add back to the input.
        normed = self.norm_1(x)
        attended = self.attn(normed, normed, normed, mask)
        x = x + self.dropout_1(attended)

        # Feed-forward sublayer with the same normalise-then-add pattern.
        transformed = self.ff(self.norm_2(x))
        return x + self.dropout_2(transformed)
"""# Decoder
Decoder thực hiện chức năng giải mã vector của câu nguồn thành câu đích
## Và Masked Multi Head Attention
"""
class DecoderLayer(nn.Module):
    """One decoder block: target self-attention, attention over the encoder output, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        """Create three norms, three dropouts, two attention sublayers and the feed-forward."""
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Apply the block to the target-side activations ``x``.

        Per the original notes, x and e_outputs are batch_size x seq_length x
        d_model and both masks are batch_size x 1 x seq_length.
        """
        # First attention: target positions attend to each other under trg_mask.
        normed = self.norm_1(x)
        self_attended = self.attn_1(normed, normed, normed, trg_mask)
        x = x + self.dropout_1(self_attended)

        # Second attention: keys and values are the encoder outputs.
        normed = self.norm_2(x)
        cross_attended = self.attn_2(normed, e_outputs, e_outputs, src_mask)
        x = x + self.dropout_2(cross_attended)

        # Feed-forward sublayer.
        transformed = self.ff(self.norm_3(x))
        return x + self.dropout_3(transformed)
"""# Cài đặt Encoder
bao gồm N encoder layer
"""
import copy
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
class Encoder(nn.Module):
    """Encoder stack: embedding, positional encoding, ``N`` encoder layers, final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        """Build the embedding, the positional encoder, ``N`` cloned layers and the final norm."""
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Encode ``src``.

        Per the original notes, src is batch_size x seq_length, mask is
        batch_size x 1 x seq_length and the output is batch_size x seq_length x
        d_model.
        """
        hidden = self.pe(self.embed(src))
        for layer in self.layers:
            hidden = layer(hidden, mask)
        return self.norm(hidden)
"""# Cài đặt Decoder
bao gồm N decoder layers
"""
class Decoder(nn.Module):
    """Decoder stack: embedding, positional encoding, ``N`` decoder layers, final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        """Build the embedding, the positional encoder, ``N`` cloned layers and the final norm."""
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Decode ``trg`` against the encoder outputs.

        Per the original notes, trg is batch_size x seq_length, e_outputs is
        batch_size x seq_length x d_model, both masks are batch_size x 1 x
        seq_length and the output is batch_size x seq_length x d_model.
        """
        hidden = self.pe(self.embed(trg))
        for layer in self.layers:
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)
"""# Cài đặt Transformer
bao gồm encoder và decoder
"""
class Transformer(nn.Module):
    """Complete model: encoder, decoder and a linear projection onto the target vocabulary."""

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        """Build encoder and decoder with ``N`` layers each, plus the output projection."""
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return unnormalised scores over the target vocabulary.

        Per the original notes, src and trg are batch_size x seq_length, the
        masks are batch_size x 1 x seq_length and the output is batch_size x
        seq_length x vocab_size.
        """
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)
from torchtext import data
#torchtext để load dữ liệu, giúp giảm thời gian và hiệu quả
class MyIterator(data.Iterator):
    """torchtext iterator that groups examples sorted by ``sort_key`` into batches."""

    def create_batches(self):
        """Populate ``self.batches``.

        When ``self.train`` is set, examples are read in chunks of
        ``batch_size * 100``, each chunk is sorted by ``sort_key`` and cut into
        batches, and the batches of a chunk are yielded in shuffled order.
        Otherwise the data is batched in order and each batch is sorted.
        """
        if not self.train:
            self.batches = [
                sorted(chunk, key=self.sort_key)
                for chunk in data.batch(self.data(), self.batch_size,
                                        self.batch_size_fn)
            ]
            return

        def pool(d, random_shuffler):
            for bucket in data.batch(d, self.batch_size * 100):
                ordered = sorted(bucket, key=self.sort_key)
                minibatches = data.batch(
                    ordered, self.batch_size, self.batch_size_fn)
                yield from random_shuffler(list(minibatches))

        self.batches = pool(self.data(), self.random_shuffler)
global max_src_in_batch, max_tgt_in_batch


def batch_size_fn(new, count, sofar):
    """Return the padded size of the batch being built once ``new`` is added.

    Tracks the longest source and the longest target (plus 2) seen so far in
    module-level globals, resetting them when ``count`` is 1. The result is the
    larger of ``count * longest_src`` and ``count * longest_trg``. ``sofar`` is
    accepted but not used.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        # First example of a new batch: restart the running maxima.
        max_src_in_batch = max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    return max(count * max_src_in_batch, count * max_tgt_in_batch)
def nopeak_mask(size, device):
    """Build the look-ahead mask used by the decoder.

    The mask stops a position from attending to later positions, so the model
    cannot see future target tokens during training.

    Args:
        size: sequence length.
        device: device on which the mask is created.

    Returns:
        A bool tensor of shape ``(1, size, size)`` that is True on and below the
        diagonal and False above it.
    """
    # Built directly with torch on the target device; the original went through
    # a NumPy array and the deprecated ``Variable`` wrapper to get the same values.
    future = torch.triu(torch.ones((1, size, size), device=device), diagonal=1)
    return future == 0
def create_masks(src, trg, src_pad, trg_pad, device):
    """Build the attention masks for a source/target batch.

    Args:
        src: source index tensor.
        trg: target index tensor, or None when only the source mask is needed.
        src_pad: padding index for the source.
        trg_pad: padding index for the target.
        device: device on which the look-ahead mask is created.

    Returns:
        ``(src_mask, trg_mask)``. ``src_mask`` is True wherever ``src`` is not
        padding, with an axis inserted at position -2. ``trg_mask`` combines
        the same padding test on ``trg`` with the look-ahead mask from
        ``nopeak_mask``, or is None when ``trg`` is None.
    """
    # Padding positions are False so attention skips the PAD tokens we added.
    src_mask = (src != src_pad).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    trg_mask = (trg != trg_pad).unsqueeze(-2)
    np_mask = nopeak_mask(trg.size(1), device)
    # The original called ``np_mask.cuda()`` and discarded the result, which did
    # nothing. Align the devices explicitly so the ``&`` below cannot fail on a
    # device mismatch; this is a no-op when they already agree.
    np_mask = np_mask.to(trg_mask.device)
    trg_mask = trg_mask & np_mask
    return src_mask, trg_mask
from nltk.corpus import wordnet
import re
def get_synonym(word, SRC):
    """Return the vocabulary index of the first WordNet lemma of ``word`` that maps to a non-zero index.

    Walks every synset of ``word`` and every lemma of each synset in order and
    looks each lemma name up in ``SRC.vocab.stoi``. Returns 0 when no lemma
    maps to a non-zero index.
    """
    stoi = SRC.vocab.stoi
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            index = stoi[lemma.name()]
            if index != 0:
                return index
    return 0
def multiple_replace(dict, text):
    """Replace every occurrence of each key of ``dict`` in ``text`` with its value.

    All keys are combined into one alternation (regex-escaped, in the mapping's
    iteration order) and substituted in a single pass, so replaced text is
    never matched again.

    Args:
        dict: mapping from literal substring to replacement. The name shadows
            the builtin but is kept for compatibility with existing callers.
        text: the string to rewrite.

    Returns:
        The rewritten string; ``text`` unchanged when the mapping is empty.
    """
    # An empty mapping would compile to the pattern "()", which matches the
    # empty string everywhere and then fails on the '' key lookup.
    if not dict:
        return text
    pattern = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    return pattern.sub(lambda mo: dict[mo.group(0)], text)
def init_vars(src, model, SRC, TRG, device, k, max_len):
    """Prepare the tensors beam search starts from, once the model is trained.

    Runs the encoder once, decodes a single step from ``<sos>`` and keeps the
    ``k`` most probable first tokens.

    Returns:
        ``(outputs, e_outputs, log_scores)``: a ``(k, max_len)`` long tensor
        with column 0 set to ``<sos>`` and column 1 to the top-k first tokens;
        the encoder output copied ``k`` times; and a ``(1, k)`` tensor holding
        the log-probabilities of those first tokens.
    """
    sos_idx = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)

    # Compute the encoder output once up front.
    e_output = model.encoder(src, src_mask)

    first_input = torch.LongTensor([[sos_idx]]).to(device)
    trg_mask = nopeak_mask(1, device)

    # Predict the first token.
    decoded = model.decoder(first_input, e_output, src_mask, trg_mask)
    out = F.softmax(model.out(decoded), dim=-1)
    probs, ix = out[:, -1].data.topk(k)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)

    outputs = torch.zeros(k, max_len).long().to(device)
    outputs[:, 0] = sos_idx
    outputs[:, 1] = ix[0]

    # One copy of the encoder output per beam.
    e_outputs = torch.zeros(k, e_output.size(-2), e_output.size(-1)).to(device)
    e_outputs[:, :] = e_output[0]

    return outputs, e_outputs, log_scores
def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance the beams by one step, keeping the ``k`` best continuations.

    Args:
        outputs: long tensor of beam token ids; columns ``[:i]`` are the
            prefixes so far. Modified in place.
        out: probabilities from the model; the last position along axis 1 is used.
        log_scores: ``(1, k)`` tensor of accumulated log-probabilities per beam.
        i: the column being filled.
        k: beam width.

    Returns:
        ``(outputs, log_scores)``: the updated beams and their new ``(1, k)`` scores.
    """
    probs, ix = out[:, -1].data.topk(k)

    # Candidate score = log-probability of the new token + score of its parent beam.
    step_log_probs = torch.Tensor([math.log(p) for p in probs.data.view(-1)]).view(k, -1)
    candidates = step_log_probs + log_scores.transpose(0, 1)
    k_probs, k_ix = candidates.view(-1).topk(k)

    # Recover which parent beam (row) and which of its candidates (col) was chosen.
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    return outputs, k_probs.unsqueeze(0)
def beam_search(src, model, SRC, TRG, device, k, max_len):
    """Translate ``src`` with beam search of width ``k``.

    Args:
        src: source index tensor.
        model: model exposing ``encoder``, ``decoder`` and ``out``.
        SRC, TRG: fields whose vocabularies supply ``<pad>``, ``<sos>``, ``<eos>``.
        device: device used for the working tensors.
        k: beam width.
        max_len: maximum number of target positions.

    Returns:
        The chosen beam's tokens joined by spaces, excluding ``<sos>`` and
        everything from the first ``<eos>`` onward. Decoding stops early once
        every beam contains ``<eos>``; the beam is then picked by
        length-normalised score (exponent 0.7). Otherwise beam 0 is returned.
    """
    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, device, k, max_len)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None
    for i in range(2, max_len):
        trg_mask = nopeak_mask(i, device)
        out = model.out(model.decoder(outputs[:, :i],
                                      e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, k)

        # (beam, position) of every <eos> currently present in the beams.
        eos_positions = (outputs == eos_tok).nonzero()
        # Created on the caller's device; the original hard-coded ``.cuda()``
        # and so ignored the ``device`` argument.
        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=device)
        for vec in eos_positions:
            # Named ``beam`` so the outer loop variable ``i`` is not rebound.
            beam = vec[0]
            if sentence_lengths[beam] == 0:
                # Record only the first <eos> of each beam.
                sentence_lengths[beam] = vec[1]

        num_finished_sentences = int((sentence_lengths > 0).sum())
        if num_finished_sentences == k:
            alpha = 0.7
            div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break

    if ind is None:
        # Not every beam finished: fall back to beam 0. With no <eos> in it the
        # slice end is -1, which drops the final token (unchanged behaviour).
        eos_hits = (outputs[0] == eos_tok).nonzero()
        length = eos_hits[0] if len(eos_hits) > 0 else -1
        return ' '.join([TRG.vocab.itos[tok] for tok in outputs[0][1:length]])

    length = (outputs[ind] == eos_tok).nonzero()[0]
    return ' '.join([TRG.vocab.itos[tok] for tok in outputs[ind][1:length]])
def translate_sentence(sentence, model, SRC, TRG, device, k, max_len):
    """Translate one sentence using beam search.

    Args:
        sentence: raw source sentence.
        model: the translation model; switched to eval mode here.
        SRC, TRG: source and target fields (tokenisation and vocabularies).
        device: device the index tensor is moved to.
        k: beam width.
        max_len: maximum target length passed to beam search.

    Returns:
        The translated string with the space before ``?``, ``!``, ``.``, ``,``
        and the space after an apostrophe removed.
    """
    model.eval()
    stoi = SRC.vocab.stoi
    # The original compared against stoi['<eos>']. SRC is built without an eos
    # token, so that lookup presumably fell through to the unknown-word index;
    # name the intent directly. NOTE(review): assumes the field keeps the
    # default '<unk>' token — confirm if the field definition changes.
    unk_idx = stoi['<unk>']

    indexed = []
    for tok in SRC.preprocess(sentence):
        idx = stoi[tok]
        if idx != unk_idx:
            indexed.append(idx)
        else:
            # Out-of-vocabulary word: try a WordNet synonym that is in the vocab.
            indexed.append(get_synonym(tok, SRC))

    # ``Variable`` is deprecated; a plain tensor behaves the same.
    src = torch.LongTensor([indexed]).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        translated = beam_search(src, model, SRC, TRG, device, k, max_len)
    return multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, translated)
import re
class tokenize(object):
    """Sentence tokenizer: regex clean-up and lower-casing, then a spaCy tokenizer."""

    def __init__(self, lang):
        """Load the spaCy pipeline named ``lang``."""
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        """Return the token texts of ``sentence``.

        The substitutions run in order: the listed punctuation and newlines
        become spaces, runs of spaces collapse to one, and repeated ``!``,
        ``,`` and ``?`` collapse to a single character. The text is then
        lower-cased and tokenised; tokens equal to a single space are dropped.
        """
        substitutions = (
            (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
            (r"[ ]+", " "),
            (r"\!+", "!"),
            (r"\,+", ","),
            (r"\?+", "?"),
        )
        text = str(sentence)
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        text = text.lower()
        return [tok.text for tok in self.nlp.tokenizer(text) if tok.text != " "]
"""## Data loader
"""
!pip install dill
import os
import dill as pickle
import pandas as pd
def read_data(src_file, trg_file):
    """Read two text files and return their lines.

    Each file is read in full, stripped of leading and trailing whitespace and
    split on newlines.

    Args:
        src_file: path of the source-language file.
        trg_file: path of the target-language file.

    Returns:
        ``(src_lines, trg_lines)`` as two lists of strings.
    """
    def _read_lines(path):
        # Close the handle deterministically and decode as UTF-8 explicitly, so
        # non-ASCII text does not depend on the platform's default encoding.
        with open(path, encoding='utf-8') as handle:
            return handle.read().strip().split('\n')

    return _read_lines(src_file), _read_lines(trg_file)
def create_fields(src_lang, trg_lang):
    """Create the torchtext fields for the source and target languages.

    Both fields lower-case their input and use a ``tokenize`` instance loaded
    for their language. Only the target field adds ``<sos>`` / ``<eos>``.

    Returns:
        ``(SRC, TRG)``.
    """
    print("loading spacy tokenizers...")
    src_tokenizer = tokenize(src_lang).tokenizer
    trg_tokenizer = tokenize(trg_lang).tokenizer

    TRG = data.Field(lower=True, tokenize=trg_tokenizer,
                     init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=src_tokenizer)
    return SRC, TRG
def create_dataset(src_data, trg_data, max_strlen, batchsize, device, SRC, TRG, istrain=True):
    """Build a torchtext dataset from parallel sentences and return an iterator over it.

    Pairs in which either side contains ``max_strlen`` or more spaces are
    dropped. The remaining pairs are written to a temporary CSV in the current
    directory, loaded as a ``TabularDataset`` and the CSV is then removed. When
    ``istrain`` is true, the vocabularies of ``SRC`` and ``TRG`` are built from
    this dataset.

    Returns:
        A ``MyIterator`` over the dataset.
    """
    print("creating dataset and iterator... ")

    frame = pd.DataFrame({'src': list(src_data), 'trg': list(trg_data)},
                         columns=["src", "trg"])
    short_src = frame['src'].str.count(' ') < max_strlen
    short_trg = frame['trg'].str.count(' ') < max_strlen
    frame = frame.loc[short_src & short_trg]

    # NOTE(review): to_csv writes a header row and skip_header is not passed
    # below, so the header looks like it is read back as a data row — confirm.
    frame.to_csv("translate_transformer_temp.csv", index=False)

    train = data.TabularDataset('./translate_transformer_temp.csv', format='csv',
                                fields=[('src', SRC), ('trg', TRG)])

    train_iter = MyIterator(train, batch_size=batchsize, device=device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=istrain, shuffle=True)

    os.remove('translate_transformer_temp.csv')

    if istrain:
        SRC.build_vocab(train)
        TRG.build_vocab(train)

    return train_iter
def step(model, optimizer, batch, criterion):
    """Run one training update and return the loss as a float.

    Reads the module-level ``src_pad``, ``trg_pad`` and ``opt``.

    Args:
        model: the model; switched to train mode here.
        optimizer: wrapper exposing ``zero_grad`` and ``step_and_update_lr``.
        batch: object with ``src`` and ``trg`` tensors; axes 0 and 1 are swapped
            before use.
        criterion: loss applied to the flattened predictions and targets.
    """
    model.train()

    # Use the configured device rather than a hard-coded ``.cuda()``, matching
    # the device handed to ``create_masks`` below.
    device = opt['device']
    src = batch.src.transpose(0, 1).to(device)
    trg = batch.trg.transpose(0, 1).to(device)

    # The decoder sees the target without its last token and is scored against
    # the target shifted left by one.
    trg_input = trg[:, :-1]
    src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, device)
    preds = model(src, trg_input, src_mask, trg_mask)
    ys = trg[:, 1:].contiguous().view(-1)

    optimizer.zero_grad()
    loss = criterion(preds.view(-1, preds.size(-1)), ys)
    loss.backward()
    optimizer.step_and_update_lr()

    return loss.item()
def validiate(model, valid_iter, criterion):
    """Return the mean loss over the validation iterator.

    Reads the module-level ``src_pad``, ``trg_pad`` and ``opt``. The model is
    switched to eval mode and gradients are disabled.

    Args:
        model: the model to evaluate.
        valid_iter: iterable of batches with ``src`` and ``trg`` tensors.
        criterion: loss applied to the flattened predictions and targets.
    """
    model.eval()
    # Use the configured device rather than a hard-coded ``.cuda()``, matching
    # the device handed to ``create_masks`` below.
    device = opt['device']

    with torch.no_grad():
        total_loss = []
        for batch in valid_iter:
            src = batch.src.transpose(0, 1).to(device)
            trg = batch.trg.transpose(0, 1).to(device)

            # Same shifted input/target scheme as in training.
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input, src_pad, trg_pad, device)
            preds = model(src, trg_input, src_mask, trg_mask)
            ys = trg[:, 1:].contiguous().view(-1)

            loss = criterion(preds.view(-1, preds.size(-1)), ys)
            total_loss.append(loss.item())

    avg_loss = np.mean(total_loss)
    return avg_loss
"""# Optimizer
"""
class ScheduledOptim():
    '''Wrapper around an optimizer that sets the learning rate before every step.

    The rate is ``init_lr * d_model**-0.5 * min(n**-0.5, n * warmup**-1.5)``
    where ``n`` is the number of steps taken so far, including the current one.
    '''

    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        "Update the learning rate, then step the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients with the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        "Return the schedule factor for the current step count"
        step, warmup = self.n_steps, self.n_warmup_steps
        decay = step ** (-0.5)
        ramp = step * warmup ** (-1.5)
        return (self.d_model ** -0.5) * min(decay, ramp)

    def state_dict(self):
        "Return the schedule settings together with the inner optimizer's state"
        return {
            'init_lr': self.init_lr,
            'd_model': self.d_model,
            'n_warmup_steps': self.n_warmup_steps,
            'n_steps': self.n_steps,
            '_optimizer': self._optimizer.state_dict(),
        }

    def load_state_dict(self, state_dict):
        "Restore the schedule settings and the inner optimizer's state"
        for key in ('init_lr', 'd_model', 'n_warmup_steps', 'n_steps'):
            setattr(self, key, state_dict[key])
        self._optimizer.load_state_dict(state_dict['_optimizer'])

    def _update_learning_rate(self):
        ''' Advance the step counter and write the new rate to every param group '''
        self.n_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()
        for group in self._optimizer.param_groups:
            group['lr'] = new_lr
"""# Label Smoothing
hạn chế hiện tượng overfit
"""
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution, ignoring padding targets."""

    def __init__(self, classes, padding_idx, smoothing=0.0, dim=-1):
        """Store the class count, the padding index, the smoothing mass and the class axis."""
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.padding_idx = padding_idx

    def forward(self, pred, target):
        """Return the mean over rows of ``sum(-true_dist * log_softmax(pred))``.

        ``true_dist`` puts ``confidence`` on the target class and
        ``smoothing / (classes - 2)`` on every other class, with the padding
        column zeroed. Rows whose target is the padding index are zeroed
        entirely; they contribute 0 but still count in the mean.
        """
        log_probs = pred.log_softmax(dim=self.dim)

        with torch.no_grad():
            true_dist = torch.full_like(log_probs, self.smoothing / (self.cls - 2))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            true_dist[:, self.padding_idx] = 0
            # Blank out every row whose target is padding.
            pad_rows = target.data == self.padding_idx
            true_dist[pad_rows] = 0.0

        per_row = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_row)
from torchtext.data.metrics import bleu_score
def bleu(valid_src_data, valid_trg_data, model, SRC, TRG, device, k, max_strlen):
    """Translate every source sentence and return the BLEU score against the references.

    Each translation is passed through ``TRG.preprocess``; each reference is
    split on whitespace and wrapped as a single-reference list.
    """
    hypotheses = [
        TRG.preprocess(
            translate_sentence(sentence, model, SRC, TRG, device, k, max_strlen))
        for sentence in valid_src_data
    ]
    references = [[reference.split()] for reference in valid_trg_data]
    return bleu_score(hypotheses, references)
# Run configuration read by the script below and by step()/validiate().
opt = {
    # Paths of the parallel training and validation corpora (see read_data).
    'train_src_data':'./data/train.en',
    'train_trg_data':'./data/train.vi',
    'valid_src_data':'./data/tst2013.en',
    'valid_trg_data':'./data/tst2013.vi',
    # spaCy pipeline names passed to create_fields.
    'src_lang':'en_core_web_sm',
    'trg_lang':'vi_core_news_lg',
    # Length limit used to filter sentence pairs in create_dataset; also passed
    # as max_len when translating.
    'max_strlen':160,
    # Batch size handed to the iterator, which measures it with batch_size_fn.
    'batchsize':1500,
    # Device for the model, the iterators and the masks.
    'device':'cuda',
    # Model hyper-parameters passed to Transformer.
    'd_model': 512,
    'n_layers': 6,
    'heads': 8,
    'dropout': 0.1,
    # NOTE(review): 'lr', 'epochs' and 'printevery' are not read anywhere in the
    # visible code (the optimizer below is built with a literal 0.2) — confirm.
    'lr':0.0001,
    'epochs':30,
    'printevery': 200,
    # Beam width used by translate_sentence.
    'k':5,
}
os.makedirs('./data/', exist_ok=True)
! gdown --id 1Fuo_ALIFKlUvOPbK5rUA5OfAS2wKn_95
! unzip -o en_vi.zip
# Load the parallel corpora for training and validation.
train_src_data, train_trg_data = read_data(opt['train_src_data'], opt['train_trg_data'])
valid_src_data, valid_trg_data = read_data(opt['valid_src_data'], opt['valid_trg_data'])

# Build the fields and iterators. Only the training call (istrain=True) builds
# the vocabularies, so it has to run before the validation call.
SRC, TRG = create_fields(opt['src_lang'], opt['trg_lang'])
train_iter = create_dataset(train_src_data, train_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=True)
valid_iter = create_dataset(valid_src_data, valid_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=False)

# Padding indices read as globals by step() and validiate().
src_pad = SRC.vocab.stoi['<pad>']
trg_pad = TRG.vocab.stoi['<pad>']

# Build the model and Xavier-initialise every parameter with more than one dimension.
model = Transformer(len(SRC.vocab), len(TRG.vocab), opt['d_model'], opt['n_layers'], opt['heads'], opt['dropout'])
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
model = model.to(opt['device'])

# Adam wrapped in the warm-up schedule (init_lr 0.2, 4000 warm-up steps).
optimizer = ScheduledOptim(
    torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    0.2, opt['d_model'], 4000)
criterion = LabelSmoothingLoss(len(TRG.vocab), padding_idx=trg_pad, smoothing=0.1)

# Load the trained weights. map_location places the checkpoint tensors on the
# configured device no matter which device they were saved from.
model.load_state_dict(torch.load('./transformer.pth', map_location=opt['device']))

# Translate a sample sentence; the bare name shows the result in a notebook.
sentence='what is your name'
trans_sent = translate_sentence(sentence, model, SRC, TRG, opt['device'], opt['k'], opt['max_strlen'])
trans_sent