nemabruh404 committed on
Commit
f595d09
·
verified ·
1 Parent(s): 923db71

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +1 -14
  2. app.py +71 -0
  3. main.py +7 -0
  4. model.py +151 -0
  5. requirements.txt +6 -0
  6. utils.py +23 -0
README.md CHANGED
@@ -1,14 +1 @@
1
- ---
2
- title: Machine Translation
3
- emoji: 🐠
4
- colorFrom: green
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.43.1
8
- app_file: app.py
9
- pinned: false
10
- license: cc0-1.0
11
- short_description: translate english to vietnamese
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Machine_Translation
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from io import BytesIO
4
+ import requests
5
+ from model import TransformerSeq2Seq,translate
6
+ from utils import load_tokenizers_and_embeddings
7
+
8
+ import torch
9
+
10
+ # class mô hình của bạn
11
+
12
app = FastAPI()

# ===== 1. Load tokenizers, embeddings and the model once, at server start-up =====
# Everything below runs at import time so each request only pays for inference.
resources = load_tokenizers_and_embeddings()
tokenizer_vi = resources["tokenizer_vi"]
embedding_matrix_vi = resources["embedding_vi"]
tokenizer_en = resources["tokenizer_en"]
embedding_matrix_en = resources["embedding_en"]
# The device picked by utils is the single source of truth: the embedding
# matrices already live on it, so the model must be moved to the same one.
device = resources["device"]

print("✅ Tokenizers & embeddings loaded!")
if isinstance(embedding_matrix_en, torch.Tensor):
    embed_dim = embedding_matrix_en.size(1)
else:  # nn.Embedding
    embed_dim = embedding_matrix_en.embedding_dim
max_len = 128
batch_size = 32

# Build the model skeleton; the weights are filled in from the checkpoint below.
model = TransformerSeq2Seq(
    embed_dim=embed_dim,
    vocab_size=tokenizer_vi.vocab_size,  # or len(tokenizer_vi)
    embedding_decoder=embedding_matrix_vi,  # pretrained target-side embedding
    num_heads=4,
    num_layers=2,
    dim_feedforward=256,
    dropout=0.1,
    freeze_decoder_emb=True,
    max_len=max_len,
)
MODEL_URL = "https://huggingface.co/nemabruh404/Machine_Translation/resolve/main/model_state_dict.pt"
# (connect, read) timeouts in seconds, so a stalled download cannot hang start-up forever.
MODEL_DOWNLOAD_TIMEOUT = (10, 60)

# Fetch the checkpoint from the Hub. Fail loudly on an HTTP error instead of
# handing an error page to torch.load, which would surface as an unpickling error.
response = requests.get(MODEL_URL, timeout=MODEL_DOWNLOAD_TIMEOUT)
response.raise_for_status()
checkpoint_bytes = BytesIO(response.content)
# NOTE(security): torch.load unpickles the downloaded bytes. Depending on the
# installed torch version's `weights_only` default this can execute code
# embedded in the remote file. Consider passing weights_only=True once the
# checkpoint is confirmed to contain only tensors and primitives.
checkpoint = torch.load(checkpoint_bytes, map_location=device)

# Load the trained weights and switch to inference mode.
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval()

print("✅ Model loaded from Hugging Face Hub")
print("Model loaded")
57
class TranslationRequest(BaseModel):
    """JSON body accepted by the ``/translate`` endpoint."""

    # Sentence to translate; the endpoint feeds it to the English tokenizer.
    text: str
59
+ # ===== Endpoint dịch =====
60
@app.post("/translate")
def translate_api(req: TranslationRequest):
    """Translate ``req.text`` and echo it back alongside the translation.

    Uses the module-level model, tokenizers and English embedding table that
    were loaded at start-up.

    Returns:
        A dict with the original text under ``"input"`` and the decoded
        output under ``"translation"``.
    """
    source_text = req.text
    decode_kwargs = dict(
        model=model,
        src_sentence=source_text,
        tokenizer_src=tokenizer_en,  # English -> input side
        tokenizer_tgt=tokenizer_vi,  # Vietnamese -> output side
        embedding_src=embedding_matrix_en,
        device=device,
        max_len=max_len,
    )
    translated = translate(**decode_kwargs)
    return {"input": source_text, "translation": translated}
main.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from app import app
2
+ import uvicorn
3
+ import os
4
+
5
# Render sets PORT in the environment; fall back to 10000 when it is unset.
port = int(os.getenv("PORT", 10000))

if __name__ == "__main__":
    # Bind to all interfaces so the server is reachable from outside the host.
    uvicorn.run(app, host="0.0.0.0", port=port)
model.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.utils.data import Dataset, DataLoader
6
+ import math
7
+
8
+ # ---------------- Positional Encoding ----------------
9
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Args:
        d_model: Size of the feature (last) dimension. Odd values are
            supported; the last cosine column is simply absent.
        max_len: Number of positions precomputed at construction time.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )  # one frequency per even column: ceil(d_model / 2) entries
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so trim
        # the angle table; for even d_model this slice is the full table.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a (persistent) buffer: it follows .to(device) and is
        # part of state_dict(), which existing checkpoints rely on.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (B, T, D) with ``D == d_model``.
        """
        return x + self.pe[:, :x.size(1)].to(x.device)
23
+
24
+ # ---------------- Transformer (sửa để match training) ----------------
25
class TransformerSeq2Seq(nn.Module):
    """Encoder-decoder Transformer that takes pre-embedded source tokens.

    ``forward(src_embedded, tgt_input_ids, src_attn_mask=None, tgt_attn_mask=None)``:

    - ``src_embedded``: (B, S, E) source embeddings computed by the caller
      (e.g. ``embedding_src[src_ids]``).
    - ``tgt_input_ids``: (B, T) decoder input token ids (BOS .. token_{n-1}).
    - ``src_attn_mask`` / ``tgt_attn_mask``: (B, S) / (B, T), 1 for real
      tokens and 0 for padding.

    Args:
        embed_dim: Model width E.
        vocab_size: Target vocabulary size (output dimension of the logits).
        embedding_decoder: Pretrained decoder embedding weights (array-like
            or ``torch.Tensor``), or ``None`` to learn one from scratch.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        dim_feedforward: Hidden size of each feed-forward sub-layer.
        dropout: Dropout probability inside the Transformer layers.
        freeze_decoder_emb: Whether pretrained decoder embeddings are frozen.
        max_len: Number of positions the positional encoding precomputes.
    """

    def __init__(
        self,
        embed_dim,
        vocab_size,
        embedding_decoder=None,
        num_heads=2,
        num_layers=2,
        dim_feedforward=256,
        dropout=0.1,
        freeze_decoder_emb=True,
        max_len=512,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size

        # One positional encoder is shared by the source and target streams.
        self.pos_encoder = PositionalEncoding(embed_dim, max_len=max_len)

        # Decoder token embedding: pretrained when weights are supplied.
        if embedding_decoder is not None:
            weights = embedding_decoder
            if not isinstance(weights, torch.Tensor):
                weights = torch.tensor(weights, dtype=torch.float)
            self.embedding_decoder = nn.Embedding.from_pretrained(
                weights, freeze=freeze_decoder_emb
            )
        else:
            self.embedding_decoder = nn.Embedding(vocab_size, embed_dim)

        # batch_first=True -> every tensor is laid out as (B, T, E).
        layer_kwargs = dict(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        # NOTE: the prototype layers stay registered as attributes, so their
        # parameters appear in state_dict(); do not drop them without
        # re-exporting existing checkpoints.
        self.encoder_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(**layer_kwargs)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)

        self.output_proj = nn.Linear(embed_dim, vocab_size)

    def forward(self, src_embedded, tgt_input_ids, src_attn_mask=None, tgt_attn_mask=None):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src_embedded: (B, S, E) source embeddings.
            tgt_input_ids: (B, T) decoder input ids.
            src_attn_mask: Optional (B, S) mask, 1 = real token, 0 = pad.
            tgt_attn_mask: Optional (B, T) mask with the same convention.

        Returns:
            Logits of shape (B, T, vocab).
        """
        device = src_embedded.device

        tgt_embedded = self.embedding_decoder(tgt_input_ids)  # (B, T, E)

        src = self.pos_encoder(src_embedded)  # (B, S, E)
        tgt = self.pos_encoder(tgt_embedded)  # (B, T, E)

        # Key-padding masks use the opposite convention of the attention
        # masks: True marks positions to ignore (padding).
        src_pad = None if src_attn_mask is None else (src_attn_mask == 0).to(device)
        tgt_pad = None if tgt_attn_mask is None else (tgt_attn_mask == 0).to(device)

        memory = self.encoder(src, src_key_padding_mask=src_pad)  # (B, S, E)

        # Causal (T x T) mask: -inf strictly above the diagonal blocks
        # attention to future positions.
        steps = tgt.size(1)
        causal = None
        if steps > 0:
            causal = torch.triu(
                torch.full((steps, steps), float('-inf'), device=device),
                diagonal=1,
            )

        hidden = self.decoder(
            tgt,
            memory,
            tgt_mask=causal,
            tgt_key_padding_mask=tgt_pad,
            memory_key_padding_mask=src_pad,
        )  # (B, T, E)

        return self.output_proj(hidden)  # (B, T, vocab)
111
+
112
+ # ---------------- Helpers to apply embedding_src (tensor or nn.Embedding) ----------------
113
def apply_src_embedding(embedding_src, src_ids):
    """Look up source-token embeddings from a module or a raw weight table.

    Args:
        embedding_src: Either an ``nn.Embedding`` (called on the ids) or a
            weight table of shape (vocab_src, embed_dim) given as a
            ``torch.Tensor`` or anything ``torch.tensor`` can convert.
        src_ids: LongTensor of shape (B, S).

    Returns:
        Float tensor of shape (B, S, E). For the table case the lookup is
        done on ``src_ids.device``.
    """
    if isinstance(embedding_src, nn.Embedding):
        return embedding_src(src_ids)

    # Raw table: bring it to the ids' device, then index row-wise.
    target_device = src_ids.device
    if isinstance(embedding_src, torch.Tensor):
        table = embedding_src.to(target_device)
    else:
        table = torch.tensor(embedding_src, dtype=torch.float, device=target_device)
    return table[src_ids]
130
@torch.no_grad()
def translate(model, src_sentence, tokenizer_src, tokenizer_tgt, embedding_src, device, max_len=50):
    """Greedily decode a translation of ``src_sentence``.

    Decoding starts from the target tokenizer's CLS id and stops once the SEP
    id is produced or ``max_len`` tokens have been generated.

    Args:
        model: Seq2seq model called as
            ``model(src_embedded, tgt_ids, src_attn_mask=..., tgt_attn_mask=...)``.
            It is switched to eval mode as a side effect.
        src_sentence: Source text.
        tokenizer_src: Source tokenizer; invoked with ``return_tensors="pt"``.
        tokenizer_tgt: Target tokenizer providing ``cls_token_id``,
            ``sep_token_id`` and ``decode``.
        embedding_src: Source embedding table or ``nn.Embedding``.
        device: Device on which the tensors are placed.
        max_len: Maximum number of generated tokens.

    Returns:
        The decoded target string with special tokens skipped.
    """
    model.eval()

    encoded = tokenizer_src(
        [src_sentence],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    src_ids = encoded["input_ids"].to(device)  # (1, S)
    src_mask = encoded.get("attention_mask", None)
    if src_mask is not None:
        src_mask = src_mask.to(device)

    src_embedded = apply_src_embedding(embedding_src, src_ids)  # (1, S, E)

    generated = [tokenizer_tgt.cls_token_id]
    for _step in range(max_len):
        prefix = torch.tensor([generated], device=device)
        # No target padding mask is needed: there is a single, unpadded
        # prefix and the model builds its own causal mask.
        step_logits = model(src_embedded, prefix, src_attn_mask=src_mask, tgt_attn_mask=None)
        best_id = step_logits[:, -1, :].argmax(dim=-1).item()
        generated.append(best_id)
        if best_id == tokenizer_tgt.sep_token_id:
            break

    return tokenizer_tgt.decode(generated, skip_special_tokens=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ requests
6
+ pydantic
utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel
3
+
4
# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _tokenizer_and_word_embeddings(checkpoint):
    """Return the tokenizer and word-embedding weight of a pretrained checkpoint."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint).to(device)
    return tokenizer, encoder.embeddings.word_embeddings.weight


def load_tokenizers_and_embeddings():
    """Load the Vietnamese and English tokenizers and their embedding tables.

    Returns:
        A dict with keys ``tokenizer_vi``, ``embedding_vi``, ``tokenizer_en``,
        ``embedding_en`` and ``device``. The embedding entries are the
        word-embedding weight matrices of the pretrained models, already
        moved to ``device``.
    """
    # Vietnamese side: PhoBERT.
    vi_tokenizer, vi_weights = _tokenizer_and_word_embeddings("vinai/phobert-base")
    # English side: BERT.
    en_tokenizer, en_weights = _tokenizer_and_word_embeddings(
        "bert-base-cased-finetuned-mrpc"
    )

    return {
        "tokenizer_vi": vi_tokenizer,
        "embedding_vi": vi_weights,
        "tokenizer_en": en_tokenizer,
        "embedding_en": en_weights,
        "device": device,
    }