File size: 932 Bytes
f595d09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_tokenizers_and_embeddings():
    # ===== Vietnamese PhoBERT =====
    tokenizer_vi = AutoTokenizer.from_pretrained("vinai/phobert-base")
    model_vi = AutoModel.from_pretrained("vinai/phobert-base").to(device)
    embedding_matrix_vi = model_vi.embeddings.word_embeddings.weight

    # ===== English BERT =====
    tokenizer_en = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    model_en = AutoModel.from_pretrained("bert-base-cased-finetuned-mrpc").to(device)
    embedding_matrix_en = model_en.embeddings.word_embeddings.weight
    
    return {
        "tokenizer_vi": tokenizer_vi,
        "embedding_vi": embedding_matrix_vi,
        "tokenizer_en": tokenizer_en,
        "embedding_en": embedding_matrix_en,
        "device": device
    }