File size: 634 Bytes
b7f6f88
 
 
 
 
 
1226569
b7f6f88
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# feature_extraction/embedding_extractor.py

from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# Load the pretrained DistilBERT tokenizer and encoder once, at import time.
# No device is chosen explicitly, so the weights stay wherever
# from_pretrained() places them (the original note says CPU).
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# eval() switches off training-only behaviour such as dropout and returns the
# module itself, so `model` is still bound to the DistilBertModel instance.
model = DistilBertModel.from_pretrained(model_name).eval()

def get_bert_embedding(text):
    """Return the DistilBERT first-token ([CLS]) embedding of ``text``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and run through the module-level ``model`` with gradient
    tracking disabled.

    Args:
        text: Input passed straight to the tokenizer. The original code
            documents the single-string case.

    Returns:
        A NumPy array holding the hidden state at sequence position 0 with
        all size-1 axes removed. For one input string this is the
        ``(1, 768)`` slice squeezed down to ``(768,)``.
    """
    # Tokenization does not involve autograd, so it can sit outside no_grad.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Position 0 along the sequence axis is the first ([CLS]) token.
        first_token = hidden_states[:, 0, :]

    return first_token.squeeze().numpy()