from typing import List, Tuple

import torch
import torch.nn.functional as F
from transformers import AutoConfig, AutoTokenizer
from model import Easyrec


def load_model(model_path: str) -> Tuple[Easyrec, AutoTokenizer]:
    """

    Load the pre-trained model and tokenizer from the specified path.



    Args:

        model_path: The path to the pre-trained huggingface model or local directory.



    Returns:

        tuple: A tuple containing the model and tokenizer.

    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config = AutoConfig.from_pretrained(model_path)
    model = Easyrec.from_pretrained(model_path, config=config).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

    return model, tokenizer


def compute_embeddings(
        sentences: List[str],
        model: Easyrec,
        tokenizer: AutoTokenizer,
        batch_size: int = 8) -> torch.Tensor:
    """
    Compute embeddings for a list of sentences using the specified model and tokenizer.

    Args:
        sentences: A list of sentences for which to compute embeddings.
        model: The pre-trained model used for generating embeddings.
        tokenizer: The tokenizer used to preprocess the sentences.
        batch_size: The number of sentences to process in each batch (default is 8).

    Returns:
        torch.Tensor: A tensor containing the normalized embeddings for the input sentences.
    """

    embeddings = []
    count_sentences = len(sentences)
    device = next(model.parameters()).device  # Get the device on which the model is located

    for start in range(0, count_sentences, batch_size):
        end = start + batch_size
        batch_sentences = sentences[start:end]
        
        inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {key: val.to(device) for key, val in inputs.items()} # Move input tensors to the same device as the model
        
        with torch.inference_mode():
            outputs = model.encode(inputs['input_ids'], inputs['attention_mask'])
            batch_embeddings = F.normalize(outputs.pooler_output.detach().float(), dim=-1)
            
            embeddings.append(batch_embeddings.cpu())
    
    return torch.cat(embeddings, dim=0) # Concatenate all computed embeddings into a single tensor
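

# Example usage (a minimal sketch): load a checkpoint and embed a few sentences.
# The model path below is a placeholder; substitute the actual Easyrec checkpoint
# (a Hugging Face repo id or a local directory containing the weights).
if __name__ == '__main__':
    model, tokenizer = load_model('path/to/easyrec-checkpoint')  # placeholder path

    sentences = [
        'A user who enjoys science-fiction movies and video games.',
        'This item is a classic space-opera novel.',
    ]
    embeddings = compute_embeddings(sentences, model, tokenizer, batch_size=2)

    # The embeddings are L2-normalized, so a dot product gives cosine similarity.
    similarity = embeddings[0] @ embeddings[1]
    print(f'Embedding shape: {tuple(embeddings.shape)}, similarity: {similarity.item():.4f}')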