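# Spaces: Running / Running
# (The line above appears to be status text captured from the hosting page, not part of the module.)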
import random

import numpy as np
import torch
# Set to True to request the first CUDA device; tensors stay on the CPU otherwise.
gpu_using = False

# Device that tokenized inputs are moved to. CUDA is used only when it is both
# requested and actually available, so a GPU-less host falls back to the CPU
# instead of failing later at the first `.to(DEVICE)` call.
DEVICE = torch.device(
    "cuda:0" if gpu_using and torch.cuda.is_available() else "cpu"
)

# String tags used as labels elsewhere in the project.
# NOTE(review): presumably "human-written text" / "machine-generated text" — confirm.
HWT = "HWT"
MGT = "MGT"
def init_random_seeds(seed=0):
    """Seed every random number generator this module relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and CUDA), then switches cuDNN to deterministic mode with
    auto-tuning disabled so repeated runs pick the same kernels.

    Args:
        seed: Integer seed applied to all generators. Defaults to 0, which
            is the value that used to be hard-coded.
    """
    print("Init random seeds")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA seeding calls are silently ignored when CUDA is unavailable,
    # so they are safe to issue unconditionally.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Benchmark mode lets cuDNN choose algorithms per run; turn it off and
    # request deterministic algorithms for reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
class FeatureExtractor:
    """Convert raw text into feature tensors using a wrapped encoder.

    The ``model`` object must expose a ``tokenizer`` callable and a ``model``
    callable. The optional ``net`` object must expose a ``net`` callable that
    is applied on top of the masked hidden states.

    Args:
        model: Wrapper providing ``tokenizer`` and ``model``.
        net: Optional wrapper providing ``net``; when ``None`` the masked
            hidden states are returned unchanged.
        max_length: Length every input is padded/truncated to by the
            tokenizer. Defaults to 100, the previously hard-coded value.
    """

    # Class-level default so instances built without ``__init__`` (or by a
    # subclass that overrides it) still tokenize with the original length.
    max_length = 100

    def __init__(self, model, net=None, max_length=100):
        self.model = model  # TODO: support different models
        self.net = net
        self.max_length = max_length

    def process(self, text, net_required=True):
        """Extract the feature tensor for a single text.

        Args:
            text: The input string; it is tokenized as a batch of one.
            net_required: When true and a ``net`` was supplied, pass the
                masked hidden states through ``self.net.net``.

        Returns:
            The output of ``self.net.net`` when it is applied, otherwise the
            encoder's ``last_hidden_state`` multiplied by the attention mask.
        """
        # Tokenize to a fixed length so every text yields same-sized tensors.
        tokens = self.model.tokenizer(
            [text],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(DEVICE)

        # Run the encoder on the tokenized batch.
        outputs = self.model.model(**tokens)

        # Add a trailing axis to the mask so it broadcasts over the hidden
        # dimension; multiplying zeroes the positions whose mask is 0
        # (the padding tokens).
        attention_mask = tokens["attention_mask"].unsqueeze(-1)
        hidden_states_masked = outputs.last_hidden_state * attention_mask

        if net_required and self.net is not None:
            return self.net.net(hidden_states_masked)
        return hidden_states_masked

    def process_sents(self, sents, net_required=True):
        """Extract features for several texts and stack them along dim 0.

        Each text is processed independently via :meth:`process`.

        Args:
            sents: Iterable of input strings. Must be non-empty, because
                ``torch.cat`` cannot concatenate an empty list.
            net_required: Forwarded to :meth:`process` for every text.

        Returns:
            The per-text results concatenated along dimension 0.
        """
        features = [self.process(sent, net_required) for sent in sents]
        return torch.cat(features, dim=0)