import torch
from torch import Tensor
from torch import nn
from typing import Union, Tuple, List, Iterable, Dict
import os
import json
import logging
import numpy as np

from .tokenizer import WhitespaceTokenizer


logger = logging.getLogger(__name__)


class BoW(nn.Module):
    """Implements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    """

    def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True):
        super(BoW, self).__init__()
        vocab = list(set(vocab))  # remove duplicate entries from the vocabulary
        self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency']
        self.vocab = vocab
        self.word_weights = word_weights
        self.unknown_word_weight = unknown_word_weight
        self.cumulative_term_frequency = cumulative_term_frequency

        # Look up a weight for every vocab word; fall back to unknown_word_weight
        # if neither the word nor its lower-cased form has an entry in word_weights.
        self.weights = []
        num_unknown_words = 0
        for word in vocab:
            weight = unknown_word_weight
            if word in word_weights:
                weight = word_weights[word]
            elif word.lower() in word_weights:
                weight = word_weights[word.lower()]
            else:
                num_unknown_words += 1
            self.weights.append(weight)

        logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight))

        self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False)
        self.sentence_embedding_dimension = len(vocab)
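
    # A hedged sketch (not from the original module): for tf-idf style vectors,
    # word_weights can hold per-word idf values computed from a corpus, for example:
    #   doc_freq = collections.Counter(w for doc in corpus for w in set(doc.split()))
    #   idf = {w: math.log(len(corpus) / (1 + doc_freq[w])) for w in doc_freq}
    #   bow = BoW(vocab=list(idf), word_weights=idf)
    # Here corpus, doc_freq and idf are illustrative names only; words missing from
    # word_weights fall back to unknown_word_weight.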

    def forward(self, features: Dict[str, Tensor]):
        # The sentence embedding is already built in get_sentence_features,
        # so forward simply passes the features through unchanged.
        return features

    def tokenize(self, texts: List[str]) -> Dict[str, Tensor]:
        tokenized = [self.tokenizer.tokenize(text) for text in texts]
        return self.get_sentence_features(tokenized)

    def get_sentence_embedding_dimension(self):
        return self.sentence_embedding_dimension

    def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0):
        vectors = []

        for tokens in tokenized_texts:
            vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32)
            for token in tokens:
                if self.cumulative_term_frequency:
                    # Accumulate the weight for every occurrence of the token (tf-like counts)
                    vector[token] += self.weights[token]
                else:
                    # Only mark presence: repeated tokens do not increase the value
                    vector[token] = self.weights[token]
            vectors.append(vector)

        # Stack into a single ndarray first; building a tensor from a list of ndarrays is slow
        return {'sentence_embedding': torch.tensor(np.asarray(vectors), dtype=torch.float)}
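
    # Illustration (not from the original module): with a 3-word vocabulary and all
    # weights equal to 1.0, the token ids [1, 1, 2] produce the vector [0., 2., 1.]
    # when cumulative_term_frequency is True, and [0., 1., 1.] when it is False.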

    def get_config_dict(self):
        return {key: self.__dict__[key] for key in self.config_keys}

    def save(self, output_path):
        with open(os.path.join(output_path, 'config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

    @staticmethod
    def load(input_path):
        with open(os.path.join(input_path, 'config.json')) as fIn:
            config = json.load(fIn)

        return BoW(**config)
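

# Minimal usage sketch (illustrative comments only; the import path and file paths are
# assumptions, since this module is designed to be imported as part of its package):
#
#   from sentence_transformers.models import BoW
#
#   vocab = ["the", "cat", "sat", "on", "mat"]
#   bow = BoW(vocab=vocab)
#   features = bow.tokenize(["the cat sat on the mat", "the mat"])
#   features["sentence_embedding"].shape   # torch.Size([2, 5]) -- one BoW vector per sentence
#
#   bow.save("some/existing/dir")          # writes config.json (the directory must already exist)
#   restored = BoW.load("some/existing/dir")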