import torch
from torch import Tensor
from torch import nn
from typing import Union, Tuple, List, Iterable, Dict
import os
import json
import logging
import numpy as np

from .tokenizer import WhitespaceTokenizer


logger = logging.getLogger(__name__)


class BoW(nn.Module):
    """Implements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    """

    def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True):
        super(BoW, self).__init__()
        vocab = list(set(vocab))  # remove duplicate entries from the vocabulary
        self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency']
        self.vocab = vocab
        self.word_weights = word_weights
        self.unknown_word_weight = unknown_word_weight
        self.cumulative_term_frequency = cumulative_term_frequency

        # Look up a weight for every vocab word; fall back to unknown_word_weight
        # if neither the word nor its lower-cased form has an entry in word_weights.
        self.weights = []
        num_unknown_words = 0
        for word in vocab:
            weight = unknown_word_weight
            if word in word_weights:
                weight = word_weights[word]
            elif word.lower() in word_weights:
                weight = word_weights[word.lower()]
            else:
                num_unknown_words += 1
            self.weights.append(weight)

        logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight))

        self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False)
        self.sentence_embedding_dimension = len(vocab)
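
    # A hedged sketch (not from the original module): for tf-idf style vectors,
    # word_weights can hold per-word idf values computed from a corpus, for example:
    #   doc_freq = collections.Counter(w for doc in corpus for w in set(doc.split()))
    #   idf = {w: math.log(len(corpus) / (1 + doc_freq[w])) for w in doc_freq}
    #   bow = BoW(vocab=list(idf), word_weights=idf)
    # Here corpus, doc_freq and idf are illustrative names only; words missing from
    # word_weights fall back to unknown_word_weight.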

    def forward(self, features: Dict[str, Tensor]):
        # The sentence embedding is already built in get_sentence_features,
        # so forward simply passes the features through unchanged.
        return features

    def tokenize(self, texts: List[str]) -> Dict[str, Tensor]:
        tokenized = [self.tokenizer.tokenize(text) for text in texts]
        return self.get_sentence_features(tokenized)

    def get_sentence_embedding_dimension(self):
        return self.sentence_embedding_dimension

    def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0):
        vectors = []

        for tokens in tokenized_texts:
            vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32)
            for token in tokens:
                if self.cumulative_term_frequency:
                    # Accumulate the weight for every occurrence of the token (tf-like counts)
                    vector[token] += self.weights[token]
                else:
                    # Only mark presence: repeated tokens do not increase the value
                    vector[token] = self.weights[token]
            vectors.append(vector)

        # Stack into a single ndarray first; building a tensor from a list of ndarrays is slow
        return {'sentence_embedding': torch.tensor(np.asarray(vectors), dtype=torch.float)}
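
    # Illustration (not from the original module): with a 3-word vocabulary and all
    # weights equal to 1.0, the token ids [1, 1, 2] produce the vector [0., 2., 1.]
    # when cumulative_term_frequency is True, and [0., 1., 1.] when it is False.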

    def get_config_dict(self):
        return {key: self.__dict__[key] for key in self.config_keys}

    def save(self, output_path):
        with open(os.path.join(output_path, 'config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)

    @staticmethod
    def load(input_path):
        with open(os.path.join(input_path, 'config.json')) as fIn:
            config = json.load(fIn)

        return BoW(**config)
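

# Minimal usage sketch (illustrative comments only; the import path and file paths are
# assumptions, since this module is designed to be imported as part of its package):
#
#   from sentence_transformers.models import BoW
#
#   vocab = ["the", "cat", "sat", "on", "mat"]
#   bow = BoW(vocab=vocab)
#   features = bow.tokenize(["the cat sat on the mat", "the mat"])
#   features["sentence_embedding"].shape   # torch.Size([2, 5]) -- one BoW vector per sentence
#
#   bow.save("some/existing/dir")          # writes config.json (the directory must already exist)
#   restored = BoW.load("some/existing/dir")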