# SentenceTransformer / tests / test_util.py
# lengocduc195's picture
# pushNe
# 2359bda
from sentence_transformers import util, SentenceTransformer
import unittest
import numpy as np
import sklearn
import torch
class UtilTest(unittest.TestCase):
    """Unit tests for helper functions in ``sentence_transformers.util``.

    Random fixtures are drawn from a seeded generator created in ``setUp`` so
    that a failing run can be reproduced. Checks use ``unittest`` /
    ``numpy.testing`` assertions rather than bare ``assert`` statements, which
    are removed when Python runs with ``-O``.
    """

    def setUp(self):
        """Create a fresh, seeded random generator for each test."""
        self.rng = np.random.RandomState(42)

    def test_normalize_embeddings(self):
        """Tests the correct computation of util.normalize_embeddings"""
        num_embeddings = 50
        embedding_size = 100
        embeddings = torch.tensor(self.rng.randn(num_embeddings, embedding_size))

        normalized = util.normalize_embeddings(embeddings)

        # Normalization must not change the shape of the input.
        self.assertEqual(tuple(normalized.shape), (num_embeddings, embedding_size))

        # Every row must have (approximately) unit L2 norm.
        row_norms = torch.norm(normalized, dim=1)
        max_deviation = (row_norms - 1).abs().max().item()
        self.assertLess(max_deviation, 0.0001)

    def test_pytorch_cos_sim(self):
        """Tests util.pytorch_cos_sim against scikit-learn's cosine_similarity"""
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.metrics.pairwise`` has been loaded.
        from sklearn.metrics.pairwise import cosine_similarity

        a = self.rng.randn(50, 100)
        b = self.rng.randn(50, 100)

        expected = cosine_similarity(a, b)
        actual = util.pytorch_cos_sim(a, b).numpy()

        self.assertEqual(actual.shape, expected.shape)
        np.testing.assert_allclose(actual, expected, rtol=0, atol=0.001)

    def test_semantic_search(self):
        """Tests util.semantic_search function"""
        num_queries = 20
        num_k = 10

        doc_emb = torch.tensor(self.rng.randn(1000, 100))
        q_emb = torch.tensor(self.rng.randn(num_queries, 100))

        # Chunk sizes are deliberately small so the search runs over several
        # query chunks and corpus chunks.
        hits = util.semantic_search(q_emb, doc_emb, top_k=num_k, query_chunk_size=5, corpus_chunk_size=17)

        self.assertEqual(len(hits), num_queries)
        for query_hits in hits:
            self.assertEqual(len(query_hits), num_k)

        # Sanity check of the results against a direct top-k over the full
        # query/corpus cosine similarity matrix.
        cos_scores = util.pytorch_cos_sim(q_emb, doc_emb)
        top_values, top_indices = cos_scores.topk(num_k)
        top_values = top_values.cpu().tolist()
        top_indices = top_indices.cpu().tolist()

        for qid, query_hits in enumerate(hits):
            for rank, hit in enumerate(query_hits):
                where = "query {}, rank {}".format(qid, rank)
                self.assertEqual(hit['corpus_id'], top_indices[qid][rank], where)
                self.assertLess(abs(hit['score'] - top_values[qid][rank]), 0.001, where)

    def test_paraphrase_mining(self):
        """Tests that util.paraphrase_mining only scores expected pairs above 0.5"""
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')
        sentences = [
            "This is a test", "This is a test!",
            "The cat sits on mat", "The cat sits on the mat", "On the mat a cat sits",
            "A man eats pasta", "A woman eats pasta", "A man eats spaghetti",
        ]
        # Index pairs into ``sentences`` that are considered paraphrases.
        expected_pairs = {(0, 1), (2, 3), (2, 4), (3, 4), (5, 6), (5, 7), (6, 7)}

        duplicates = util.paraphrase_mining(model, sentences)

        for score, idx_a, idx_b in duplicates:
            if score > 0.5:
                self.assertIn((idx_a, idx_b), expected_pairs)

    def test_pairwise_scores(self):
        """Tests util.pairwise_cos_sim against scikit-learn's paired cosine distances"""
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.metrics.pairwise`` has been loaded.
        from sklearn.metrics.pairwise import paired_cosine_distances

        a = self.rng.randn(50, 100)
        b = self.rng.randn(50, 100)

        # Pairwise cos: similarity is one minus the paired cosine distance.
        expected = 1 - paired_cosine_distances(a, b)
        actual = util.pairwise_cos_sim(a, b).numpy()

        # Same tolerances as the np.allclose defaults.
        np.testing.assert_allclose(actual, expected, rtol=1e-05, atol=1e-08)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()