"""
This is a simple application for sentence embeddings: semantic search.

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentences in this corpus.

This script outputs, for various queries, the top 5 most similar sentences
in the corpus (or all of them, if the corpus holds fewer than 5 sentences).
"""
from sentence_transformers import SentenceTransformer, util
import torch

# The same model embeds both the corpus and the queries so that their
# vectors are comparable.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]
# Encode the corpus once, up front; every query is compared against these.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences.
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

# Never ask for more results than there are corpus sentences.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Only one query is scored per iteration, so take the first (and only)
    # row of the similarity result: one score per corpus sentence.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of results actually requested instead of a fixed "5".
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    # top_results[0] holds the scores, top_results[1] the matching corpus
    # indices.
    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    # Alternatively, util.semantic_search can be used to perform the cosine
    # similarity + top-k selection in a single call:
    #
    # hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    # hits = hits[0]  # Get the hits for the first query
    # for hit in hits:
    #     print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))
|