|
""" |
|
This is a simple application of sentence embeddings: clustering.

Sentences are mapped to sentence embeddings and then k-means clustering is applied.
|
""" |
|
from sentence_transformers import SentenceTransformer |
|
from sklearn.cluster import KMeans |
|
|
|
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Example sentences to be clustered.
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "A man is eating pasta.",
    "The girl is carrying a baby.",
    "The baby is carried by the woman",
    "A man is riding a horse.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah is running behind its prey.",
    "A cheetah chases prey on across a field.",
]

# Map the sentences to embeddings; these are the input to k-means below.
corpus_embeddings = embedder.encode(corpus)
|
|
|
|
|
# Number of clusters the corpus is partitioned into.
num_clusters = 5

# Fit k-means on the sentence embeddings.
# n_init and random_state are pinned explicitly:
#   * n_init: the library default changed from 10 to 'auto' in
#     scikit-learn 1.4 (with a FutureWarning in 1.2/1.3 when left unset),
#     so leaving it implicit makes the number of restarts depend on the
#     installed version. 10 keeps the historical behaviour.
#   * random_state: without a seed the centroid initialisation is random,
#     so the printed cluster numbering/membership can differ between runs.
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
clustering_model.fit(corpus_embeddings)

# labels_ holds the cluster index assigned to each fitted sample, in the
# same order as the rows passed to fit() (i.e. the order of `corpus`).
cluster_assignment = clustering_model.labels_
|
|
|
# Bucket the sentences by cluster: slot k collects every sentence whose
# assigned label is k. One empty bucket is created per cluster up front.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(corpus, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)
|
|
|
# Print each cluster under a 1-based heading, followed by an empty line.
for cluster_number, members in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_number)
    print(members)
    print("")
|
|