|
import torch
|
|
from sentence_transformers import SentenceTransformer
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.decomposition import PCA
|
|
from sklearn.manifold import TSNE
|
|
|
|
|
|
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
|
|
|
|
|
|
# Read the sample sentences, one per line. Each line is stripped of
# surrounding whitespace, and lines that are empty after stripping are dropped.
with open("../data/sample_sentences.txt", "r", encoding="utf-8") as f:
    sentences = [text for text in map(str.strip, f) if text]
|
|
|
|
|
|
# Load the pretrained sentence encoder and place it on the selected device.
model = SentenceTransformer(
    model_name_or_path="all-MiniLM-L6-v2",
    device=device,
)

# Encode all sentences in one call; the result is used below as the input
# matrix for the 2-D projections (presumably one embedding row per sentence).
embeddings = model.encode(sentences)
|
|
|
|
|
|
# Reduce the embeddings to two dimensions with PCA.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings)

# Scatter-plot the projection and label every point with its sentence.
plt.figure(figsize=(8, 6))
# Transposing yields the two coordinate columns as separate x / y sequences.
plt.scatter(*pca_result.T)
for i, txt in enumerate(sentences):
    plt.annotate(txt, xy=tuple(pca_result[i]))
plt.title("Text Embeddings (PCA)")
plt.show()
|
|
|
|
|
|
# Reduce the embeddings to two dimensions with t-SNE.
#
# scikit-learn requires perplexity to be strictly smaller than the number of
# samples, so a fixed value of 5 raises ValueError when there are five or
# fewer sentences. Clamp it to the sample count; with six or more samples the
# value is still 5, so results for larger inputs are unchanged.
tsne_perplexity = min(5, max(1, len(embeddings) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
tsne_result = tsne.fit_transform(embeddings)

# Scatter-plot the projection and label every point with its sentence.
plt.figure(figsize=(8, 6))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1])
for i, txt in enumerate(sentences):
    plt.annotate(txt, (tsne_result[i, 0], tsne_result[i, 1]))
plt.title("Text Embeddings (t-SNE)")
plt.show()
|
|
|