"""Visualize sentence embeddings in 2-D with PCA and t-SNE.

Reads one sentence per line from ``DATA_PATH``, embeds the sentences with a
SentenceTransformer model, and shows two annotated scatter plots: a PCA
projection followed by a t-SNE projection.
"""

import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# NOTE: a relative path, so it is resolved against the current working
# directory of the process, not against the location of this file.
DATA_PATH = "../data/sample_sentences.txt"
MODEL_NAME = 'all-MiniLM-L6-v2'

# Preferred t-SNE perplexity; lowered automatically for very small inputs.
TSNE_PERPLEXITY = 5

# A 2-component projection needs at least two samples to fit.
MIN_SENTENCES = 2


def load_sentences(path):
    """Return the non-blank lines of the UTF-8 text file *path*, stripped.

    Args:
        path: Path of the text file to read, one sentence per line.

    Returns:
        A list of stripped, non-empty lines in file order.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def plot_projection(points, labels, title):
    """Show a scatter plot of 2-D *points*, annotating each with its label.

    Args:
        points: Array indexed as ``points[i, 0]`` / ``points[i, 1]`` holding
            the x and y coordinate of sample ``i``.
        labels: Sequence of strings; ``labels[i]`` annotates ``points[i]``.
        title: Title of the figure.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1])
    for i, txt in enumerate(labels):
        plt.annotate(txt, (points[i, 0], points[i, 1]))
    plt.title(title)
    plt.show()


def main():
    """Embed the sample sentences and display their PCA and t-SNE plots.

    Raises:
        SystemExit: If the data file holds fewer than ``MIN_SENTENCES``
            non-blank lines, since the projections cannot be fitted then.
    """
    # Detect device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load dataset
    sentences = load_sentences(DATA_PATH)
    if len(sentences) < MIN_SENTENCES:
        # Fail early with a clear message instead of an opaque error raised
        # from inside the encoder or the projection code.
        raise SystemExit(
            f"Need at least {MIN_SENTENCES} non-empty sentences in "
            f"{DATA_PATH}, found {len(sentences)}."
        )

    # Load embedding model
    model = SentenceTransformer(MODEL_NAME, device=device)

    # Create embeddings
    embeddings = model.encode(sentences)

    # PCA Visualization
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(embeddings)
    plot_projection(pca_result, sentences, "Text Embeddings (PCA)")

    # t-SNE Visualization
    # scikit-learn requires perplexity to be strictly less than the number
    # of samples, so clamp it for small sentence files.
    perplexity = min(TSNE_PERPLEXITY, len(sentences) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_result = tsne.fit_transform(embeddings)
    plot_projection(tsne_result, sentences, "Text Embeddings (t-SNE)")


if __name__ == "__main__":
    main()