Upload 6 files

Files changed (6) hide show

document_semantic_search/README.md ADDED Viewed

+# Document Semantic Search
+This project demonstrates searching documents by **meaning** rather than keywords using embeddings.
+## Install
+```bash
+pip install -r requirements.txt
+```
+## Example Run
+```bash
+$ cd src
+$ python search.py
+Using device: cpu
+# Enter your search query (or 'exit' to quit): landmarks in France
+# Best match: doc2.txt (score: 0.8143)
+# Content: The Eiffel Tower is one of the most visited landmarks in Paris, France.
+# Enter your search query (or 'exit' to quit): pets loved by people
+# Best match: doc3.txt (score: 0.7925)
+# Content: Cats and dogs are common household pets loved by millions.
+# Enter your search query (or 'exit' to quit): exit
+```

document_semantic_search/data/doc1.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Artificial intelligence is transforming industries such as healthcare and finance.

document_semantic_search/data/doc2.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ The Eiffel Tower is one of the most visited landmarks in Paris, France.

document_semantic_search/data/doc3.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Cats and dogs are common household pets loved by millions.

document_semantic_search/requirements.txt ADDED Viewed

+torch==2.2.2
+transformers==4.41.2
+sentence-transformers==3.0.1
+scikit-learn==1.5.0
+numpy==1.26.4

document_semantic_search/src/search.py ADDED Viewed

+import torch
+from sentence_transformers import SentenceTransformer, util
+import os
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+# Load model
+model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
+# Load documents
+doc_dir = "../data"
+documents = []
+filenames = []
+for fname in os.listdir(doc_dir):
+    if fname.endswith(".txt"):
+        with open(os.path.join(doc_dir, fname), "r", encoding="utf-8") as f:
+            documents.append(f.read().strip())
+            filenames.append(fname)
+# Encode documents
+doc_embeddings = model.encode(documents)
+# Search loop
+while True:
+    query = input("\nEnter your search query (or 'exit' to quit): ")
+    if query.lower() == "exit":
+        break
+    query_embedding = model.encode(query)
+    # Compute cosine similarity
+    scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+    best_match_idx = scores.argmax().item()
+    print(f"Best match: {filenames[best_match_idx]} (score: {scores[best_match_idx]:.4f})")
+    print(f"Content: {documents[best_match_idx]}")