remiai3 committed

Commit 000db39 · verified · 1 Parent(s): 3ef729f

Upload 6 files

document_semantic_search/README.md ADDED
@@ -0,0 +1,24 @@
+ # Document Semantic Search
+
+ This project demonstrates searching documents by **meaning** rather than by keywords, using sentence embeddings.
+
+ ## Install
+ ```bash
+ pip install -r requirements.txt
+ ```
+ ## Example Run
+
+ ```bash
+ $ python search.py
+ Using device: cpu
+
+ # Enter your search query (or 'exit' to quit): landmarks in France
+ # Best match: doc2.txt (score: 0.8143)
+ # Content: The Eiffel Tower is one of the most visited landmarks in Paris, France.
+
+ # Enter your search query (or 'exit' to quit): pets loved by people
+ # Best match: doc3.txt (score: 0.7925)
+ # Content: Cats and dogs are common household pets loved by millions.
+
+ # Enter your search query (or 'exit' to quit): exit
+ ```
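The core idea the README describes (embedding both the query and the documents, then comparing them with cosine similarity) fits in a few lines. A minimal sketch, illustrative only and not one of the uploaded files, assuming the same `all-MiniLM-L6-v2` model that `src/search.py` loads:

```python
from sentence_transformers import SentenceTransformer, util

# Encode documents and query into the same embedding space,
# then rank documents by cosine similarity to the query.
model = SentenceTransformer("all-MiniLM-L6-v2")

docs = [
    "The Eiffel Tower is one of the most visited landmarks in Paris, France.",
    "Cats and dogs are common household pets loved by millions.",
]
doc_emb = model.encode(docs, convert_to_tensor=True)

query_emb = model.encode("landmarks in France", convert_to_tensor=True)
scores = util.cos_sim(query_emb, doc_emb)[0]   # one similarity score per document
best = scores.argmax().item()

print(docs[best], float(scores[best]))
```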
document_semantic_search/data/doc1.txt ADDED
@@ -0,0 +1 @@
+ Artificial intelligence is transforming industries such as healthcare and finance.
document_semantic_search/data/doc2.txt ADDED
@@ -0,0 +1 @@
+ The Eiffel Tower is one of the most visited landmarks in Paris, France.
document_semantic_search/data/doc3.txt ADDED
@@ -0,0 +1 @@
+ Cats and dogs are common household pets loved by millions.
document_semantic_search/requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==2.2.2
+ transformers==4.41.2
+ sentence-transformers==3.0.1
+ scikit-learn==1.5.0
+ numpy==1.26.4
document_semantic_search/src/search.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ import os
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Using device: {device}")
+
+ # Load the embedding model
+ model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
+
+ # Load documents from the data directory next to this script
+ doc_dir = os.path.join(os.path.dirname(__file__), "..", "data")
+ documents = []
+ filenames = []
+ for fname in sorted(os.listdir(doc_dir)):
+     if fname.endswith(".txt"):
+         with open(os.path.join(doc_dir, fname), "r", encoding="utf-8") as f:
+             documents.append(f.read().strip())
+             filenames.append(fname)
+
+ # Encode all documents once up front
+ doc_embeddings = model.encode(documents)
+
+ # Interactive search loop
+ while True:
+     query = input("\nEnter your search query (or 'exit' to quit): ")
+     if query.lower() == "exit":
+         break
+     query_embedding = model.encode(query)
+
+     # Compute cosine similarity between the query and every document
+     scores = util.cos_sim(query_embedding, doc_embeddings)[0]
+     best_match_idx = scores.argmax().item()
+
+     print(f"Best match: {filenames[best_match_idx]} (score: {scores[best_match_idx]:.4f})")
+     print(f"Content: {documents[best_match_idx]}")