Spaces:

mgbam
/

Medic

Running

mgbam committed 6 days ago

Commit

d183895

verified ·

1 Parent(s): b61851e

Update retrieval.py

Files changed (1) hide show

retrieval.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import requests
 import torch
 from typing import List
@@ -79,10 +80,13 @@ class EmbedFunction:
 EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 embed_function = EmbedFunction(EMBED_MODEL_NAME)
-# Use ephemeral (in-memory) storage by setting persist_directory to ":memory:".
 client = chromadb.Client(
     settings=Settings(
-        persist_directory=":memory:",
         anonymized_telemetry=False
     )
 )
@@ -95,7 +99,7 @@ collection = client.get_or_create_collection(
 def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
     """
-    Adds documents to the Chroma collection with unique IDs.
     """
     for i, doc in enumerate(docs):
         if doc.strip():
@@ -109,7 +113,7 @@ def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
 def query_similar_docs(query: str, top_k: int = 3) -> List[str]:
     """
-    Retrieves the top_k similar documents from Chroma based on embedding similarity.
     """
     results = collection.query(query_texts=[query], n_results=top_k)
     return results["documents"][0] if results and results["documents"] else []
@@ -121,7 +125,7 @@ def get_relevant_pubmed_docs(user_query: str) -> List[str]:
     """
     End-to-end pipeline:
       1. Fetch PubMed abstracts for the query.
-      2. Index them in Chroma.
       3. Retrieve the top relevant documents.
     """
     new_abstracts = fetch_pubmed_abstracts(user_query, max_results=5)

 import os
+import tempfile
 import requests
 import torch
 from typing import List
 EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 embed_function = EmbedFunction(EMBED_MODEL_NAME)
+# Use a fresh temporary directory as the persist_directory; a new one is created on every start, so the index is not reused across runs.
+temp_dir = tempfile.mkdtemp()
+print("Using temporary persist_directory:", temp_dir)
 client = chromadb.Client(
     settings=Settings(
+        persist_directory=temp_dir,
         anonymized_telemetry=False
     )
 )
 def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
     """
+    Adds documents to the Chromadb collection with unique IDs.
     """
     for i, doc in enumerate(docs):
         if doc.strip():
 def query_similar_docs(query: str, top_k: int = 3) -> List[str]:
     """
+    Retrieves the top_k similar documents from Chromadb based on embedding similarity.
     """
     results = collection.query(query_texts=[query], n_results=top_k)
     return results["documents"][0] if results and results["documents"] else []
     """
     End-to-end pipeline:
       1. Fetch PubMed abstracts for the query.
+      2. Index them in Chromadb.
       3. Retrieve the top relevant documents.
     """
     new_abstracts = fetch_pubmed_abstracts(user_query, max_results=5)