mgbam committed on
Commit
6a2b285
·
verified ·
1 Parent(s): 1bc3e18

Update retrieval.py

Browse files
Files changed (1) hide show
  1. retrieval.py +20 -18
retrieval.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  import os
2
  import tempfile
3
  import requests
@@ -15,8 +23,8 @@ PUBMED_API_KEY = os.environ.get("PUBMED_API_KEY", "<YOUR_NCBI_API_KEY>")
15
  #############################################
16
  def fetch_pubmed_abstracts(query: str, max_results: int = 5) -> List[str]:
17
  """
18
- Retrieves PubMed abstracts for a given clinical query using NCBI's E-utilities.
19
- Designed to quickly fetch up to 'max_results' abstracts.
20
  """
21
  search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
22
  params = {
@@ -29,7 +37,6 @@ def fetch_pubmed_abstracts(query: str, max_results: int = 5) -> List[str]:
29
  r = requests.get(search_url, params=params, timeout=10)
30
  r.raise_for_status()
31
  data = r.json()
32
-
33
  pmid_list = data["esearchresult"].get("idlist", [])
34
  abstracts = []
35
  fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
@@ -53,8 +60,7 @@ def fetch_pubmed_abstracts(query: str, max_results: int = 5) -> List[str]:
53
  #############################################
54
  class EmbedFunction:
55
  """
56
- Uses a Hugging Face embedding model to generate embeddings for a list of strings.
57
- This function is crucial for indexing abstracts for similarity search.
58
  """
59
  def __init__(self, model_name: str):
60
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -73,15 +79,15 @@ class EmbedFunction:
73
  )
74
  with torch.no_grad():
75
  outputs = self.model(**tokenized, output_hidden_states=True)
 
76
  last_hidden = outputs.hidden_states[-1]
77
  pooled = last_hidden.mean(dim=1)
78
- embeddings = pooled.cpu().tolist()
79
- return embeddings
80
 
81
  EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
82
  embed_function = EmbedFunction(EMBED_MODEL_NAME)
83
 
84
- # Use a temporary directory for persistent storage to ensure a fresh initialization.
85
  temp_dir = tempfile.mkdtemp()
86
  print("Using temporary persist_directory:", temp_dir)
87
 
@@ -92,13 +98,13 @@ client = chromadb.Client(
92
  )
93
  )
94
 
95
- # Create or retrieve the collection for medical abstracts.
96
  collection = client.get_or_create_collection(
97
  name="ai_medical_knowledge",
98
  embedding_function=embed_function
99
  )
100
 
101
- # Optional: Force initialization with a dummy document to ensure the schema is set up.
102
  try:
103
  collection.add(documents=["dummy"], ids=["dummy"])
104
  _ = collection.query(query_texts=["dummy"], n_results=1)
@@ -108,8 +114,7 @@ except Exception as init_err:
108
 
109
  def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
110
  """
111
- Indexes PubMed abstracts into the Chroma vector store.
112
- Each document is assigned a unique ID based on the query prefix.
113
  """
114
  for i, doc in enumerate(docs):
115
  if doc.strip():
@@ -123,8 +128,7 @@ def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
123
 
124
  def query_similar_docs(query: str, top_k: int = 3) -> List[str]:
125
  """
126
- Searches the indexed abstracts for those most similar to the given query.
127
- Returns the top 'top_k' documents.
128
  """
129
  results = collection.query(query_texts=[query], n_results=top_k)
130
  return results["documents"][0] if results and results["documents"] else []
@@ -135,11 +139,9 @@ def query_similar_docs(query: str, top_k: int = 3) -> List[str]:
135
  def get_relevant_pubmed_docs(user_query: str) -> List[str]:
136
  """
137
  Complete retrieval pipeline:
138
- 1. Fetch PubMed abstracts for the query.
139
- 2. Index the abstracts into the vector store.
140
  3. Retrieve and return the most similar documents.
141
-
142
- Designed for clinicians to quickly access relevant literature.
143
  """
144
  new_abstracts = fetch_pubmed_abstracts(user_query, max_results=5)
145
  if not new_abstracts:
 
1
+ """
2
+ retrieval.py
3
+ ------------
4
+ This module handles retrieval of PubMed abstracts and indexing via Chromadb.
5
+ It fetches abstracts using NCBI's E-utilities and indexes them in a vector store
6
+ to enable similarity search for clinical queries.
7
+ """
8
+
9
  import os
10
  import tempfile
11
  import requests
 
23
  #############################################
24
  def fetch_pubmed_abstracts(query: str, max_results: int = 5) -> List[str]:
25
  """
26
+ Retrieves PubMed abstracts for the given clinical query.
27
+ Returns a list of abstract texts.
28
  """
29
  search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
30
  params = {
 
37
  r = requests.get(search_url, params=params, timeout=10)
38
  r.raise_for_status()
39
  data = r.json()
 
40
  pmid_list = data["esearchresult"].get("idlist", [])
41
  abstracts = []
42
  fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
 
60
  #############################################
61
  class EmbedFunction:
62
  """
63
+ Uses a Hugging Face embedding model to generate embeddings for clinical texts.
 
64
  """
65
  def __init__(self, model_name: str):
66
  self.tokenizer = AutoTokenizer.from_pretrained(model_name)
 
79
  )
80
  with torch.no_grad():
81
  outputs = self.model(**tokenized, output_hidden_states=True)
82
+ # Mean-pooling over the last hidden state.
83
  last_hidden = outputs.hidden_states[-1]
84
  pooled = last_hidden.mean(dim=1)
85
+ return pooled.cpu().tolist()
 
86
 
87
  EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
88
  embed_function = EmbedFunction(EMBED_MODEL_NAME)
89
 
90
+ # Create a temporary directory for the Chromadb persistent storage.
91
  temp_dir = tempfile.mkdtemp()
92
  print("Using temporary persist_directory:", temp_dir)
93
 
 
98
  )
99
  )
100
 
101
+ # Create or retrieve the collection for clinical abstracts.
102
  collection = client.get_or_create_collection(
103
  name="ai_medical_knowledge",
104
  embedding_function=embed_function
105
  )
106
 
107
+ # Force initialization with a dummy document.
108
  try:
109
  collection.add(documents=["dummy"], ids=["dummy"])
110
  _ = collection.query(query_texts=["dummy"], n_results=1)
 
114
 
115
  def index_pubmed_docs(docs: List[str], prefix: str = "doc"):
116
  """
117
+ Indexes the retrieved PubMed abstracts into the Chromadb vector store.
 
118
  """
119
  for i, doc in enumerate(docs):
120
  if doc.strip():
 
128
 
129
  def query_similar_docs(query: str, top_k: int = 3) -> List[str]:
130
  """
131
+ Performs a similarity search on the indexed abstracts and returns the top relevant documents.
 
132
  """
133
  results = collection.query(query_texts=[query], n_results=top_k)
134
  return results["documents"][0] if results and results["documents"] else []
 
139
  def get_relevant_pubmed_docs(user_query: str) -> List[str]:
140
  """
141
  Complete retrieval pipeline:
142
+ 1. Fetch PubMed abstracts.
143
+ 2. Index them into the vector store.
144
  3. Retrieve and return the most similar documents.
 
 
145
  """
146
  new_abstracts = fetch_pubmed_abstracts(user_query, max_results=5)
147
  if not new_abstracts: