VatsalPatel18 committed
Commit 2f08951 · 1 Parent(s): a39a4c5

Force phi-3 to CPU and fall back to TinyLlama

Files changed (3):
  1. NOTE.md +1 -0
  2. README.md +1 -0
  3. app.py +30 -9
NOTE.md CHANGED
@@ -7,6 +7,7 @@
 - Streaming answers; prompt forces context-grounded responses.
 - Uses `@spaces.GPU()` on heavy steps; caches/models under `/data/.cache` (`HF_HOME`).
 - No reranker; straight FAISS retrieval.
+- ZeroGPU safeguard: Phi-3-mini-4k forced to CPU and will auto-fall back to TinyLlama if CUDA init is blocked in stateless GPU runtime.
 - Dependencies captured in `requirements.txt`; Spaces front matter in `README.md`.
 - Pushed to HF Space `VatsalPatel18/MedDisover-space` (commit f6d2d7d).
 - To do next when resuming:
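For context, the `@spaces.GPU()` / `HF_HOME` pattern the note refers to looks roughly like the sketch below; the function name and body are illustrative, not taken from app.py:

```python
import os

# Keep model and dataset caches on the Space's persistent volume, per the note.
os.environ.setdefault("HF_HOME", "/data/.cache")

import spaces  # ZeroGPU helper available on Hugging Face Spaces


@spaces.GPU()  # a GPU slice is attached only while this function runs
def embed_chunks(chunks):
    # Hypothetical heavy step: encode chunks with the MedCPT encoder.
    ...
```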
README.md CHANGED
@@ -19,3 +19,4 @@ A Hugging Face Spaces-ready Gradio app that runs MedDiscover-style RAG (MedCPT e
 - No external API keys required; optionally use `HF_TOKEN` secret if models need auth.
 - Chunking: 500 words with 50-word overlap; MedCPT encoder for embeddings; FAISS IP index.
 - Generation is context-grounded; drop-down selects generator model.
+- Phi-3-mini-4k is forced to CPU on Spaces; if loading fails (stateless GPU), the app auto-falls back to TinyLlama.
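The chunking bullet above maps to a plain sliding window over whitespace-split words. A minimal sketch, assuming a helper along these lines (the name `chunk_words` and the exact splitting details are not taken from app.py):

```python
def chunk_words(text: str, size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into windows of `size` words, each overlapping the previous by `overlap` words."""
    words = text.split()
    if not words:
        return []
    step = size - overlap
    return [" ".join(words[i:i + size]) for i in range(0, max(len(words) - overlap, 1), step)]
```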
app.py CHANGED
@@ -185,14 +185,27 @@ def search(index: faiss.IndexFlatIP, meta: List[Dict], query_vec: np.ndarray, k:
 # Model registry for generators
 # ----------------------------
 class GeneratorWrapper:
-    def __init__(self, name: str, load_fn):
+    def __init__(self, name: str, load_fn, fallback=None, fallback_msg: str | None = None):
         self.name = name
         self._load_fn = load_fn
         self._pipe = None
+        self._fallback = fallback
+        self._fallback_msg = fallback_msg
+        self._note = None
 
     def ensure(self):
         if self._pipe is None:
-            self._pipe = self._load_fn()
+            try:
+                self._pipe = self._load_fn()
+                self._note = None
+            except Exception as exc:
+                print(f"[Generator:{self.name}] load failed: {exc}")
+                if self._fallback:
+                    print(f"[Generator:{self.name}] falling back to {self._fallback.name}")
+                    self._pipe = self._fallback.ensure()
+                    self._note = self._fallback_msg or f"Falling back to {self._fallback.name}."
+                else:
+                    raise
         return self._pipe
 
     def generate_stream(self, prompt: str, max_new_tokens: int, temperature: float, top_p: float):
@@ -208,6 +221,9 @@ class GeneratorWrapper:
         }
         thread = Thread(target=pipe, args=(prompt,), kwargs=kwargs)
         thread.start()
+        if self._note:
+            yield self._note + " "
+            self._note = None
         for token in streamer:
             yield token
 
@@ -227,21 +243,26 @@ def load_tinyllama():
 
 
 def load_phi3_mini():
-    use_cuda = torch.cuda.is_available()
-    device_map = "cuda" if use_cuda else "cpu"
-    dtype = torch.float16 if use_cuda else torch.float32
     return pipeline(
         "text-generation",
         model="microsoft/Phi-3-mini-4k-instruct",
-        device_map=device_map,
-        torch_dtype=dtype,
+        device_map="cpu",
+        torch_dtype=torch.float32,
         trust_remote_code=True,
     )
 
 
+_tiny_wrapper = GeneratorWrapper("tinyllama-1.1b-chat", load_tinyllama)
+_phi_wrapper = GeneratorWrapper(
+    "phi-3-mini-4k",
+    load_phi3_mini,
+    fallback=_tiny_wrapper,
+    fallback_msg="Phi-3-mini-4k unavailable on this Space (CUDA blocked); falling back to TinyLlama CPU.",
+)
+
 GENERATORS = {
-    "tinyllama-1.1b-chat": GeneratorWrapper("tinyllama-1.1b-chat", load_tinyllama),
-    "phi-3-mini-4k": GeneratorWrapper("phi-3-mini-4k", load_phi3_mini),
+    "tinyllama-1.1b-chat": _tiny_wrapper,
+    "phi-3-mini-4k": _phi_wrapper,
 }
 
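With this change a caller never has to know whether Phi-3 actually loaded. For illustration, selecting the Phi-3 entry behaves as sketched below; only `GENERATORS`, `ensure`, and `generate_stream` come from app.py, while the prompt and sampling values are arbitrary examples:

```python
gen = GENERATORS["phi-3-mini-4k"]

# ensure() tries to load Phi-3-mini-4k on CPU; if the load raises (e.g. CUDA init
# blocked on stateless ZeroGPU), it reuses the shared TinyLlama wrapper instead
# and remembers a one-off notice to prepend to the next stream.
pipe = gen.ensure()

for token in gen.generate_stream(
    "Summarize the retrieved context.",  # example prompt
    max_new_tokens=64,
    temperature=0.2,
    top_p=0.9,
):
    print(token, end="", flush=True)
```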