MANOJSEQ committed on
Commit
d62f608
·
verified ·
1 Parent(s): 94e738e

Upload 2 files

Browse files
Files changed (2) hide show
  1. Dockerfile +12 -15
  2. main.py +15 -3
Dockerfile CHANGED
@@ -4,10 +4,11 @@ FROM python:3.10-slim
4
  ENV PYTHONUNBUFFERED=1 \
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
7
- # HF Spaces will set PORT; keep a default
8
  PORT=7860 \
9
- # Ensure NLTK looks here (writable in container)
10
- NLTK_DATA=/app/nltk_data
 
 
11
 
12
  # (optional) handy tools for healthchecks & logs
13
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
@@ -18,10 +19,8 @@ WORKDIR /app
18
  # ---- Python deps ----
19
  COPY requirements.txt ./
20
  RUN python -m pip install --upgrade pip && \
21
- # Install CPU-only PyTorch first (lighter & reliable on Spaces)
22
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
23
  pip install -r requirements.txt && \
24
- # Optional but recommended so local OPUS translation can work
25
  pip install sentencepiece
26
 
27
  # ---- App code ----
@@ -35,7 +34,7 @@ SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
35
  print("βœ… SBERT model cached")
36
  PY
37
 
38
- # 2) Cache NLTK VADER lexicon into /app/nltk_data (writable path)
39
  RUN python - <<'PY'
40
  import os, nltk
41
  os.makedirs("/app/nltk_data", exist_ok=True)
@@ -43,16 +42,14 @@ nltk.download("vader_lexicon", download_dir="/app/nltk_data")
43
  print("βœ… NLTK VADER cached")
44
  PY
45
 
46
- # (Optional) You can also pre-warm the tweet topic model if you want:
47
- # RUN python - <<'PY'
48
- # from transformers import pipeline
49
- # p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi")
50
- # p("warmup")
51
- # print("βœ… Topic model cached")
52
- # PY
53
 
54
  EXPOSE 7860
55
 
56
  # ---- Run ----
57
- # Use $PORT provided by Spaces (fallback to 7860)
58
- CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
 
4
  ENV PYTHONUNBUFFERED=1 \
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
 
7
  PORT=7860 \
8
+ NLTK_DATA=/app/nltk_data \
9
+ HF_HOME=/app/hf_cache \
10
+ TRANSFORMERS_CACHE=/app/hf_cache \
11
+ SENTENCE_TRANSFORMERS_HOME=/app/hf_cache
12
 
13
  # (optional) handy tools for healthchecks & logs
14
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
 
19
  # ---- Python deps ----
20
  COPY requirements.txt ./
21
  RUN python -m pip install --upgrade pip && \
 
22
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
23
  pip install -r requirements.txt && \
 
24
  pip install sentencepiece
25
 
26
  # ---- App code ----
 
34
  print("βœ… SBERT model cached")
35
  PY
36
 
37
+ # 2) Cache NLTK VADER lexicon into /app/nltk_data
38
  RUN python - <<'PY'
39
  import os, nltk
40
  os.makedirs("/app/nltk_data", exist_ok=True)
 
42
  print("βœ… NLTK VADER cached")
43
  PY
44
 
45
+ # 3) (Optional) Cache tldextract's PSL so first run is snappy
46
+ RUN python - <<'PY'
47
+ import tldextract
48
+ tldextract.extract("example.com")
49
+ print("βœ… tldextract PSL cached")
50
+ PY
 
51
 
52
  EXPOSE 7860
53
 
54
  # ---- Run ----
55
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
 
main.py CHANGED
@@ -28,9 +28,7 @@ import threading
28
  import difflib
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
31
- import os
32
  os.environ.setdefault("OMP_NUM_THREADS", "1")
33
- from fastapi import Path
34
 
35
  import torch
36
  torch.set_num_threads(2)
@@ -53,6 +51,9 @@ _local_pipes = {}
53
  _news_clf = None
54
  _sbert = None
55
 
 
 
 
56
  # --- Translation runtime flags / caches ---
57
  ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
58
  _hf_bad_models: Set[str] = set()
@@ -796,10 +797,21 @@ def cluster_id(cluster, enriched_articles):
796
 
797
 
798
  # ----------------- NLTK / VADER -----------------
 
 
 
 
 
 
799
  try:
800
  nltk.data.find("sentiment/vader_lexicon")
801
  except LookupError:
802
- nltk.download("vader_lexicon") # one-time fetch in a fresh container
 
 
 
 
 
803
 
804
  try:
805
  _vader = SentimentIntensityAnalyzer()
 
28
  import difflib
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
 
31
  os.environ.setdefault("OMP_NUM_THREADS", "1")
 
32
 
33
  import torch
34
  torch.set_num_threads(2)
 
51
  _news_clf = None
52
  _sbert = None
53
 
54
+
55
+
56
+
57
  # --- Translation runtime flags / caches ---
58
  ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
59
  _hf_bad_models: Set[str] = set()
 
797
 
798
 
799
  # ----------------- NLTK / VADER -----------------
800
+ NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
801
+
802
+ # Make sure NLTK looks in the baked, writable dir first
803
+ if NLTK_DATA_DIR not in nltk.data.path:
804
+ nltk.data.path.insert(0, NLTK_DATA_DIR)
805
+
806
  try:
807
  nltk.data.find("sentiment/vader_lexicon")
808
  except LookupError:
809
+ # As a fallback, try downloading into the writable dir (won't run if already baked)
810
+ try:
811
+ os.makedirs(NLTK_DATA_DIR, exist_ok=True)
812
+ nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
813
+ except Exception:
814
+ pass # don't crash if download is blocked
815
 
816
  try:
817
  _vader = SentimentIntensityAnalyzer()