MANOJSEQ committed on
Commit
d62f608
·
verified ·
1 Parent(s): 94e738e

Upload 2 files

Browse files
Files changed (2) hide show
  1. Dockerfile +12 -15
  2. main.py +15 -3
Dockerfile CHANGED
@@ -4,10 +4,11 @@ FROM python:3.10-slim
4
  ENV PYTHONUNBUFFERED=1 \
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
7
- # HF Spaces will set PORT; keep a default
8
  PORT=7860 \
9
- # Ensure NLTK looks here (writable in container)
10
- NLTK_DATA=/app/nltk_data
 
 
11
 
12
  # (optional) handy tools for healthchecks & logs
13
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
@@ -18,10 +19,8 @@ WORKDIR /app
18
  # ---- Python deps ----
19
  COPY requirements.txt ./
20
  RUN python -m pip install --upgrade pip && \
21
- # Install CPU-only PyTorch first (lighter & reliable on Spaces)
22
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
23
  pip install -r requirements.txt && \
24
- # Optional but recommended so local OPUS translation can work
25
  pip install sentencepiece
26
 
27
  # ---- App code ----
@@ -35,7 +34,7 @@ SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
35
  print("βœ… SBERT model cached")
36
  PY
37
 
38
- # 2) Cache NLTK VADER lexicon into /app/nltk_data (writable path)
39
  RUN python - <<'PY'
40
  import os, nltk
41
  os.makedirs("/app/nltk_data", exist_ok=True)
@@ -43,16 +42,14 @@ nltk.download("vader_lexicon", download_dir="/app/nltk_data")
43
  print("βœ… NLTK VADER cached")
44
  PY
45
 
46
- # (Optional) You can also pre-warm the tweet topic model if you want:
47
- # RUN python - <<'PY'
48
- # from transformers import pipeline
49
- # p = pipeline("text-classification", model="cardiffnlp/tweet-topic-21-multi")
50
- # p("warmup")
51
- # print("βœ… Topic model cached")
52
- # PY
53
 
54
  EXPOSE 7860
55
 
56
  # ---- Run ----
57
- # Use $PORT provided by Spaces (fallback to 7860)
58
- CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
 
4
  ENV PYTHONUNBUFFERED=1 \
5
  PIP_NO_CACHE_DIR=1 \
6
  HF_HUB_DISABLE_TELEMETRY=1 \
 
7
  PORT=7860 \
8
+ NLTK_DATA=/app/nltk_data \
9
+ HF_HOME=/app/hf_cache \
10
+ TRANSFORMERS_CACHE=/app/hf_cache \
11
+ SENTENCE_TRANSFORMERS_HOME=/app/hf_cache
12
 
13
  # (optional) handy tools for healthchecks & logs
14
  RUN apt-get update && apt-get install -y --no-install-recommends curl git && \
 
19
  # ---- Python deps ----
20
  COPY requirements.txt ./
21
  RUN python -m pip install --upgrade pip && \
 
22
  pip install torch --index-url https://download.pytorch.org/whl/cpu && \
23
  pip install -r requirements.txt && \
 
24
  pip install sentencepiece
25
 
26
  # ---- App code ----
 
34
  print("βœ… SBERT model cached")
35
  PY
36
 
37
+ # 2) Cache NLTK VADER lexicon into /app/nltk_data
38
  RUN python - <<'PY'
39
  import os, nltk
40
  os.makedirs("/app/nltk_data", exist_ok=True)
 
42
  print("βœ… NLTK VADER cached")
43
  PY
44
 
45
+ # 3) (Optional) Cache tldextract's PSL so first run is snappy
46
+ RUN python - <<'PY'
47
+ import tldextract
48
+ tldextract.extract("example.com")
49
+ print("βœ… tldextract PSL cached")
50
+ PY
 
51
 
52
  EXPOSE 7860
53
 
54
  # ---- Run ----
55
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]
 
main.py CHANGED
@@ -28,9 +28,7 @@ import threading
28
  import difflib
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
31
- import os
32
  os.environ.setdefault("OMP_NUM_THREADS", "1")
33
- from fastapi import Path
34
 
35
  import torch
36
  torch.set_num_threads(2)
@@ -53,6 +51,9 @@ _local_pipes = {}
53
  _news_clf = None
54
  _sbert = None
55
 
 
 
 
56
  # --- Translation runtime flags / caches ---
57
  ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
58
  _hf_bad_models: Set[str] = set()
@@ -796,10 +797,21 @@ def cluster_id(cluster, enriched_articles):
796
 
797
 
798
  # ----------------- NLTK / VADER -----------------
 
 
 
 
 
 
799
  try:
800
  nltk.data.find("sentiment/vader_lexicon")
801
  except LookupError:
802
- nltk.download("vader_lexicon") # one-time fetch in a fresh container
 
 
 
 
 
803
 
804
  try:
805
  _vader = SentimentIntensityAnalyzer()
 
28
  import difflib
29
  from starlette.middleware.gzip import GZipMiddleware
30
  from transformers import pipeline as hf_pipeline
 
31
  os.environ.setdefault("OMP_NUM_THREADS", "1")
 
32
 
33
  import torch
34
  torch.set_num_threads(2)
 
51
  _news_clf = None
52
  _sbert = None
53
 
54
+
55
+
56
+
57
  # --- Translation runtime flags / caches ---
58
  ALLOW_HF_REMOTE = os.getenv("ALLOW_HF_REMOTE", "0") == "1" # default OFF
59
  _hf_bad_models: Set[str] = set()
 
797
 
798
 
799
  # ----------------- NLTK / VADER -----------------
800
+ NLTK_DATA_DIR = os.environ.get("NLTK_DATA", "/app/nltk_data")
801
+
802
+ # Make sure NLTK looks in the baked, writable dir first
803
+ if NLTK_DATA_DIR not in nltk.data.path:
804
+ nltk.data.path.insert(0, NLTK_DATA_DIR)
805
+
806
  try:
807
  nltk.data.find("sentiment/vader_lexicon")
808
  except LookupError:
809
+ # As a fallback, try downloading into the writable dir (won't run if already baked)
810
+ try:
811
+ os.makedirs(NLTK_DATA_DIR, exist_ok=True)
812
+ nltk.download("vader_lexicon", download_dir=NLTK_DATA_DIR, quiet=True)
813
+ except Exception:
814
+ pass # don't crash if download is blocked
815
 
816
  try:
817
  _vader = SentimentIntensityAnalyzer()