newsglobe-backend / Dockerfile
MANOJSEQ's picture
Upload 2 files
d62f608 verified
raw
history blame
1.5 kB
# ---- Base image ----
# Official slim Python 3.10 image; everything below is layered on top of it.
FROM python:3.10-slim

# Environment shared by the build steps and the running container:
#   - PYTHONUNBUFFERED / PIP_NO_CACHE_DIR: unbuffered logs, no pip wheel cache.
#   - PORT: default listening port, read again by the CMD at start-up.
#   - HF_* / TRANSFORMERS_CACHE / SENTENCE_TRANSFORMERS_HOME / NLTK_DATA:
#     keep every model and corpus cache under /app.
# NOTE(review): recent transformers releases reportedly deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME - confirm before dropping it.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=7860 \
    HF_HUB_DISABLE_TELEMETRY=1 \
    HF_HOME=/app/hf_cache \
    TRANSFORMERS_CACHE=/app/hf_cache \
    SENTENCE_TRANSFORMERS_HOME=/app/hf_cache \
    NLTK_DATA=/app/nltk_data
# ---- OS packages ----
# Optional helpers (curl, git) for healthchecks and log inspection.
# The package index is refreshed, used and removed inside the same layer so
# the apt lists never end up in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        git \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app

# ---- Python deps ----
# Only the requirements manifest is copied here so this layer is reused
# until requirements.txt itself changes.
COPY requirements.txt ./

# One layer, fail-fast (`set -e`): upgrade pip, install the CPU-only torch
# wheel from the PyTorch index, then the project requirements, then
# sentencepiece.
RUN <<'SH'
set -e
python -m pip install --upgrade pip
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements.txt
pip install sentencepiece
SH
# ---- Warm caches into the image layer ----
# These steps only import the installed packages, never the application
# source, so they run BEFORE `COPY . .`. With the copy placed first, every
# source edit invalidated these layers and forced the downloads again.

# 1) Cache sentence-transformers model (SBERT)
#    Instantiating the model makes the library fetch it into its cache
#    directory (SENTENCE_TRANSFORMERS_HOME / HF_HOME are set in ENV above).
RUN python - <<'PY'
from sentence_transformers import SentenceTransformer
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("✅ SBERT model cached")
PY

# 2) Cache NLTK VADER lexicon into /app/nltk_data
#    Same path as the NLTK_DATA environment variable set in ENV above.
RUN python - <<'PY'
import os, nltk
os.makedirs("/app/nltk_data", exist_ok=True)
nltk.download("vader_lexicon", download_dir="/app/nltk_data")
print("✅ NLTK VADER cached")
PY

# 3) (Optional) Cache tldextract's PSL (Public Suffix List) so first run is snappy
#    A single extract() call is enough to trigger the library's fetch-and-cache.
RUN python - <<'PY'
import tldextract
tldextract.extract("example.com")
print("✅ tldextract PSL cached")
PY

# ---- App code ----
# Copied last: it is the most frequently changing input, so a source edit
# now rebuilds only this layer and leaves the dependency and cache layers reused.
COPY . .
# Documents the default listening port (matches the PORT default in ENV).
EXPOSE 7860
# ---- Run ----
# `sh -c` is kept so ${PORT:-7860} is expanded when the container starts
# (exec-form CMD performs no variable expansion on its own). `exec` replaces
# the shell with uvicorn so the server is PID 1 and receives SIGTERM from
# `docker stop` directly instead of relying on the wrapper shell.
CMD ["sh", "-c", "exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860}"]