init repo
Browse files
- .env.example +27 -0
- .gitignore +67 -0
- BUILD_INSTRUCTIONS.md +89 -0
- Dockerfile +70 -0
- README.md +150 -6
- api.py +213 -0
- app.py +441 -0
- config.py +63 -0
- docker-compose.yml +30 -0
- entrypoint.sh +53 -0
- main.py +76 -0
- packages.txt +12 -0
- requirements.txt +19 -0
- runtime.txt +1 -0
- test.ipynb +23 -0
.env.example
ADDED
@@ -0,0 +1,27 @@
+# Model configuration
+MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+MODEL_PATH=./models/gemma-3n-E4B-it-Q8_0.gguf
+HUGGINGFACE_TOKEN=
+
+# Model parameters - optimized for Docker containers
+N_CTX=4096
+N_GPU_LAYERS=0
+N_THREADS=4
+N_BATCH=512
+USE_MLOCK=false
+USE_MMAP=true
+F16_KV=true
+SEED=42
+
+# Server settings
+HOST=0.0.0.0
+GRADIO_PORT=7860
+API_PORT=8000
+
+# Generation settings
+MAX_NEW_TOKENS=256
+TEMPERATURE=0.1
+
+# File upload settings
+MAX_FILE_SIZE=10485760
.gitignore
ADDED
@@ -0,0 +1,67 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Model files
+models/*.gguf
+models/*.bin
+models/*.pt
+models/*.safetensors
+
+# Logs
+*.log
+logs/
+
+# Temporary files
+tmp/
+temp/
+
+# Docker
+.dockerignore
+
+# HuggingFace
+.huggingface/
BUILD_INSTRUCTIONS.md
ADDED
@@ -0,0 +1,89 @@
+# Instructions for building a Docker image with a pre-downloaded model
+
+## Overview of changes
+
+The Dockerfile was modified to pre-download the Hugging Face model during the image build. This provides:
+
+- ✅ Fast deployment (the model is already in the container)
+- ✅ Reliability (no network dependency at startup)
+- ✅ Consistency (a pinned model version)
+
+## Building the image
+
+### Basic build (for public models):
+
+```bash
+docker build -t llm-structured-output .
+```
+
+### Build with a Hugging Face token (for private models):
+
+```bash
+docker build --build-arg HUGGINGFACE_TOKEN=your_token_here -t llm-structured-output .
+```
+
+Or via an environment variable:
+
+```bash
+export HUGGINGFACE_TOKEN=your_token_here
+docker build -t llm-structured-output .
+```
+
+## Running the container
+
+```bash
+docker run -p 7860:7860 llm-structured-output
+```
+
+The application will be available at: http://localhost:7860
+
+## Running with docker-compose
+
+```bash
+docker-compose up --build
+```
+
+## Important changes
+
+### 1. Dockerfile
+- Added `git-lfs` for working with large files
+- Added the `DOCKER_CONTAINER=true` environment variable
+- Added a model pre-download stage
+- The model is downloaded while the image is being built
+
+### 2. app.py
+- Added a check for the Docker environment
+- If the model is not found in the Docker container, an error is raised
+- The model-loading logic is optimized for pre-downloaded models
+
+## Image size
+
+The image is larger because the model is baked in, but this is offset by:
+- Faster container startup
+- No network dependencies
+- Docker layer caching
+
+## Configuring the model
+
+To switch to a different model, edit `config.py`:
+
+```python
+MODEL_REPO: str = "your-repo/your-model"
+MODEL_FILENAME: str = "your-model.gguf"
+```
+
+Then rebuild the image.
+
+## Debugging
+
+To check that the model is present in the container:
+
+```bash
+docker run -it llm-structured-output ls -la /app/models/
+```
+
+To inspect the build logs:
+
+```bash
+docker build --no-cache -t llm-structured-output .
+```
Dockerfile
ADDED
@@ -0,0 +1,70 @@
+# Use Python 3.10 base image optimized for HuggingFace Spaces
+FROM python:3.10-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies required for llama-cpp-python and git-lfs
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    wget \
+    curl \
+    git \
+    git-lfs \
+    pkg-config \
+    libopenblas-dev \
+    libssl-dev \
+    musl-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Initialize git-lfs
+RUN git lfs install
+
+# Set environment variables for optimal Docker performance
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PIP_NO_CACHE_DIR=1
+ENV CMAKE_ARGS="-DLLAMA_OPENBLAS=on"
+ENV FORCE_CMAKE=1
+ENV DOCKER_CONTAINER=true
+
+# Create models directory
+RUN mkdir -p /app/models
+
+# Create symbolic link for musl libc compatibility (required for llama-cpp-python)
+RUN ln -sf /usr/lib/x86_64-linux-musl/libc.so /lib/libc.musl-x86_64.so.1 || \
+    ln -sf /usr/lib/x86_64-linux-gnu/libc.so.6 /lib/libc.musl-x86_64.so.1
+
+# Copy requirements first for better Docker layer caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy configuration to get model info
+COPY config.py .
+
+# Pre-download the model during build
+RUN python -c "import os; from huggingface_hub import hf_hub_download; from config import Config; os.makedirs('/app/models', exist_ok=True); print(f'Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...'); p=hf_hub_download(repo_id=Config.MODEL_REPO, filename=Config.MODEL_FILENAME, local_dir='/app/models', token=os.getenv('HUGGINGFACE_TOKEN') or None); print(f'Model downloaded to: {p}'); import os; s=os.path.getsize(p) if os.path.exists(p) else (_ for _ in ()).throw(FileNotFoundError(f'Model file not found: {p}')); print(f'Model file size: {s/(1024**3):.2f} GB'); (s>1024*1024) or (_ for _ in ()).throw(ValueError(f'Downloaded model file seems too small: {s} bytes')); print('Model download verification successful')"
+
+# Verify model file exists after build
+RUN ls -la /app/models/ && \
+    [ -f "/app/models/gemma-3n-E4B-it-Q8_0.gguf" ] || (echo "Model file not found!" && exit 1)
+
+# Copy application files
+COPY . .
+
+# Make entrypoint script executable
+RUN chmod +x entrypoint.sh
+
+# Create a non-root user for security
+RUN useradd -m -u 1000 user && chown -R user:user /app
+USER user
+
+# Expose the port that Gradio will run on
+EXPOSE 7860
+
+# Set entrypoint and default command
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["python", "main.py", "--mode", "gradio"]
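For readability, the inline `python -c` pre-download step in the Dockerfile above is roughly equivalent to the following standalone script. The file name `download_model.py` is hypothetical and the script itself is not part of this commit; it is a sketch of what the one-liner does.

```python
# Hypothetical download_model.py: a readable equivalent of the inline
# pre-download step in the Dockerfile above (not part of this commit).
import os

from huggingface_hub import hf_hub_download

from config import Config

os.makedirs("/app/models", exist_ok=True)
print(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")

path = hf_hub_download(
    repo_id=Config.MODEL_REPO,
    filename=Config.MODEL_FILENAME,
    local_dir="/app/models",
    token=os.getenv("HUGGINGFACE_TOKEN") or None,
)
print(f"Model downloaded to: {path}")

# The same sanity checks as the one-liner: the file must exist and be larger than 1 MB.
if not os.path.exists(path):
    raise FileNotFoundError(f"Model file not found: {path}")
size = os.path.getsize(path)
if size <= 1024 * 1024:
    raise ValueError(f"Downloaded model file seems too small: {size} bytes")
print(f"Model file size: {size / (1024**3):.2f} GB")
print("Model download verification successful")
```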
README.md
CHANGED
@@ -1,11 +1,155 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: LLM Structured Output Docker
+emoji: 🤖
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
-license:
+license: mit
+short_description: Get structured JSON responses from LLM using Docker
+tags:
+- llama-cpp
+- gguf
+- json-schema
+- structured-output
+- llm
+- docker
+- gradio
 ---
 
-
+# 🤖 LLM Structured Output (Docker Version)
+
+Dockerized application for getting structured responses from local GGUF language models in specified JSON format.
+
+
+## ✨ Key Features
+
+- **Docker containerized** for easy deployment on HuggingFace Spaces
+- **Local GGUF model support** via llama-cpp-python
+- **Optimized for containers** with configurable resources
+- **JSON schema support** for structured output
+- **Gradio web interface** for convenient interaction
+- **REST API** for integration with other applications
+- **Memory efficient** with GGUF quantized models
+
+## 🚀 Deployment on HuggingFace Spaces
+
+This version is specifically designed for HuggingFace Spaces with Docker SDK:
+
+1. Clone this repository
+2. Push to HuggingFace Spaces with `sdk: docker` in README.md
+3. The application will automatically build and deploy
+
+## 🐳 Local Docker Usage
+
+### Build the image:
+```bash
+docker build -t llm-structured-output .
+```
+
+### Run the container:
+```bash
+docker run -p 7860:7860 -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" llm-structured-output
+```
+
+### With custom configuration:
+```bash
+docker run -p 7860:7860 \
+  -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" \
+  -e MODEL_FILENAME="gemma-3n-E4B-it-Q8_0.gguf" \
+  -e N_CTX="4096" \
+  -e MAX_NEW_TOKENS="512" \
+  llm-structured-output
+```
+
+## 🌐 Application Access
+
+- **Web interface**: http://localhost:7860
+- **API**: Available through the same port
+- **Health check**: http://localhost:7860/health (when API mode is enabled)
+
+## 📋 Environment Variables
+
+Configure the application using environment variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MODEL_REPO` | `lmstudio-community/gemma-3n-E4B-it-text-GGUF` | HuggingFace model repository |
+| `MODEL_FILENAME` | `gemma-3n-E4B-it-Q8_0.gguf` | Model file name |
+| `N_CTX` | `4096` | Context window size |
+| `N_GPU_LAYERS` | `0` | GPU layers (0 for CPU-only) |
+| `N_THREADS` | `4` | CPU threads |
+| `MAX_NEW_TOKENS` | `256` | Maximum response length |
+| `TEMPERATURE` | `0.1` | Generation temperature |
+| `HUGGINGFACE_TOKEN` | `` | HF token for private models |
+
+## 📝 Usage Examples
+
+### Example JSON Schema:
+```json
+{
+  "type": "object",
+  "properties": {
+    "summary": {"type": "string"},
+    "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
+    "confidence": {"type": "number", "minimum": 0, "maximum": 1}
+  },
+  "required": ["summary", "sentiment"]
+}
+```
+
+### Example Prompt:
+```
+Analyze this review: "The product exceeded my expectations! Great quality and fast delivery."
+```
+
+## 🔧 Docker Optimizations
+
+This Docker version includes several optimizations:
+
+- **Reduced memory usage** with smaller context window and batch sizes
+- **CPU-optimized** configuration by default
+- **Efficient layer caching** for faster builds
+- **Security**: Runs as non-root user
+- **Multi-stage build** capabilities for production
+
+## 🏗️ Architecture
+
+- **Base Image**: Python 3.10 slim
+- **ML Backend**: llama-cpp-python with OpenBLAS
+- **Web Interface**: Gradio 4.x
+- **API**: FastAPI with automatic documentation
+- **Model Storage**: Downloaded on first run to `/app/models/`
+
+## 💡 Performance Tips
+
+1. **Memory**: Start with smaller models (7B or less)
+2. **CPU**: Adjust `N_THREADS` based on available cores
+3. **Context**: Reduce `N_CTX` if experiencing memory issues
+4. **Batch size**: Lower `N_BATCH` for memory-constrained environments
+
+## 🐛 Troubleshooting
+
+### Container fails to start:
+- Check available memory (minimum 4GB recommended)
+- Verify model repository accessibility
+- Ensure proper environment variable formatting
+
+### Model download issues:
+- Check internet connectivity in container
+- Verify `HUGGINGFACE_TOKEN` for private models
+- Ensure sufficient disk space
+
+### Performance issues:
+- Reduce `N_CTX` and `MAX_NEW_TOKENS`
+- Adjust `N_THREADS` to match CPU cores
+- Consider using smaller/quantized models
+
+## 📄 License
+
+MIT License - see LICENSE file for details.
+
+---
+
+For more information about HuggingFace Spaces Docker configuration, see: https://huggingface.co/docs/hub/spaces-config-reference
api.py
ADDED
@@ -0,0 +1,213 @@
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Optional, Dict, Any
+import json
+import base64
+from PIL import Image
+from io import BytesIO
+import uvicorn
+from app import llm_client
+
+# Create FastAPI application
+api_app = FastAPI(
+    title="LLM Structured Output API",
+    description="API for generating structured responses from local GGUF models via llama-cpp-python",
+    version="1.0.0"
+)
+
+# Setup CORS
+api_app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Data models for API
+class StructuredOutputRequest(BaseModel):
+    prompt: str
+    json_schema: Dict[str, Any]
+    image_base64: Optional[str] = None
+
+class StructuredOutputResponse(BaseModel):
+    success: bool
+    data: Optional[Dict[str, Any]] = None
+    error: Optional[str] = None
+    raw_response: Optional[str] = None
+
+def decode_base64_image(base64_string: str) -> Image.Image:
+    """Decode base64 string to PIL Image"""
+    try:
+        image_data = base64.b64decode(base64_string)
+        image = Image.open(BytesIO(image_data))
+        return image
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Image decoding error: {str(e)}")
+
+@api_app.post("/generate", response_model=StructuredOutputResponse)
+async def generate_structured_output(request: StructuredOutputRequest):
+    """
+    Main endpoint for generating structured response
+
+    Args:
+        request: Request containing prompt, JSON schema and optionally base64 image
+
+    Returns:
+        StructuredOutputResponse: Structured response or error
+    """
+    # Check model initialization
+    if llm_client is None:
+        raise HTTPException(
+            status_code=503,
+            detail="LLM model not initialized. Check server configuration."
+        )
+
+    try:
+        # Validate input data
+        if not request.prompt.strip():
+            raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+        if not request.json_schema:
+            raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+        # Decode image if provided
+        image = None
+        if request.image_base64:
+            image = decode_base64_image(request.image_base64)
+
+        # Generate response
+        result = llm_client.generate_structured_response(
+            prompt=request.prompt,
+            json_schema=request.json_schema,
+            image=image
+        )
+
+        # Format response
+        if "error" in result:
+            return StructuredOutputResponse(
+                success=False,
+                error=result["error"],
+                raw_response=result.get("raw_response")
+            )
+        else:
+            return StructuredOutputResponse(
+                success=True,
+                data=result.get("data"),
+                raw_response=result.get("raw_response")
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+@api_app.post("/generate_with_file", response_model=StructuredOutputResponse)
+async def generate_with_file(
+    prompt: str = Form(...),
+    json_schema: str = Form(...),
+    image: Optional[UploadFile] = File(None)
+):
+    """
+    Alternative endpoint for uploading image as file
+
+    Args:
+        prompt: Text prompt
+        json_schema: JSON schema as string
+        image: Uploaded image file
+
+    Returns:
+        StructuredOutputResponse: Structured response or error
+    """
+    # Check model initialization
+    if llm_client is None:
+        raise HTTPException(
+            status_code=503,
+            detail="LLM model not initialized. Check server configuration."
+        )
+
+    try:
+        # Validate input data
+        if not prompt.strip():
+            raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+        if not json_schema.strip():
+            raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+        # Parse JSON schema
+        try:
+            parsed_schema = json.loads(json_schema)
+        except json.JSONDecodeError as e:
+            raise HTTPException(status_code=400, detail=f"Invalid JSON schema: {str(e)}")
+
+        # Process image if provided
+        pil_image = None
+        if image:
+            # Check file type
+            if not image.content_type.startswith('image/'):
+                raise HTTPException(status_code=400, detail="Uploaded file must be an image")
+
+            # Read and convert image
+            image_data = await image.read()
+            pil_image = Image.open(BytesIO(image_data))
+
+        # Generate response
+        result = llm_client.generate_structured_response(
+            prompt=prompt,
+            json_schema=parsed_schema,
+            image=pil_image
+        )
+
+        # Format response
+        if "error" in result:
+            return StructuredOutputResponse(
+                success=False,
+                error=result["error"],
+                raw_response=result.get("raw_response")
+            )
+        else:
+            return StructuredOutputResponse(
+                success=True,
+                data=result.get("data"),
+                raw_response=result.get("raw_response")
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+@api_app.get("/health")
+async def health_check():
+    """API health check"""
+    model_status = "loaded" if llm_client is not None else "not_loaded"
+    return {
+        "status": "healthy" if llm_client is not None else "degraded",
+        "model_status": model_status,
+        "message": "API is working correctly" if llm_client is not None else "API is working, but model is not loaded"
+    }
+
+@api_app.get("/")
+async def root():
+    """Root endpoint with API information"""
+    return {
+        "message": "LLM Structured Output API",
+        "version": "1.0.0",
+        "model_loaded": llm_client is not None,
+        "endpoints": {
+            "/generate": "POST - main endpoint for generating structured response",
+            "/generate_with_file": "POST - endpoint with image file upload",
+            "/health": "GET - health check",
+            "/docs": "GET - automatic Swagger documentation"
+        }
+    }
+
+if __name__ == "__main__":
+    from config import Config
+    uvicorn.run(
+        "api:api_app",
+        host=Config.HOST,
+        port=Config.API_PORT,
+        reload=True
+    )
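A minimal client sketch for the `/generate` endpoint defined in `api.py` above, assuming the API is running in `api` or `both` mode on the default `API_PORT` (8000); the prompt and schema values are illustrative.

```python
# Minimal sketch: calling POST /generate with the request fields from
# StructuredOutputRequest (prompt, json_schema, optional image_base64).
import requests

payload = {
    "prompt": "Analyze this review: 'Great quality and fast delivery.'",
    "json_schema": {
        "type": "object",
        "properties": {
            "summary": {"type": "string"},
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
        },
        "required": ["summary", "sentiment"],
    },
}

# The response mirrors StructuredOutputResponse: success, data, error, raw_response.
resp = requests.post("http://localhost:8000/generate", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json())
```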
app.py
ADDED
@@ -0,0 +1,441 @@
+import json
+import os
+import gradio as gr
+from typing import Optional, Dict, Any, Union
+from PIL import Image
+from pydantic import BaseModel
+import logging
+from config import Config
+
+# Try to import llama_cpp with fallback
+try:
+    from llama_cpp import Llama
+    LLAMA_CPP_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: llama-cpp-python not available: {e}")
+    LLAMA_CPP_AVAILABLE = False
+    Llama = None
+
+# Try to import huggingface_hub
+try:
+    from huggingface_hub import hf_hub_download
+    HUGGINGFACE_HUB_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: huggingface_hub not available: {e}")
+    HUGGINGFACE_HUB_AVAILABLE = False
+    hf_hub_download = None
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class StructuredOutputRequest(BaseModel):
+    prompt: str
+    image: Optional[str] = None  # base64 encoded image
+    json_schema: Dict[str, Any]
+
+class LLMClient:
+    def __init__(self):
+        """
+        Initialize client for working with local GGUF model via llama-cpp-python
+        """
+        self.model_path = Config.get_model_path()
+        logger.info(f"Using model: {self.model_path}")
+
+        self.llm = None
+
+        self._initialize_model()
+
+    def _download_model_if_needed(self) -> str:
+        """Download model from Hugging Face if it doesn't exist locally"""
+        if os.path.exists(self.model_path):
+            logger.info(f"Model already exists at: {self.model_path}")
+            return self.model_path
+
+        # If model doesn't exist and we're in production (Docker),
+        # it means the build process failed or model is in wrong location
+        if os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true':
+            # Let's check common locations where model might be
+            alternative_paths = [
+                f"/app/models/{Config.MODEL_FILENAME}",
+                f"./models/{Config.MODEL_FILENAME}",
+                f"/models/{Config.MODEL_FILENAME}",
+                f"/app/{Config.MODEL_FILENAME}"
+            ]
+
+            for alt_path in alternative_paths:
+                if os.path.exists(alt_path):
+                    logger.info(f"Found model at alternative location: {alt_path}")
+                    return alt_path
+
+            # List what's actually in the models directory
+            models_dir = "/app/models"
+            if os.path.exists(models_dir):
+                files = os.listdir(models_dir)
+                logger.error(f"Contents of {models_dir}: {files}")
+            else:
+                logger.error(f"Directory {models_dir} does not exist")
+
+        # Try to download as fallback
+        logger.warning("Model not found in expected locations, attempting download...")
+
+        if not HUGGINGFACE_HUB_AVAILABLE:
+            raise ImportError("huggingface_hub is not available. Please install it to download models.")
+
+        logger.info(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
+
+        # Create models directory if it doesn't exist
+        models_dir = Config.get_models_dir()
+        os.makedirs(models_dir, exist_ok=True)
+
+        try:
+            # Download model
+            model_path = hf_hub_download(
+                repo_id=Config.MODEL_REPO,
+                filename=Config.MODEL_FILENAME,
+                local_dir=models_dir,
+                token=Config.HUGGINGFACE_TOKEN if Config.HUGGINGFACE_TOKEN else None
+            )
+
+            logger.info(f"Model downloaded to: {model_path}")
+            return model_path
+        except Exception as e:
+            logger.error(f"Failed to download model: {e}")
+            raise
+
+    def _initialize_model(self):
+        """Initialize local GGUF model"""
+        try:
+            if not LLAMA_CPP_AVAILABLE:
+                raise ImportError("llama-cpp-python is not available. Please check installation.")
+
+            logger.info("Loading local model...")
+
+            # Download model if needed
+            model_path = self._download_model_if_needed()
+
+            # Verify model file exists and is readable
+            if not os.path.exists(model_path):
+                raise FileNotFoundError(f"Model file not found: {model_path}")
+
+            # Check file size to ensure it's not corrupted
+            file_size = os.path.getsize(model_path)
+            if file_size < 1024 * 1024:  # Less than 1MB is suspicious for GGUF model
+                raise ValueError(f"Model file seems corrupted or incomplete. Size: {file_size} bytes")
+
+            logger.info(f"Model file verified. Size: {file_size / (1024**3):.2f} GB")
+
+            # Initialize Llama model with enhanced error handling
+            logger.info("Initializing Llama model...")
+            self.llm = Llama(
+                model_path=model_path,
+                n_ctx=Config.N_CTX,
+                n_batch=Config.N_BATCH,
+                n_gpu_layers=Config.N_GPU_LAYERS,
+                use_mlock=Config.USE_MLOCK,
+                use_mmap=Config.USE_MMAP,
+                vocab_only=False,
+                f16_kv=Config.F16_KV,
+                logits_all=False,
+                embedding=False,
+                n_threads=Config.N_THREADS,
+                last_n_tokens_size=64,
+                lora_base=None,
+                lora_path=None,
+                seed=Config.SEED,
+                verbose=True  # Enable verbose for debugging
+            )
+
+            logger.info("Model successfully loaded and initialized")
+
+            # Test model with a simple prompt to verify it's working
+            logger.info("Testing model with simple prompt...")
+            test_response = self.llm("Hello", max_tokens=1, temperature=0.1)
+            logger.info("Model test successful")
+
+        except Exception as e:
+            logger.error(f"Error initializing model: {e}")
+            # Provide more specific error information
+            if "Failed to load model from file" in str(e):
+                logger.error("This error usually indicates:")
+                logger.error("1. Model file is corrupted or incomplete")
+                logger.error("2. llama-cpp-python version is incompatible with the model")
+                logger.error("3. Insufficient memory to load the model")
+                logger.error(f"4. Model path: {self.model_path}")
+            raise
+
+    def _validate_json_schema(self, schema: str) -> Dict[str, Any]:
+        """Validate and parse JSON schema"""
+        try:
+            parsed_schema = json.loads(schema)
+            return parsed_schema
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON schema: {e}")
+
+    def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
+        """
+        Format prompt for structured output generation
+        """
+        schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
+
+        formatted_prompt = f"""User: {prompt}
+
+Please respond in strict accordance with the following JSON schema:
+
+```json
+{schema_str}
+```
+
+Return ONLY valid JSON without additional comments or explanations."""
+
+        return formatted_prompt
+
+    def generate_structured_response(self,
+                                     prompt: str,
+                                     json_schema: Union[str, Dict[str, Any]],
+                                     image: Optional[Image.Image] = None) -> Dict[str, Any]:
+        """
+        Generate structured response from local GGUF model
+        """
+        try:
+            # Validate and parse JSON schema
+            if isinstance(json_schema, str):
+                parsed_schema = self._validate_json_schema(json_schema)
+            else:
+                parsed_schema = json_schema
+
+            # Format prompt
+            formatted_prompt = self._format_prompt_with_schema(prompt, parsed_schema)
+
+            # Warning about images (not supported in this implementation)
+            if image is not None:
+                logger.warning("Image processing is not supported with this local model")
+
+            # Generate response
+            logger.info("Generating response...")
+
+            response = self.llm(
+                formatted_prompt,
+                max_tokens=Config.MAX_NEW_TOKENS,
+                temperature=Config.TEMPERATURE,
+                stop=["User:", "\n\n"],
+                echo=False
+            )
+
+            # Extract generated text
+            generated_text = response['choices'][0]['text']
+
+            # Attempt to parse JSON response
+            try:
+                # Find JSON in response
+                json_start = generated_text.find('{')
+                json_end = generated_text.rfind('}') + 1
+
+                if json_start != -1 and json_end > json_start:
+                    json_str = generated_text[json_start:json_end]
+                    parsed_response = json.loads(json_str)
+                    return {
+                        "success": True,
+                        "data": parsed_response,
+                        "raw_response": generated_text
+                    }
+                else:
+                    return {
+                        "error": "Could not find JSON in model response",
+                        "raw_response": generated_text
+                    }
+
+            except json.JSONDecodeError as e:
+                return {
+                    "error": f"JSON parsing error: {e}",
+                    "raw_response": generated_text
+                }
+
+        except Exception as e:
+            logger.error(f"Unexpected error: {e}")
+            return {
+                "error": f"Generation error: {str(e)}"
+            }
+
+# Initialize client
+logger.info("Initializing LLM client...")
+try:
+    llm_client = LLMClient()
+    logger.info("LLM client successfully initialized")
+except Exception as e:
+    logger.error(f"Error initializing LLM client: {e}")
+    llm_client = None
+
+def process_request(prompt: str,
+                    json_schema: str,
+                    image: Optional[Image.Image] = None) -> str:
+    """
+    Process request through Gradio interface
+    """
+    if llm_client is None:
+        return json.dumps({
+            "error": "LLM client not initialized",
+            "details": "Check logs for detailed error information"
+        }, ensure_ascii=False, indent=2)
+
+    if not prompt.strip():
+        return json.dumps({"error": "Prompt cannot be empty"}, ensure_ascii=False, indent=2)
+
+    if not json_schema.strip():
+        return json.dumps({"error": "JSON schema cannot be empty"}, ensure_ascii=False, indent=2)
+
+    result = llm_client.generate_structured_response(prompt, json_schema, image)
+    return json.dumps(result, ensure_ascii=False, indent=2)
+
+# Examples for demonstration
+example_schema = """{
+    "type": "object",
+    "properties": {
+        "summary": {
+            "type": "string",
+            "description": "Brief summary of the response"
+        },
+        "sentiment": {
+            "type": "string",
+            "enum": ["positive", "negative", "neutral"],
+            "description": "Emotional tone"
+        },
+        "confidence": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1,
+            "description": "Confidence level in the response"
+        },
+        "keywords": {
+            "type": "array",
+            "items": {
+                "type": "string"
+            },
+            "description": "Key words"
+        }
+    },
+    "required": ["summary", "sentiment", "confidence"]
+}"""
+
+example_prompt = "Analyze the following text and provide a structured assessment: 'The company's new product received enthusiastic user reviews. Sales exceeded all expectations by 150%.'"
+
+def create_gradio_interface():
+    """Create Gradio interface"""
+
+    with gr.Blocks(title="LLM Structured Output", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# 🤖 LLM with Structured Output")
+        gr.Markdown(f"Application for generating structured responses using model **{Config.MODEL_REPO}/{Config.MODEL_FILENAME}**")
+
+        # Show model status
+        if llm_client is None:
+            gr.Markdown("⚠️ **Warning**: Model not loaded. Check configuration and restart the application.")
+        else:
+            gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
+
+        with gr.Row():
+            with gr.Column():
+                prompt_input = gr.Textbox(
+                    label="Prompt for model",
+                    placeholder="Enter your request...",
+                    lines=5,
+                    value=example_prompt
+                )
+
+                image_input = gr.Image(
+                    label="Image (optional, for multimodal models)",
+                    type="pil"
+                )
+
+                schema_input = gr.Textbox(
+                    label="JSON schema for response structure",
+                    placeholder="Enter JSON schema...",
+                    lines=15,
+                    value=example_schema
+                )
+
+                submit_btn = gr.Button("Generate Response", variant="primary")
+
+            with gr.Column():
+                output = gr.Textbox(
+                    label="Structured Response",
+                    lines=20,
+                    interactive=False
+                )
+
+        submit_btn.click(
+            fn=process_request,
+            inputs=[prompt_input, schema_input, image_input],
+            outputs=output
+        )
+
+        # Examples
+        gr.Markdown("## 📝 Usage Examples")
+
+        examples = gr.Examples(
+            examples=[
+                [
+                    "Describe today's weather in New York",
+                    """{
+    "type": "object",
+    "properties": {
+        "temperature": {"type": "number"},
+        "description": {"type": "string"},
+        "humidity": {"type": "number"}
+    }
+}""",
+                    None
+                ],
+                [
+                    "Create a Python learning plan for one month",
+                    """{
+    "type": "object",
+    "properties": {
+        "weeks": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "week_number": {"type": "integer"},
+                    "topics": {"type": "array", "items": {"type": "string"}},
+                    "practice_hours": {"type": "number"}
+                }
+            }
+        },
+        "total_hours": {"type": "number"}
+    }
+}""",
+                    None
+                ]
+            ],
+            inputs=[prompt_input, schema_input, image_input]
+        )
+
+        # Model information
+        gr.Markdown(f"""
+## ℹ️ Model Information
+
+- **Model**: {Config.MODEL_REPO}/{Config.MODEL_FILENAME}
+- **Local path**: {Config.MODEL_PATH}
+- **Context window**: {Config.N_CTX} tokens
+- **Batch size**: {Config.N_BATCH}
+- **GPU layers**: {Config.N_GPU_LAYERS if Config.N_GPU_LAYERS >= 0 else "All"}
+- **CPU threads**: {Config.N_THREADS}
+- **Maximum response length**: {Config.MAX_NEW_TOKENS} tokens
+- **Temperature**: {Config.TEMPERATURE}
+- **Memory lock**: {"Enabled" if Config.USE_MLOCK else "Disabled"}
+- **Memory mapping**: {"Enabled" if Config.USE_MMAP else "Disabled"}
+
+💡 **Tip**: Use clear and specific JSON schemas for better results.
+""")
+
+    return demo
+
+if __name__ == "__main__":
+    # Create and launch Gradio interface
+    demo = create_gradio_interface()
+    demo.launch(
+        server_name=Config.HOST,
+        server_port=Config.GRADIO_PORT,
+        share=False,
+        debug=True
+    )
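A minimal sketch of calling `LLMClient` directly, without the HTTP layer. Importing `app` initializes the module-level `llm_client` (and loads the model) at import time, exactly as `api.py` does, so this assumes the GGUF file and `llama-cpp-python` are available; the prompt and schema are illustrative.

```python
# Minimal sketch: reuse the module-level llm_client that app.py creates on import.
from app import llm_client

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

if llm_client is not None:
    result = llm_client.generate_structured_response(
        prompt="What is the capital of France?",
        json_schema=schema,
    )
    # Either {"success": True, "data": {...}, "raw_response": "..."}
    # or {"error": "...", "raw_response": "..."} when parsing or generation fails.
    print(result)
```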
config.py
ADDED
@@ -0,0 +1,63 @@
+import os
+from typing import Optional
+
+class Config:
+    """Application configuration for working with local GGUF models"""
+
+    # Model settings - using Hugging Face downloaded model
+    MODEL_REPO: str = os.getenv("MODEL_REPO", "lmstudio-community/gemma-3n-E4B-it-text-GGUF")
+    MODEL_FILENAME: str = os.getenv("MODEL_FILENAME", "gemma-3n-E4B-it-Q8_0.gguf")
+    MODEL_PATH: str = os.getenv("MODEL_PATH", "/app/models/gemma-3n-E4B-it-Q8_0.gguf")
+    HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
+
+    # Model loading settings - optimized for Docker container
+    N_CTX: int = int(os.getenv("N_CTX", "4096"))  # Reduced context window for Docker
+    N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0"))  # CPU-only for Docker by default
+    N_THREADS: int = int(os.getenv("N_THREADS", "4"))  # Conservative thread count
+    N_BATCH: int = int(os.getenv("N_BATCH", "512"))  # Smaller batch size for Docker
+    USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true"  # Disabled for Docker
+    USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true"  # Keep memory mapping
+    F16_KV: bool = os.getenv("F16_KV", "true").lower() == "true"  # Use 16-bit keys and values
+    SEED: int = int(os.getenv("SEED", "42"))  # Random seed for reproducibility
+
+    # Server settings - Docker compatible
+    HOST: str = os.getenv("HOST", "0.0.0.0")
+    GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860"))  # Standard HuggingFace Spaces port
+    API_PORT: int = int(os.getenv("API_PORT", "8000"))
+
+    # Generation settings - optimized for Docker
+    MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256"))  # Reduced for faster response
+    TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.1"))
+
+    # File upload settings
+    MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # 10MB
+    ALLOWED_IMAGE_EXTENSIONS: set = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
+
+    @classmethod
+    def is_model_available(cls) -> bool:
+        """Check if local model file exists"""
+        return os.path.exists(cls.MODEL_PATH)
+
+    @classmethod
+    def get_model_path(cls) -> str:
+        """Get absolute path to model file"""
+        return os.path.abspath(cls.MODEL_PATH)
+
+    @classmethod
+    def get_models_dir(cls) -> str:
+        """Get models directory path"""
+        return os.path.dirname(cls.MODEL_PATH)
+
+    @classmethod
+    def load_from_env_file(cls, env_file: str = ".env") -> None:
+        """Load configuration from .env file"""
+        if os.path.exists(env_file):
+            with open(env_file, 'r') as f:
+                for line in f:
+                    line = line.strip()
+                    if line and not line.startswith('#') and '=' in line:
+                        key, value = line.split('=', 1)
+                        os.environ[key.strip()] = value.strip()
+
+# Automatically load from .env file on import
+Config.load_from_env_file()
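A small sketch of how `Config` picks up overrides: the class attributes read `os.getenv` when `config` is first imported, so environment variables must be set before that import. The values below are illustrative.

```python
# Minimal sketch: override settings via environment variables before importing config.
import os

os.environ["N_CTX"] = "2048"
os.environ["MAX_NEW_TOKENS"] = "128"

from config import Config

print(Config.N_CTX)             # 2048
print(Config.MAX_NEW_TOKENS)    # 128
print(Config.get_models_dir())  # directory portion of MODEL_PATH, e.g. /app/models
```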
docker-compose.yml
ADDED
@@ -0,0 +1,30 @@
+version: '3.8'
+
+services:
+  llm-app:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "7860:7860"
+    environment:
+      - MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+      - MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+      - N_CTX=4096
+      - N_GPU_LAYERS=0
+      - N_THREADS=4
+      - MAX_NEW_TOKENS=256
+      - TEMPERATURE=0.1
+    volumes:
+      # Optional: Mount models directory to persist downloaded models
+      - ./models:/app/models
+    restart: unless-stopped
+    mem_limit: 8g
+    # Uncomment below for GPU support
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: 1
+    #           capabilities: [gpu]
entrypoint.sh
ADDED
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Entrypoint script for LLM Structured Output Docker container
+
+set -e
+
+# Print environment info
+echo "🐳 Starting LLM Structured Output Docker container"
+echo "Python version: $(python --version)"
+echo "Working directory: $(pwd)"
+echo "User: $(whoami)"
+
+# Create models directory if it doesn't exist
+mkdir -p /app/models
+
+# Check if musl libc symbolic link exists (required for llama-cpp-python)
+if [ ! -e "/lib/libc.musl-x86_64.so.1" ]; then
+    echo "⚠️ Warning: musl libc symbolic link not found. Checking for available libc libraries..."
+    ls -la /usr/lib/x86_64-linux-* 2>/dev/null || echo "No musl libraries found"
+    ls -la /usr/lib/x86_64-linux-gnu/libc.so* 2>/dev/null || echo "No glibc libraries found"
+fi
+
+# Check available memory
+echo "📊 System information:"
+echo "Memory: $(cat /proc/meminfo | grep MemTotal)"
+echo "CPU cores: $(nproc)"
+echo "Disk space: $(df -h /app)"
+
+# Set default values for key environment variables if not provided
+export MODEL_REPO=${MODEL_REPO:-"lmstudio-community/gemma-3n-E4B-it-text-GGUF"}
+export MODEL_FILENAME=${MODEL_FILENAME:-"gemma-3n-E4B-it-Q8_0.gguf"}
+export N_CTX=${N_CTX:-"4096"}
+export N_GPU_LAYERS=${N_GPU_LAYERS:-"0"}
+export N_THREADS=${N_THREADS:-"4"}
+export MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-"256"}
+
+echo "🔧 Configuration:"
+echo "Model: $MODEL_REPO/$MODEL_FILENAME"
+echo "Context size: $N_CTX"
+echo "GPU layers: $N_GPU_LAYERS"
+echo "CPU threads: $N_THREADS"
+echo "Max tokens: $MAX_NEW_TOKENS"
+
+# Check if running in HuggingFace Spaces
+if [ "$SPACE_ID" ]; then
+    echo "🤗 Running in HuggingFace Spaces: $SPACE_ID"
+    export HOST=0.0.0.0
+    export GRADIO_PORT=7860
+fi
+
+# Execute the main command
+echo "🚀 Starting application..."
+exec "$@"
main.py
ADDED
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+Main file for launching LLM Structured Output application in Docker
+"""
+
+import argparse
+import threading
+import time
+from config import Config
+
+def run_gradio():
+    """Launch Gradio interface"""
+    from app import create_gradio_interface
+
+    print(f"🎨 Starting Gradio interface at http://{Config.HOST}:{Config.GRADIO_PORT}")
+    demo = create_gradio_interface()
+    demo.launch(
+        server_name=Config.HOST,
+        server_port=Config.GRADIO_PORT,
+        share=False,
+        debug=False  # Disabled debug for production
+    )
+
+def run_api():
+    """Launch FastAPI server"""
+    import uvicorn
+    from api import api_app
+
+    print(f"🚀 Starting API at http://{Config.HOST}:{Config.API_PORT}")
+    uvicorn.run(
+        api_app,
+        host=Config.HOST,
+        port=Config.API_PORT,
+        log_level="info"
+    )
+
+def run_both():
+    """Launch both services simultaneously"""
+    print("🚀 Starting LLM Structured Output application...")
+    print("=" * 60)
+    print(f"🌐 Gradio interface: http://{Config.HOST}:{Config.GRADIO_PORT}")
+    print(f"🔌 API: http://{Config.HOST}:{Config.API_PORT}")
+    print(f"📖 API documentation: http://{Config.HOST}:{Config.API_PORT}/docs")
+    print("=" * 60)
+
+    # Start API in separate thread
+    api_thread = threading.Thread(target=run_api, daemon=True)
+    api_thread.start()
+
+    # Small delay for API startup
+    time.sleep(2)
+
+    # Start Gradio in main thread
+    run_gradio()
+
+def main():
+    """Main function with command line arguments"""
+    parser = argparse.ArgumentParser(description="LLM Structured Output application")
+    parser.add_argument(
+        "--mode",
+        choices=["gradio", "api", "both"],
+        default="gradio",  # Default to gradio only for HuggingFace Spaces
+        help="Launch mode: gradio (interface only), api (API only), both (both services)"
+    )
+
+    args = parser.parse_args()
+
+    if args.mode == "gradio":
+        run_gradio()
+    elif args.mode == "api":
+        run_api()
+    else:
+        run_both()
+
+if __name__ == "__main__":
+    main()
packages.txt
ADDED
@@ -0,0 +1,12 @@
+# System packages required for Docker build
+# These are installed in the Dockerfile, not needed for HF Spaces with Docker SDK
+# but keeping for reference
+
+# build-essential
+# cmake
+# wget
+# curl
+# git
+# pkg-config
+# libopenblas-dev
+# libssl-dev
requirements.txt
ADDED
@@ -0,0 +1,19 @@
+huggingface_hub==0.25.2
+# Core ML dependencies - updated for compatibility with gemma-3n-E4B model
+llama-cpp-python>=0.3.4
+
+# Web interface
+gradio==4.44.1
+fastapi>=0.100.0,<0.115.0
+uvicorn[standard]>=0.20.0,<0.31.0
+
+# Data processing
+pillow>=9.0.0,<11.0.0
+pydantic==2.10.6
+numpy>=1.24.0,<2.0.0
+
+# HTTP requests
+requests>=2.28.0
+
+# Additional dependencies for Docker environment
+python-multipart
runtime.txt
ADDED
@@ -0,0 +1 @@
+3.10
test.ipynb
ADDED
@@ -0,0 +1,23 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c364ff11",
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}