lyangas committed
Commit b269c5d · 1 Parent(s): 99a5e1e
Files changed (15)
  1. .env.example +27 -0
  2. .gitignore +67 -0
  3. BUILD_INSTRUCTIONS.md +89 -0
  4. Dockerfile +70 -0
  5. README.md +150 -6
  6. api.py +213 -0
  7. app.py +441 -0
  8. config.py +63 -0
  9. docker-compose.yml +30 -0
  10. entrypoint.sh +53 -0
  11. main.py +76 -0
  12. packages.txt +12 -0
  13. requirements.txt +19 -0
  14. runtime.txt +1 -0
  15. test.ipynb +23 -0
.env.example ADDED
@@ -0,0 +1,27 @@
+ # Model configuration
+ MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+ MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+ MODEL_PATH=./models/gemma-3n-E4B-it-Q8_0.gguf
+ HUGGINGFACE_TOKEN=
+
+ # Model parameters - optimized for Docker containers
+ N_CTX=4096
+ N_GPU_LAYERS=0
+ N_THREADS=4
+ N_BATCH=512
+ USE_MLOCK=false
+ USE_MMAP=true
+ F16_KV=true
+ SEED=42
+
+ # Server settings
+ HOST=0.0.0.0
+ GRADIO_PORT=7860
+ API_PORT=8000
+
+ # Generation settings
+ MAX_NEW_TOKENS=256
+ TEMPERATURE=0.1
+
+ # File upload settings
+ MAX_FILE_SIZE=10485760
.gitignore ADDED
@@ -0,0 +1,67 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Model files
+ models/*.gguf
+ models/*.bin
+ models/*.pt
+ models/*.safetensors
+
+ # Logs
+ *.log
+ logs/
+
+ # Temporary files
+ tmp/
+ temp/
+
+ # Docker
+ .dockerignore
+
+ # HuggingFace
+ .huggingface/
BUILD_INSTRUCTIONS.md ADDED
@@ -0,0 +1,89 @@
+ # Instructions for building the Docker image with a preloaded model
+
+ ## Overview of changes
+
+ The Dockerfile was modified to pre-download the Hugging Face model during the image build. This provides:
+
+ - ✅ Fast deployment (the model is already in the container)
+ - ✅ Reliability (no network dependency at startup)
+ - ✅ Consistency (a pinned model version)
+
+ ## Building the image
+
+ ### Basic build (for public models):
+
+ ```bash
+ docker build -t llm-structured-output .
+ ```
+
+ ### Build with a Hugging Face token (for private models):
+
+ ```bash
+ docker build --build-arg HUGGINGFACE_TOKEN=your_token_here -t llm-structured-output .
+ ```
+
+ Or via an environment variable:
+
+ ```bash
+ export HUGGINGFACE_TOKEN=your_token_here
+ docker build -t llm-structured-output .
+ ```
+
+ ## Running the container
+
+ ```bash
+ docker run -p 7860:7860 llm-structured-output
+ ```
+
+ The application will be available at: http://localhost:7860
+
+ ## Running via docker-compose
+
+ ```bash
+ docker-compose up --build
+ ```
+
+ ## Important changes
+
+ ### 1. Dockerfile
+ - Added `git-lfs` for working with large files
+ - Added the `DOCKER_CONTAINER=true` environment variable
+ - Added a model pre-download step
+ - The model is downloaded during the image build
+
+ ### 2. app.py
+ - Added a check for the Docker environment
+ - If the model is not found inside the Docker container, an error is raised
+ - The model-loading logic is optimized for preloaded models
+
+ ## Image size
+
+ The image is larger because it includes the model, but this is offset by:
+ - Fast container startup
+ - No network dependencies
+ - Docker layer caching
+
+ ## Model configuration
+
+ To change the model, edit `config.py`:
+
+ ```python
+ MODEL_REPO: str = "your-repo/your-model"
+ MODEL_FILENAME: str = "your-model.gguf"
+ ```
+
+ Then rebuild the image.
+
+ ## Debugging
+
+ To check that the model is present in the container:
+
+ ```bash
+ docker run -it llm-structured-output ls -la /app/models/
+ ```
+
+ To inspect the build logs:
+
+ ```bash
+ docker build --no-cache -t llm-structured-output .
+ ```
Dockerfile ADDED
@@ -0,0 +1,70 @@
+ # Use Python 3.10 base image optimized for HuggingFace Spaces
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies required for llama-cpp-python and git-lfs
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     wget \
+     curl \
+     git \
+     git-lfs \
+     pkg-config \
+     libopenblas-dev \
+     libssl-dev \
+     musl-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Initialize git-lfs
+ RUN git lfs install
+
+ # Set environment variables for optimal Docker performance
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PIP_NO_CACHE_DIR=1
+ ENV CMAKE_ARGS="-DLLAMA_OPENBLAS=on"
+ ENV FORCE_CMAKE=1
+ ENV DOCKER_CONTAINER=true
+
+ # Create models directory
+ RUN mkdir -p /app/models
+
+ # Create symbolic link for musl libc compatibility (required for llama-cpp-python)
+ RUN ln -sf /usr/lib/x86_64-linux-musl/libc.so /lib/libc.musl-x86_64.so.1 || \
+     ln -sf /usr/lib/x86_64-linux-gnu/libc.so.6 /lib/libc.musl-x86_64.so.1
+
+ # Copy requirements first for better Docker layer caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy configuration to get model info
+ COPY config.py .
+
+ # Pre-download the model during build
+ RUN python -c "import os; from huggingface_hub import hf_hub_download; from config import Config; os.makedirs('/app/models', exist_ok=True); print(f'Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...'); p=hf_hub_download(repo_id=Config.MODEL_REPO, filename=Config.MODEL_FILENAME, local_dir='/app/models', token=os.getenv('HUGGINGFACE_TOKEN') or None); print(f'Model downloaded to: {p}'); import os; s=os.path.getsize(p) if os.path.exists(p) else (_ for _ in ()).throw(FileNotFoundError(f'Model file not found: {p}')); print(f'Model file size: {s/(1024**3):.2f} GB'); (s>1024*1024) or (_ for _ in ()).throw(ValueError(f'Downloaded model file seems too small: {s} bytes')); print('Model download verification successful')"
+
+ # Verify model file exists after build
+ RUN ls -la /app/models/ && \
+     [ -f "/app/models/gemma-3n-E4B-it-Q8_0.gguf" ] || (echo "Model file not found!" && exit 1)
+
+ # Copy application files
+ COPY . .
+
+ # Make entrypoint script executable
+ RUN chmod +x entrypoint.sh
+
+ # Create a non-root user for security
+ RUN useradd -m -u 1000 user && chown -R user:user /app
+ USER user
+
+ # Expose the port that Gradio will run on
+ EXPOSE 7860
+
+ # Set entrypoint and default command
+ ENTRYPOINT ["./entrypoint.sh"]
+ CMD ["python", "main.py", "--mode", "gradio"]
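Note on the pre-download step: for readability, the inline `python -c` command above is roughly equivalent to the standalone sketch below. The file name `download_model.py` is hypothetical and not part of this commit; the sketch assumes the same `Config` fields defined in `config.py` later in this diff.

```python
# download_model.py - sketch of the build-time pre-download step (not part of this commit)
import os

from huggingface_hub import hf_hub_download

from config import Config


def download_and_verify(models_dir: str = "/app/models") -> str:
    """Download the GGUF model and sanity-check the resulting file."""
    os.makedirs(models_dir, exist_ok=True)
    print(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
    path = hf_hub_download(
        repo_id=Config.MODEL_REPO,
        filename=Config.MODEL_FILENAME,
        local_dir=models_dir,
        token=os.getenv("HUGGINGFACE_TOKEN") or None,
    )
    if not os.path.exists(path):
        raise FileNotFoundError(f"Model file not found: {path}")
    size = os.path.getsize(path)
    if size <= 1024 * 1024:  # anything under ~1 MB is almost certainly not a real GGUF file
        raise ValueError(f"Downloaded model file seems too small: {size} bytes")
    print(f"Model downloaded to {path} ({size / (1024 ** 3):.2f} GB)")
    return path


if __name__ == "__main__":
    download_and_verify()
```

Keeping the logic inline in the Dockerfile presumably avoids adding an extra file to the build context; the behavior is the same either way.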
README.md CHANGED
@@ -1,11 +1,155 @@
  ---
- title: Free Llm Structure Output Docker
- emoji: 💻
- colorFrom: gray
- colorTo: gray
+ title: LLM Structured Output Docker
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ app_port: 7860
  pinned: false
- license: gemma
+ license: mit
+ short_description: Get structured JSON responses from LLM using Docker
+ tags:
+ - llama-cpp
+ - gguf
+ - json-schema
+ - structured-output
+ - llm
+ - docker
+ - gradio
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🤖 LLM Structured Output (Docker Version)
+
+ Dockerized application for getting structured responses from local GGUF language models in specified JSON format.
+
+ ## ✨ Key Features
+
+ - **Docker containerized** for easy deployment on HuggingFace Spaces
+ - **Local GGUF model support** via llama-cpp-python
+ - **Optimized for containers** with configurable resources
+ - **JSON schema support** for structured output
+ - **Gradio web interface** for convenient interaction
+ - **REST API** for integration with other applications
+ - **Memory efficient** with GGUF quantized models
+
+ ## 🚀 Deployment on HuggingFace Spaces
+
+ This version is specifically designed for HuggingFace Spaces with Docker SDK:
+
+ 1. Clone this repository
+ 2. Push to HuggingFace Spaces with `sdk: docker` in README.md
+ 3. The application will automatically build and deploy
+
+ ## 🐳 Local Docker Usage
+
+ ### Build the image:
+ ```bash
+ docker build -t llm-structured-output .
+ ```
+
+ ### Run the container:
+ ```bash
+ docker run -p 7860:7860 -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" llm-structured-output
+ ```
+
+ ### With custom configuration:
+ ```bash
+ docker run -p 7860:7860 \
+   -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" \
+   -e MODEL_FILENAME="gemma-3n-E4B-it-Q8_0.gguf" \
+   -e N_CTX="4096" \
+   -e MAX_NEW_TOKENS="512" \
+   llm-structured-output
+ ```
+
+ ## 🌐 Application Access
+
+ - **Web interface**: http://localhost:7860
+ - **API**: Available through the same port
+ - **Health check**: http://localhost:7860/health (when API mode is enabled)
+
+ ## 📝 Environment Variables
+
+ Configure the application using environment variables:
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `MODEL_REPO` | `lmstudio-community/gemma-3n-E4B-it-text-GGUF` | HuggingFace model repository |
+ | `MODEL_FILENAME` | `gemma-3n-E4B-it-Q8_0.gguf` | Model file name |
+ | `N_CTX` | `4096` | Context window size |
+ | `N_GPU_LAYERS` | `0` | GPU layers (0 for CPU-only) |
+ | `N_THREADS` | `4` | CPU threads |
+ | `MAX_NEW_TOKENS` | `256` | Maximum response length |
+ | `TEMPERATURE` | `0.1` | Generation temperature |
+ | `HUGGINGFACE_TOKEN` | (empty) | HF token for private models |
+
+ ## 📋 Usage Examples
+
+ ### Example JSON Schema:
+ ```json
+ {
+   "type": "object",
+   "properties": {
+     "summary": {"type": "string"},
+     "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
+     "confidence": {"type": "number", "minimum": 0, "maximum": 1}
+   },
+   "required": ["summary", "sentiment"]
+ }
+ ```
+
+ ### Example Prompt:
+ ```
+ Analyze this review: "The product exceeded my expectations! Great quality and fast delivery."
+ ```
+
+ ## 🔧 Docker Optimizations
+
+ This Docker version includes several optimizations:
+
+ - **Reduced memory usage** with smaller context window and batch sizes
+ - **CPU-optimized** configuration by default
+ - **Efficient layer caching** for faster builds
+ - **Security**: Runs as non-root user
+ - **Multi-stage build** capabilities for production
+
+ ## 🏗️ Architecture
+
+ - **Base Image**: Python 3.10 slim
+ - **ML Backend**: llama-cpp-python with OpenBLAS
+ - **Web Interface**: Gradio 4.x
+ - **API**: FastAPI with automatic documentation
+ - **Model Storage**: Downloaded on first run to `/app/models/`
+
+ ## 💡 Performance Tips
+
+ 1. **Memory**: Start with smaller models (7B or less)
+ 2. **CPU**: Adjust `N_THREADS` based on available cores
+ 3. **Context**: Reduce `N_CTX` if experiencing memory issues
+ 4. **Batch size**: Lower `N_BATCH` for memory-constrained environments
+
+ ## 🔍 Troubleshooting
+
+ ### Container fails to start:
+ - Check available memory (minimum 4GB recommended)
+ - Verify model repository accessibility
+ - Ensure proper environment variable formatting
+
+ ### Model download issues:
+ - Check internet connectivity in container
+ - Verify `HUGGINGFACE_TOKEN` for private models
+ - Ensure sufficient disk space
+
+ ### Performance issues:
+ - Reduce `N_CTX` and `MAX_NEW_TOKENS`
+ - Adjust `N_THREADS` to match CPU cores
+ - Consider using smaller/quantized models
+
+ ## 📄 License
+
+ MIT License - see LICENSE file for details.
+
+ ---
+
+ For more information about HuggingFace Spaces Docker configuration, see: https://huggingface.co/docs/hub/spaces-config-reference
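As a concrete illustration of the REST API mentioned in the README, a minimal client could look like the sketch below. It assumes the container was started in `api` or `both` mode (see `main.py`) with port 8000 published, and uses the `/generate` request and response shapes defined in `api.py` further down this diff; the host/port are assumptions, not something this commit guarantees.

```python
# Sketch: calling the /generate endpoint with a prompt and a JSON schema.
import requests

payload = {
    "prompt": 'Analyze this review: "The product exceeded my expectations!"',
    "json_schema": {
        "type": "object",
        "properties": {
            "summary": {"type": "string"},
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
            "confidence": {"type": "number", "minimum": 0, "maximum": 1},
        },
        "required": ["summary", "sentiment"],
    },
}

response = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
response.raise_for_status()
result = response.json()

if result["success"]:
    print(result["data"])  # parsed JSON matching the schema
else:
    print(result["error"], result.get("raw_response"))
```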
api.py ADDED
@@ -0,0 +1,213 @@
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any
+ import json
+ import base64
+ from PIL import Image
+ from io import BytesIO
+ import uvicorn
+ from app import llm_client
+
+ # Create FastAPI application
+ api_app = FastAPI(
+     title="LLM Structured Output API",
+     description="API for generating structured responses from local GGUF models via llama-cpp-python",
+     version="1.0.0"
+ )
+
+ # Setup CORS
+ api_app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Data models for API
+ class StructuredOutputRequest(BaseModel):
+     prompt: str
+     json_schema: Dict[str, Any]
+     image_base64: Optional[str] = None
+
+ class StructuredOutputResponse(BaseModel):
+     success: bool
+     data: Optional[Dict[str, Any]] = None
+     error: Optional[str] = None
+     raw_response: Optional[str] = None
+
+ def decode_base64_image(base64_string: str) -> Image.Image:
+     """Decode base64 string to PIL Image"""
+     try:
+         image_data = base64.b64decode(base64_string)
+         image = Image.open(BytesIO(image_data))
+         return image
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=f"Image decoding error: {str(e)}")
+
+ @api_app.post("/generate", response_model=StructuredOutputResponse)
+ async def generate_structured_output(request: StructuredOutputRequest):
+     """
+     Main endpoint for generating structured response
+
+     Args:
+         request: Request containing prompt, JSON schema and optionally base64 image
+
+     Returns:
+         StructuredOutputResponse: Structured response or error
+     """
+     # Check model initialization
+     if llm_client is None:
+         raise HTTPException(
+             status_code=503,
+             detail="LLM model not initialized. Check server configuration."
+         )
+
+     try:
+         # Validate input data
+         if not request.prompt.strip():
+             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+         if not request.json_schema:
+             raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+         # Decode image if provided
+         image = None
+         if request.image_base64:
+             image = decode_base64_image(request.image_base64)
+
+         # Generate response
+         result = llm_client.generate_structured_response(
+             prompt=request.prompt,
+             json_schema=request.json_schema,
+             image=image
+         )
+
+         # Format response
+         if "error" in result:
+             return StructuredOutputResponse(
+                 success=False,
+                 error=result["error"],
+                 raw_response=result.get("raw_response")
+             )
+         else:
+             return StructuredOutputResponse(
+                 success=True,
+                 data=result.get("data"),
+                 raw_response=result.get("raw_response")
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ @api_app.post("/generate_with_file", response_model=StructuredOutputResponse)
+ async def generate_with_file(
+     prompt: str = Form(...),
+     json_schema: str = Form(...),
+     image: Optional[UploadFile] = File(None)
+ ):
+     """
+     Alternative endpoint for uploading image as file
+
+     Args:
+         prompt: Text prompt
+         json_schema: JSON schema as string
+         image: Uploaded image file
+
+     Returns:
+         StructuredOutputResponse: Structured response or error
+     """
+     # Check model initialization
+     if llm_client is None:
+         raise HTTPException(
+             status_code=503,
+             detail="LLM model not initialized. Check server configuration."
+         )
+
+     try:
+         # Validate input data
+         if not prompt.strip():
+             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+         if not json_schema.strip():
+             raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+         # Parse JSON schema
+         try:
+             parsed_schema = json.loads(json_schema)
+         except json.JSONDecodeError as e:
+             raise HTTPException(status_code=400, detail=f"Invalid JSON schema: {str(e)}")
+
+         # Process image if provided
+         pil_image = None
+         if image:
+             # Check file type
+             if not image.content_type.startswith('image/'):
+                 raise HTTPException(status_code=400, detail="Uploaded file must be an image")
+
+             # Read and convert image
+             image_data = await image.read()
+             pil_image = Image.open(BytesIO(image_data))
+
+         # Generate response
+         result = llm_client.generate_structured_response(
+             prompt=prompt,
+             json_schema=parsed_schema,
+             image=pil_image
+         )
+
+         # Format response
+         if "error" in result:
+             return StructuredOutputResponse(
+                 success=False,
+                 error=result["error"],
+                 raw_response=result.get("raw_response")
+             )
+         else:
+             return StructuredOutputResponse(
+                 success=True,
+                 data=result.get("data"),
+                 raw_response=result.get("raw_response")
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ @api_app.get("/health")
+ async def health_check():
+     """API health check"""
+     model_status = "loaded" if llm_client is not None else "not_loaded"
+     return {
+         "status": "healthy" if llm_client is not None else "degraded",
+         "model_status": model_status,
+         "message": "API is working correctly" if llm_client is not None else "API is working, but model is not loaded"
+     }
+
+ @api_app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "LLM Structured Output API",
+         "version": "1.0.0",
+         "model_loaded": llm_client is not None,
+         "endpoints": {
+             "/generate": "POST - main endpoint for generating structured response",
+             "/generate_with_file": "POST - endpoint with image file upload",
+             "/health": "GET - health check",
+             "/docs": "GET - automatic Swagger documentation"
+         }
+     }
+
+ if __name__ == "__main__":
+     from config import Config
+     uvicorn.run(
+         "api:api_app",
+         host=Config.HOST,
+         port=Config.API_PORT,
+         reload=True
+     )
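The `/generate` endpoint above accepts an optional image as a base64 string. A hedged sketch of preparing that field on the client side follows; the file name and URL are placeholders, not part of this commit.

```python
# Sketch: encoding a local image for the image_base64 field of /generate.
# "photo.png" and the localhost URL are illustrative placeholders.
import base64
import requests

with open("photo.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

payload = {
    "prompt": "Describe this image",
    "json_schema": {"type": "object", "properties": {"description": {"type": "string"}}},
    "image_base64": image_b64,
}
print(requests.post("http://localhost:8000/generate", json=payload, timeout=120).json())
```

Note that `app.py` below currently logs a warning and ignores the image for this text-only GGUF model, so the field is effectively a placeholder for future multimodal support.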
app.py ADDED
@@ -0,0 +1,441 @@
+ import json
+ import os
+ import gradio as gr
+ from typing import Optional, Dict, Any, Union
+ from PIL import Image
+ from pydantic import BaseModel
+ import logging
+ from config import Config
+
+ # Try to import llama_cpp with fallback
+ try:
+     from llama_cpp import Llama
+     LLAMA_CPP_AVAILABLE = True
+ except ImportError as e:
+     print(f"Warning: llama-cpp-python not available: {e}")
+     LLAMA_CPP_AVAILABLE = False
+     Llama = None
+
+ # Try to import huggingface_hub
+ try:
+     from huggingface_hub import hf_hub_download
+     HUGGINGFACE_HUB_AVAILABLE = True
+ except ImportError as e:
+     print(f"Warning: huggingface_hub not available: {e}")
+     HUGGINGFACE_HUB_AVAILABLE = False
+     hf_hub_download = None
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class StructuredOutputRequest(BaseModel):
+     prompt: str
+     image: Optional[str] = None  # base64 encoded image
+     json_schema: Dict[str, Any]
+
+ class LLMClient:
+     def __init__(self):
+         """
+         Initialize client for working with local GGUF model via llama-cpp-python
+         """
+         self.model_path = Config.get_model_path()
+         logger.info(f"Using model: {self.model_path}")
+
+         self.llm = None
+
+         self._initialize_model()
+
+     def _download_model_if_needed(self) -> str:
+         """Download model from Hugging Face if it doesn't exist locally"""
+         if os.path.exists(self.model_path):
+             logger.info(f"Model already exists at: {self.model_path}")
+             return self.model_path
+
+         # If model doesn't exist and we're in production (Docker),
+         # it means the build process failed or model is in wrong location
+         if os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true':
+             # Let's check common locations where model might be
+             alternative_paths = [
+                 f"/app/models/{Config.MODEL_FILENAME}",
+                 f"./models/{Config.MODEL_FILENAME}",
+                 f"/models/{Config.MODEL_FILENAME}",
+                 f"/app/{Config.MODEL_FILENAME}"
+             ]
+
+             for alt_path in alternative_paths:
+                 if os.path.exists(alt_path):
+                     logger.info(f"Found model at alternative location: {alt_path}")
+                     return alt_path
+
+             # List what's actually in the models directory
+             models_dir = "/app/models"
+             if os.path.exists(models_dir):
+                 files = os.listdir(models_dir)
+                 logger.error(f"Contents of {models_dir}: {files}")
+             else:
+                 logger.error(f"Directory {models_dir} does not exist")
+
+             # Try to download as fallback
+             logger.warning("Model not found in expected locations, attempting download...")
+
+         if not HUGGINGFACE_HUB_AVAILABLE:
+             raise ImportError("huggingface_hub is not available. Please install it to download models.")
+
+         logger.info(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
+
+         # Create models directory if it doesn't exist
+         models_dir = Config.get_models_dir()
+         os.makedirs(models_dir, exist_ok=True)
+
+         try:
+             # Download model
+             model_path = hf_hub_download(
+                 repo_id=Config.MODEL_REPO,
+                 filename=Config.MODEL_FILENAME,
+                 local_dir=models_dir,
+                 token=Config.HUGGINGFACE_TOKEN if Config.HUGGINGFACE_TOKEN else None
+             )
+
+             logger.info(f"Model downloaded to: {model_path}")
+             return model_path
+         except Exception as e:
+             logger.error(f"Failed to download model: {e}")
+             raise
+
+     def _initialize_model(self):
+         """Initialize local GGUF model"""
+         try:
+             if not LLAMA_CPP_AVAILABLE:
+                 raise ImportError("llama-cpp-python is not available. Please check installation.")
+
+             logger.info("Loading local model...")
+
+             # Download model if needed
+             model_path = self._download_model_if_needed()
+
+             # Verify model file exists and is readable
+             if not os.path.exists(model_path):
+                 raise FileNotFoundError(f"Model file not found: {model_path}")
+
+             # Check file size to ensure it's not corrupted
+             file_size = os.path.getsize(model_path)
+             if file_size < 1024 * 1024:  # Less than 1MB is suspicious for GGUF model
+                 raise ValueError(f"Model file seems corrupted or incomplete. Size: {file_size} bytes")
+
+             logger.info(f"Model file verified. Size: {file_size / (1024**3):.2f} GB")
+
+             # Initialize Llama model with enhanced error handling
+             logger.info("Initializing Llama model...")
+             self.llm = Llama(
+                 model_path=model_path,
+                 n_ctx=Config.N_CTX,
+                 n_batch=Config.N_BATCH,
+                 n_gpu_layers=Config.N_GPU_LAYERS,
+                 use_mlock=Config.USE_MLOCK,
+                 use_mmap=Config.USE_MMAP,
+                 vocab_only=False,
+                 f16_kv=Config.F16_KV,
+                 logits_all=False,
+                 embedding=False,
+                 n_threads=Config.N_THREADS,
+                 last_n_tokens_size=64,
+                 lora_base=None,
+                 lora_path=None,
+                 seed=Config.SEED,
+                 verbose=True  # Enable verbose for debugging
+             )
+
+             logger.info("Model successfully loaded and initialized")
+
+             # Test model with a simple prompt to verify it's working
+             logger.info("Testing model with simple prompt...")
+             test_response = self.llm("Hello", max_tokens=1, temperature=0.1)
+             logger.info("Model test successful")
+
+         except Exception as e:
+             logger.error(f"Error initializing model: {e}")
+             # Provide more specific error information
+             if "Failed to load model from file" in str(e):
+                 logger.error("This error usually indicates:")
+                 logger.error("1. Model file is corrupted or incomplete")
+                 logger.error("2. llama-cpp-python version is incompatible with the model")
+                 logger.error("3. Insufficient memory to load the model")
+                 logger.error(f"4. Model path: {self.model_path}")
+             raise
+
+     def _validate_json_schema(self, schema: str) -> Dict[str, Any]:
+         """Validate and parse JSON schema"""
+         try:
+             parsed_schema = json.loads(schema)
+             return parsed_schema
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid JSON schema: {e}")
+
+     def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
+         """
+         Format prompt for structured output generation
+         """
+         schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
+
+         formatted_prompt = f"""User: {prompt}
+
+ Please respond in strict accordance with the following JSON schema:
+
+ ```json
+ {schema_str}
+ ```
+
+ Return ONLY valid JSON without additional comments or explanations."""
+
+         return formatted_prompt
+
+     def generate_structured_response(self,
+                                      prompt: str,
+                                      json_schema: Union[str, Dict[str, Any]],
+                                      image: Optional[Image.Image] = None) -> Dict[str, Any]:
+         """
+         Generate structured response from local GGUF model
+         """
+         try:
+             # Validate and parse JSON schema
+             if isinstance(json_schema, str):
+                 parsed_schema = self._validate_json_schema(json_schema)
+             else:
+                 parsed_schema = json_schema
+
+             # Format prompt
+             formatted_prompt = self._format_prompt_with_schema(prompt, parsed_schema)
+
+             # Warning about images (not supported in this implementation)
+             if image is not None:
+                 logger.warning("Image processing is not supported with this local model")
+
+             # Generate response
+             logger.info("Generating response...")
+
+             response = self.llm(
+                 formatted_prompt,
+                 max_tokens=Config.MAX_NEW_TOKENS,
+                 temperature=Config.TEMPERATURE,
+                 stop=["User:", "\n\n"],
+                 echo=False
+             )
+
+             # Extract generated text
+             generated_text = response['choices'][0]['text']
+
+             # Attempt to parse JSON response
+             try:
+                 # Find JSON in response
+                 json_start = generated_text.find('{')
+                 json_end = generated_text.rfind('}') + 1
+
+                 if json_start != -1 and json_end > json_start:
+                     json_str = generated_text[json_start:json_end]
+                     parsed_response = json.loads(json_str)
+                     return {
+                         "success": True,
+                         "data": parsed_response,
+                         "raw_response": generated_text
+                     }
+                 else:
+                     return {
+                         "error": "Could not find JSON in model response",
+                         "raw_response": generated_text
+                     }
+
+             except json.JSONDecodeError as e:
+                 return {
+                     "error": f"JSON parsing error: {e}",
+                     "raw_response": generated_text
+                 }
+
+         except Exception as e:
+             logger.error(f"Unexpected error: {e}")
+             return {
+                 "error": f"Generation error: {str(e)}"
+             }
+
+ # Initialize client
+ logger.info("Initializing LLM client...")
+ try:
+     llm_client = LLMClient()
+     logger.info("LLM client successfully initialized")
+ except Exception as e:
+     logger.error(f"Error initializing LLM client: {e}")
+     llm_client = None
+
+ def process_request(prompt: str,
+                     json_schema: str,
+                     image: Optional[Image.Image] = None) -> str:
+     """
+     Process request through Gradio interface
+     """
+     if llm_client is None:
+         return json.dumps({
+             "error": "LLM client not initialized",
+             "details": "Check logs for detailed error information"
+         }, ensure_ascii=False, indent=2)
+
+     if not prompt.strip():
+         return json.dumps({"error": "Prompt cannot be empty"}, ensure_ascii=False, indent=2)
+
+     if not json_schema.strip():
+         return json.dumps({"error": "JSON schema cannot be empty"}, ensure_ascii=False, indent=2)
+
+     result = llm_client.generate_structured_response(prompt, json_schema, image)
+     return json.dumps(result, ensure_ascii=False, indent=2)
+
+ # Examples for demonstration
+ example_schema = """{
+     "type": "object",
+     "properties": {
+         "summary": {
+             "type": "string",
+             "description": "Brief summary of the response"
+         },
+         "sentiment": {
+             "type": "string",
+             "enum": ["positive", "negative", "neutral"],
+             "description": "Emotional tone"
+         },
+         "confidence": {
+             "type": "number",
+             "minimum": 0,
+             "maximum": 1,
+             "description": "Confidence level in the response"
+         },
+         "keywords": {
+             "type": "array",
+             "items": {
+                 "type": "string"
+             },
+             "description": "Key words"
+         }
+     },
+     "required": ["summary", "sentiment", "confidence"]
+ }"""
+
+ example_prompt = "Analyze the following text and provide a structured assessment: 'The company's new product received enthusiastic user reviews. Sales exceeded all expectations by 150%.'"
+
+ def create_gradio_interface():
+     """Create Gradio interface"""
+
+     with gr.Blocks(title="LLM Structured Output", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🤖 LLM with Structured Output")
+         gr.Markdown(f"Application for generating structured responses using model **{Config.MODEL_REPO}/{Config.MODEL_FILENAME}**")
+
+         # Show model status
+         if llm_client is None:
+             gr.Markdown("⚠️ **Warning**: Model not loaded. Check configuration and restart the application.")
+         else:
+             gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
+
+         with gr.Row():
+             with gr.Column():
+                 prompt_input = gr.Textbox(
+                     label="Prompt for model",
+                     placeholder="Enter your request...",
+                     lines=5,
+                     value=example_prompt
+                 )
+
+                 image_input = gr.Image(
+                     label="Image (optional, for multimodal models)",
+                     type="pil"
+                 )
+
+                 schema_input = gr.Textbox(
+                     label="JSON schema for response structure",
+                     placeholder="Enter JSON schema...",
+                     lines=15,
+                     value=example_schema
+                 )
+
+                 submit_btn = gr.Button("Generate Response", variant="primary")
+
+             with gr.Column():
+                 output = gr.Textbox(
+                     label="Structured Response",
+                     lines=20,
+                     interactive=False
+                 )
+
+         submit_btn.click(
+             fn=process_request,
+             inputs=[prompt_input, schema_input, image_input],
+             outputs=output
+         )
+
+         # Examples
+         gr.Markdown("## 📋 Usage Examples")
+
+         examples = gr.Examples(
+             examples=[
+                 [
+                     "Describe today's weather in New York",
+                     """{
+     "type": "object",
+     "properties": {
+         "temperature": {"type": "number"},
+         "description": {"type": "string"},
+         "humidity": {"type": "number"}
+     }
+ }""",
+                     None
+                 ],
+                 [
+                     "Create a Python learning plan for one month",
+                     """{
+     "type": "object",
+     "properties": {
+         "weeks": {
+             "type": "array",
+             "items": {
+                 "type": "object",
+                 "properties": {
+                     "week_number": {"type": "integer"},
+                     "topics": {"type": "array", "items": {"type": "string"}},
+                     "practice_hours": {"type": "number"}
+                 }
+             }
+         },
+         "total_hours": {"type": "number"}
+     }
+ }""",
+                     None
+                 ]
+             ],
+             inputs=[prompt_input, schema_input, image_input]
+         )
+
+         # Model information
+         gr.Markdown(f"""
+ ## ℹ️ Model Information
+
+ - **Model**: {Config.MODEL_REPO}/{Config.MODEL_FILENAME}
+ - **Local path**: {Config.MODEL_PATH}
+ - **Context window**: {Config.N_CTX} tokens
+ - **Batch size**: {Config.N_BATCH}
+ - **GPU layers**: {Config.N_GPU_LAYERS if Config.N_GPU_LAYERS >= 0 else "All"}
+ - **CPU threads**: {Config.N_THREADS}
+ - **Maximum response length**: {Config.MAX_NEW_TOKENS} tokens
+ - **Temperature**: {Config.TEMPERATURE}
+ - **Memory lock**: {"Enabled" if Config.USE_MLOCK else "Disabled"}
+ - **Memory mapping**: {"Enabled" if Config.USE_MMAP else "Disabled"}
+
+ 💡 **Tip**: Use clear and specific JSON schemas for better results.
+ """)
+
+     return demo
+
+ if __name__ == "__main__":
+     # Create and launch Gradio interface
+     demo = create_gradio_interface()
+     demo.launch(
+         server_name=Config.HOST,
+         server_port=Config.GRADIO_PORT,
+         share=False,
+         debug=True
+     )
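Because `api.py` imports the module-level `llm_client` defined above, the same object can also be used directly from Python without going through HTTP or Gradio. A minimal sketch, assuming the GGUF model file is available so the client initialized successfully:

```python
# Sketch: calling the client from app.py directly, bypassing the web layers.
from app import llm_client

schema = {
    "type": "object",
    "properties": {
        "summary": {"type": "string"},
        "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
    },
    "required": ["summary", "sentiment"],
}

if llm_client is not None:
    result = llm_client.generate_structured_response(
        prompt="The release went smoothly and users are happy.",
        json_schema=schema,  # accepts a dict or a JSON string
    )
    print(result)  # {"success": True, "data": {...}, "raw_response": "..."} or {"error": ...}
else:
    print("Model failed to load; see the startup logs.")
```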
config.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ from typing import Optional
+
+ class Config:
+     """Application configuration for working with local GGUF models"""
+
+     # Model settings - using Hugging Face downloaded model
+     MODEL_REPO: str = os.getenv("MODEL_REPO", "lmstudio-community/gemma-3n-E4B-it-text-GGUF")
+     MODEL_FILENAME: str = os.getenv("MODEL_FILENAME", "gemma-3n-E4B-it-Q8_0.gguf")
+     MODEL_PATH: str = os.getenv("MODEL_PATH", "/app/models/gemma-3n-E4B-it-Q8_0.gguf")
+     HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
+
+     # Model loading settings - optimized for Docker container
+     N_CTX: int = int(os.getenv("N_CTX", "4096"))  # Reduced context window for Docker
+     N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0"))  # CPU-only for Docker by default
+     N_THREADS: int = int(os.getenv("N_THREADS", "4"))  # Conservative thread count
+     N_BATCH: int = int(os.getenv("N_BATCH", "512"))  # Smaller batch size for Docker
+     USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true"  # Disabled for Docker
+     USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true"  # Keep memory mapping
+     F16_KV: bool = os.getenv("F16_KV", "true").lower() == "true"  # Use 16-bit keys and values
+     SEED: int = int(os.getenv("SEED", "42"))  # Random seed for reproducibility
+
+     # Server settings - Docker compatible
+     HOST: str = os.getenv("HOST", "0.0.0.0")
+     GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860"))  # Standard HuggingFace Spaces port
+     API_PORT: int = int(os.getenv("API_PORT", "8000"))
+
+     # Generation settings - optimized for Docker
+     MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256"))  # Reduced for faster response
+     TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.1"))
+
+     # File upload settings
+     MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # 10MB
+     ALLOWED_IMAGE_EXTENSIONS: set = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
+
+     @classmethod
+     def is_model_available(cls) -> bool:
+         """Check if local model file exists"""
+         return os.path.exists(cls.MODEL_PATH)
+
+     @classmethod
+     def get_model_path(cls) -> str:
+         """Get absolute path to model file"""
+         return os.path.abspath(cls.MODEL_PATH)
+
+     @classmethod
+     def get_models_dir(cls) -> str:
+         """Get models directory path"""
+         return os.path.dirname(cls.MODEL_PATH)
+
+     @classmethod
+     def load_from_env_file(cls, env_file: str = ".env") -> None:
+         """Load configuration from .env file"""
+         if os.path.exists(env_file):
+             with open(env_file, 'r') as f:
+                 for line in f:
+                     line = line.strip()
+                     if line and not line.startswith('#') and '=' in line:
+                         key, value = line.split('=', 1)
+                         os.environ[key.strip()] = value.strip()
+
+ # Automatically load from .env file on import
+ Config.load_from_env_file()
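Since the `Config` class attributes read environment variables at import time, overrides must be in place before the module is imported. A small sketch; the specific values are illustrative, not recommendations from this commit:

```python
# Sketch: overriding configuration before config.py is imported.
import os

os.environ["N_CTX"] = "2048"          # shrink the context window
os.environ["MAX_NEW_TOKENS"] = "128"  # shorter completions

from config import Config  # environment variables are read here, at import time

print(Config.N_CTX, Config.MAX_NEW_TOKENS, Config.MODEL_PATH)
```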
docker-compose.yml ADDED
@@ -0,0 +1,30 @@
+ version: '3.8'
+
+ services:
+   llm-app:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     ports:
+       - "7860:7860"
+     environment:
+       - MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+       - MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+       - N_CTX=4096
+       - N_GPU_LAYERS=0
+       - N_THREADS=4
+       - MAX_NEW_TOKENS=256
+       - TEMPERATURE=0.1
+     volumes:
+       # Optional: Mount models directory to persist downloaded models
+       - ./models:/app/models
+     restart: unless-stopped
+     mem_limit: 8g
+     # Uncomment below for GPU support
+     # deploy:
+     #   resources:
+     #     reservations:
+     #       devices:
+     #         - driver: nvidia
+     #           count: 1
+     #           capabilities: [gpu]
entrypoint.sh ADDED
@@ -0,0 +1,53 @@
+ #!/bin/bash
+
+ # Entrypoint script for LLM Structured Output Docker container
+
+ set -e
+
+ # Print environment info
+ echo "🐳 Starting LLM Structured Output Docker container"
+ echo "Python version: $(python --version)"
+ echo "Working directory: $(pwd)"
+ echo "User: $(whoami)"
+
+ # Create models directory if it doesn't exist
+ mkdir -p /app/models
+
+ # Check if musl libc symbolic link exists (required for llama-cpp-python)
+ if [ ! -e "/lib/libc.musl-x86_64.so.1" ]; then
+     echo "⚠️ Warning: musl libc symbolic link not found. Checking for available libc libraries..."
+     ls -la /usr/lib/x86_64-linux-* 2>/dev/null || echo "No musl libraries found"
+     ls -la /usr/lib/x86_64-linux-gnu/libc.so* 2>/dev/null || echo "No glibc libraries found"
+ fi
+
+ # Check available memory
+ echo "📊 System information:"
+ echo "Memory: $(cat /proc/meminfo | grep MemTotal)"
+ echo "CPU cores: $(nproc)"
+ echo "Disk space: $(df -h /app)"
+
+ # Set default values for key environment variables if not provided
+ export MODEL_REPO=${MODEL_REPO:-"lmstudio-community/gemma-3n-E4B-it-text-GGUF"}
+ export MODEL_FILENAME=${MODEL_FILENAME:-"gemma-3n-E4B-it-Q8_0.gguf"}
+ export N_CTX=${N_CTX:-"4096"}
+ export N_GPU_LAYERS=${N_GPU_LAYERS:-"0"}
+ export N_THREADS=${N_THREADS:-"4"}
+ export MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-"256"}
+
+ echo "🔧 Configuration:"
+ echo "Model: $MODEL_REPO/$MODEL_FILENAME"
+ echo "Context size: $N_CTX"
+ echo "GPU layers: $N_GPU_LAYERS"
+ echo "CPU threads: $N_THREADS"
+ echo "Max tokens: $MAX_NEW_TOKENS"
+
+ # Check if running in HuggingFace Spaces
+ if [ "$SPACE_ID" ]; then
+     echo "🤗 Running in HuggingFace Spaces: $SPACE_ID"
+     export HOST=0.0.0.0
+     export GRADIO_PORT=7860
+ fi
+
+ # Execute the main command
+ echo "🚀 Starting application..."
+ exec "$@"
main.py ADDED
@@ -0,0 +1,76 @@
+ #!/usr/bin/env python3
+ """
+ Main file for launching LLM Structured Output application in Docker
+ """
+
+ import argparse
+ import threading
+ import time
+ from config import Config
+
+ def run_gradio():
+     """Launch Gradio interface"""
+     from app import create_gradio_interface
+
+     print(f"🎨 Starting Gradio interface at http://{Config.HOST}:{Config.GRADIO_PORT}")
+     demo = create_gradio_interface()
+     demo.launch(
+         server_name=Config.HOST,
+         server_port=Config.GRADIO_PORT,
+         share=False,
+         debug=False  # Disabled debug for production
+     )
+
+ def run_api():
+     """Launch FastAPI server"""
+     import uvicorn
+     from api import api_app
+
+     print(f"🔌 Starting API at http://{Config.HOST}:{Config.API_PORT}")
+     uvicorn.run(
+         api_app,
+         host=Config.HOST,
+         port=Config.API_PORT,
+         log_level="info"
+     )
+
+ def run_both():
+     """Launch both services simultaneously"""
+     print("🚀 Starting LLM Structured Output application...")
+     print("=" * 60)
+     print(f"📊 Gradio interface: http://{Config.HOST}:{Config.GRADIO_PORT}")
+     print(f"🔌 API: http://{Config.HOST}:{Config.API_PORT}")
+     print(f"📖 API documentation: http://{Config.HOST}:{Config.API_PORT}/docs")
+     print("=" * 60)
+
+     # Start API in separate thread
+     api_thread = threading.Thread(target=run_api, daemon=True)
+     api_thread.start()
+
+     # Small delay for API startup
+     time.sleep(2)
+
+     # Start Gradio in main thread
+     run_gradio()
+
+ def main():
+     """Main function with command line arguments"""
+     parser = argparse.ArgumentParser(description="LLM Structured Output application")
+     parser.add_argument(
+         "--mode",
+         choices=["gradio", "api", "both"],
+         default="gradio",  # Default to gradio only for HuggingFace Spaces
+         help="Launch mode: gradio (interface only), api (API only), both (both services)"
+     )
+
+     args = parser.parse_args()
+
+     if args.mode == "gradio":
+         run_gradio()
+     elif args.mode == "api":
+         run_api()
+     else:
+         run_both()
+
+ if __name__ == "__main__":
+     main()
packages.txt ADDED
@@ -0,0 +1,12 @@
+ # System packages required for Docker build
+ # These are installed in the Dockerfile, not needed for HF Spaces with Docker SDK
+ # but keeping for reference
+
+ # build-essential
+ # cmake
+ # wget
+ # curl
+ # git
+ # pkg-config
+ # libopenblas-dev
+ # libssl-dev
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ huggingface_hub==0.25.2
+ # Core ML dependencies - updated for compatibility with gemma-3n-E4B model
+ llama-cpp-python>=0.3.4
+
+ # Web interface
+ gradio==4.44.1
+ fastapi>=0.100.0,<0.115.0
+ uvicorn[standard]>=0.20.0,<0.31.0
+
+ # Data processing
+ pillow>=9.0.0,<11.0.0
+ pydantic==2.10.6
+ numpy>=1.24.0,<2.0.0
+
+ # HTTP requests
+ requests>=2.28.0
+
+ # Additional dependencies for Docker environment
+ python-multipart
runtime.txt ADDED
@@ -0,0 +1 @@
+ 3.10
test.ipynb ADDED
@@ -0,0 +1,23 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c364ff11",
+    "metadata": {
+     "vscode": {
+      "languageId": "plaintext"
+     }
+    },
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }