lyangas committed
Commit b269c5d · 1 Parent(s): 99a5e1e
Files changed (15)
  1. .env.example +27 -0
  2. .gitignore +67 -0
  3. BUILD_INSTRUCTIONS.md +89 -0
  4. Dockerfile +70 -0
  5. README.md +150 -6
  6. api.py +213 -0
  7. app.py +441 -0
  8. config.py +63 -0
  9. docker-compose.yml +30 -0
  10. entrypoint.sh +53 -0
  11. main.py +76 -0
  12. packages.txt +12 -0
  13. requirements.txt +19 -0
  14. runtime.txt +1 -0
  15. test.ipynb +23 -0
.env.example ADDED
@@ -0,0 +1,27 @@
+ # Model configuration
+ MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+ MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+ MODEL_PATH=./models/gemma-3n-E4B-it-Q8_0.gguf
+ HUGGINGFACE_TOKEN=
+
+ # Model parameters - optimized for Docker containers
+ N_CTX=4096
+ N_GPU_LAYERS=0
+ N_THREADS=4
+ N_BATCH=512
+ USE_MLOCK=false
+ USE_MMAP=true
+ F16_KV=true
+ SEED=42
+
+ # Server settings
+ HOST=0.0.0.0
+ GRADIO_PORT=7860
+ API_PORT=8000
+
+ # Generation settings
+ MAX_NEW_TOKENS=256
+ TEMPERATURE=0.1
+
+ # File upload settings
+ MAX_FILE_SIZE=10485760
.gitignore ADDED
@@ -0,0 +1,67 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Model files
+ models/*.gguf
+ models/*.bin
+ models/*.pt
+ models/*.safetensors
+
+ # Logs
+ *.log
+ logs/
+
+ # Temporary files
+ tmp/
+ temp/
+
+ # Docker
+ .dockerignore
+
+ # HuggingFace
+ .huggingface/
BUILD_INSTRUCTIONS.md ADDED
@@ -0,0 +1,89 @@
+ # Instructions for building the Docker image with a preloaded model
+
+ ## Overview of changes
+
+ The Dockerfile was modified to pre-download the Hugging Face model during the image build. This provides:
+
+ - ✅ Fast deployment (the model is already in the container)
+ - ✅ Reliability (no network dependency at startup)
+ - ✅ Consistency (a pinned model version)
+
+ ## Building the image
+
+ ### Basic build (for public models):
+
+ ```bash
+ docker build -t llm-structured-output .
+ ```
+
+ ### Build with a Hugging Face token (for private models):
+
+ ```bash
+ docker build --build-arg HUGGINGFACE_TOKEN=your_token_here -t llm-structured-output .
+ ```
+
+ Or via an environment variable:
+
+ ```bash
+ export HUGGINGFACE_TOKEN=your_token_here
+ docker build -t llm-structured-output .
+ ```
+
+ ## Running the container
+
+ ```bash
+ docker run -p 7860:7860 llm-structured-output
+ ```
+
+ The application will be available at: http://localhost:7860
+
+ ## Running via docker-compose
+
+ ```bash
+ docker-compose up --build
+ ```
+
+ ## Important changes
+
+ ### 1. Dockerfile
+ - Added `git-lfs` for working with large files
+ - Added the `DOCKER_CONTAINER=true` environment variable
+ - Added a model pre-download step
+ - The model is downloaded during the image build
+
+ ### 2. app.py
+ - Added a check for the Docker environment
+ - If the model is not found inside the Docker container, an error is raised
+ - The model-loading logic is optimized for preloaded models
+
+ ## Image size
+
+ The image is larger because it includes the model, but this is offset by:
+ - Fast container startup
+ - No network dependencies
+ - Docker layer caching
+
+ ## Model configuration
+
+ To change the model, edit `config.py`:
+
+ ```python
+ MODEL_REPO: str = "your-repo/your-model"
+ MODEL_FILENAME: str = "your-model.gguf"
+ ```
+
+ Then rebuild the image.
+
+ ## Debugging
+
+ To check that the model is present in the container:
+
+ ```bash
+ docker run -it llm-structured-output ls -la /app/models/
+ ```
+
+ To inspect the build logs:
+
+ ```bash
+ docker build --no-cache -t llm-structured-output .
+ ```
Dockerfile ADDED
@@ -0,0 +1,70 @@
+ # Use Python 3.10 base image optimized for HuggingFace Spaces
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies required for llama-cpp-python and git-lfs
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     cmake \
+     wget \
+     curl \
+     git \
+     git-lfs \
+     pkg-config \
+     libopenblas-dev \
+     libssl-dev \
+     musl-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Initialize git-lfs
+ RUN git lfs install
+
+ # Set environment variables for optimal Docker performance
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PIP_NO_CACHE_DIR=1
+ ENV CMAKE_ARGS="-DLLAMA_OPENBLAS=on"
+ ENV FORCE_CMAKE=1
+ ENV DOCKER_CONTAINER=true
+
+ # Create models directory
+ RUN mkdir -p /app/models
+
+ # Create symbolic link for musl libc compatibility (required for llama-cpp-python)
+ RUN ln -sf /usr/lib/x86_64-linux-musl/libc.so /lib/libc.musl-x86_64.so.1 || \
+     ln -sf /usr/lib/x86_64-linux-gnu/libc.so.6 /lib/libc.musl-x86_64.so.1
+
+ # Copy requirements first for better Docker layer caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy configuration to get model info
+ COPY config.py .
+
+ # Pre-download the model during build
+ RUN python -c "import os; from huggingface_hub import hf_hub_download; from config import Config; os.makedirs('/app/models', exist_ok=True); print(f'Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...'); p=hf_hub_download(repo_id=Config.MODEL_REPO, filename=Config.MODEL_FILENAME, local_dir='/app/models', token=os.getenv('HUGGINGFACE_TOKEN') or None); print(f'Model downloaded to: {p}'); import os; s=os.path.getsize(p) if os.path.exists(p) else (_ for _ in ()).throw(FileNotFoundError(f'Model file not found: {p}')); print(f'Model file size: {s/(1024**3):.2f} GB'); (s>1024*1024) or (_ for _ in ()).throw(ValueError(f'Downloaded model file seems too small: {s} bytes')); print('Model download verification successful')"
+
+ # Verify model file exists after build
+ RUN ls -la /app/models/ && \
+     [ -f "/app/models/gemma-3n-E4B-it-Q8_0.gguf" ] || (echo "Model file not found!" && exit 1)
+
+ # Copy application files
+ COPY . .
+
+ # Make entrypoint script executable
+ RUN chmod +x entrypoint.sh
+
+ # Create a non-root user for security
+ RUN useradd -m -u 1000 user && chown -R user:user /app
+ USER user
+
+ # Expose the port that Gradio will run on
+ EXPOSE 7860
+
+ # Set entrypoint and default command
+ ENTRYPOINT ["./entrypoint.sh"]
+ CMD ["python", "main.py", "--mode", "gradio"]
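Note on the pre-download step: for readability, the inline `python -c` command above is roughly equivalent to the standalone sketch below. The file name `download_model.py` is hypothetical and not part of this commit; the sketch assumes the same `Config` fields defined in `config.py` later in this diff.

```python
# download_model.py - sketch of the build-time pre-download step (not part of this commit)
import os

from huggingface_hub import hf_hub_download

from config import Config


def download_and_verify(models_dir: str = "/app/models") -> str:
    """Download the GGUF model and sanity-check the resulting file."""
    os.makedirs(models_dir, exist_ok=True)
    print(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
    path = hf_hub_download(
        repo_id=Config.MODEL_REPO,
        filename=Config.MODEL_FILENAME,
        local_dir=models_dir,
        token=os.getenv("HUGGINGFACE_TOKEN") or None,
    )
    if not os.path.exists(path):
        raise FileNotFoundError(f"Model file not found: {path}")
    size = os.path.getsize(path)
    if size <= 1024 * 1024:  # anything under ~1 MB is almost certainly not a real GGUF file
        raise ValueError(f"Downloaded model file seems too small: {size} bytes")
    print(f"Model downloaded to {path} ({size / (1024 ** 3):.2f} GB)")
    return path


if __name__ == "__main__":
    download_and_verify()
```

Keeping the logic inline in the Dockerfile presumably avoids adding an extra file to the build context; the behavior is the same either way.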
README.md CHANGED
@@ -1,11 +1,155 @@
  ---
- title: Free Llm Structure Output Docker
- emoji: 💻
- colorFrom: gray
- colorTo: gray
+ title: LLM Structured Output Docker
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ app_port: 7860
  pinned: false
- license: gemma
+ license: mit
+ short_description: Get structured JSON responses from LLM using Docker
+ tags:
+ - llama-cpp
+ - gguf
+ - json-schema
+ - structured-output
+ - llm
+ - docker
+ - gradio
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🤖 LLM Structured Output (Docker Version)
+
+ Dockerized application for getting structured responses from local GGUF language models in specified JSON format.
+
+ ## ✨ Key Features
+
+ - **Docker containerized** for easy deployment on HuggingFace Spaces
+ - **Local GGUF model support** via llama-cpp-python
+ - **Optimized for containers** with configurable resources
+ - **JSON schema support** for structured output
+ - **Gradio web interface** for convenient interaction
+ - **REST API** for integration with other applications
+ - **Memory efficient** with GGUF quantized models
+
+ ## 🚀 Deployment on HuggingFace Spaces
+
+ This version is specifically designed for HuggingFace Spaces with Docker SDK:
+
+ 1. Clone this repository
+ 2. Push to HuggingFace Spaces with `sdk: docker` in README.md
+ 3. The application will automatically build and deploy
+
+ ## 🐳 Local Docker Usage
+
+ ### Build the image:
+ ```bash
+ docker build -t llm-structured-output .
+ ```
+
+ ### Run the container:
+ ```bash
+ docker run -p 7860:7860 -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" llm-structured-output
+ ```
+
+ ### With custom configuration:
+ ```bash
+ docker run -p 7860:7860 \
+   -e MODEL_REPO="lmstudio-community/gemma-3n-E4B-it-text-GGUF" \
+   -e MODEL_FILENAME="gemma-3n-E4B-it-Q8_0.gguf" \
+   -e N_CTX="4096" \
+   -e MAX_NEW_TOKENS="512" \
+   llm-structured-output
+ ```
+
+ ## 🌐 Application Access
+
+ - **Web interface**: http://localhost:7860
+ - **API**: Available through the same port
+ - **Health check**: http://localhost:7860/health (when API mode is enabled)
+
+ ## 📝 Environment Variables
+
+ Configure the application using environment variables:
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `MODEL_REPO` | `lmstudio-community/gemma-3n-E4B-it-text-GGUF` | HuggingFace model repository |
+ | `MODEL_FILENAME` | `gemma-3n-E4B-it-Q8_0.gguf` | Model file name |
+ | `N_CTX` | `4096` | Context window size |
+ | `N_GPU_LAYERS` | `0` | GPU layers (0 for CPU-only) |
+ | `N_THREADS` | `4` | CPU threads |
+ | `MAX_NEW_TOKENS` | `256` | Maximum response length |
+ | `TEMPERATURE` | `0.1` | Generation temperature |
+ | `HUGGINGFACE_TOKEN` | (empty) | HF token for private models |
+
+ ## 📋 Usage Examples
+
+ ### Example JSON Schema:
+ ```json
+ {
+   "type": "object",
+   "properties": {
+     "summary": {"type": "string"},
+     "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
+     "confidence": {"type": "number", "minimum": 0, "maximum": 1}
+   },
+   "required": ["summary", "sentiment"]
+ }
+ ```
+
+ ### Example Prompt:
+ ```
+ Analyze this review: "The product exceeded my expectations! Great quality and fast delivery."
+ ```
+
+ ## 🔧 Docker Optimizations
+
+ This Docker version includes several optimizations:
+
+ - **Reduced memory usage** with smaller context window and batch sizes
+ - **CPU-optimized** configuration by default
+ - **Efficient layer caching** for faster builds
+ - **Security**: Runs as non-root user
+ - **Multi-stage build** capabilities for production
+
+ ## 🏗️ Architecture
+
+ - **Base Image**: Python 3.10 slim
+ - **ML Backend**: llama-cpp-python with OpenBLAS
+ - **Web Interface**: Gradio 4.x
+ - **API**: FastAPI with automatic documentation
+ - **Model Storage**: Downloaded on first run to `/app/models/`
+
+ ## 💡 Performance Tips
+
+ 1. **Memory**: Start with smaller models (7B or less)
+ 2. **CPU**: Adjust `N_THREADS` based on available cores
+ 3. **Context**: Reduce `N_CTX` if experiencing memory issues
+ 4. **Batch size**: Lower `N_BATCH` for memory-constrained environments
+
+ ## 🔍 Troubleshooting
+
+ ### Container fails to start:
+ - Check available memory (minimum 4GB recommended)
+ - Verify model repository accessibility
+ - Ensure proper environment variable formatting
+
+ ### Model download issues:
+ - Check internet connectivity in container
+ - Verify `HUGGINGFACE_TOKEN` for private models
+ - Ensure sufficient disk space
+
+ ### Performance issues:
+ - Reduce `N_CTX` and `MAX_NEW_TOKENS`
+ - Adjust `N_THREADS` to match CPU cores
+ - Consider using smaller/quantized models
+
+ ## 📄 License
+
+ MIT License - see LICENSE file for details.
+
+ ---
+
+ For more information about HuggingFace Spaces Docker configuration, see: https://huggingface.co/docs/hub/spaces-config-reference
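As a concrete illustration of the REST API mentioned in the README, a minimal client could look like the sketch below. It assumes the container was started in `api` or `both` mode (see `main.py`) with port 8000 published, and uses the `/generate` request and response shapes defined in `api.py` further down this diff; the host/port are assumptions, not something this commit guarantees.

```python
# Sketch: calling the /generate endpoint with a prompt and a JSON schema.
import requests

payload = {
    "prompt": 'Analyze this review: "The product exceeded my expectations!"',
    "json_schema": {
        "type": "object",
        "properties": {
            "summary": {"type": "string"},
            "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
            "confidence": {"type": "number", "minimum": 0, "maximum": 1},
        },
        "required": ["summary", "sentiment"],
    },
}

response = requests.post("http://localhost:8000/generate", json=payload, timeout=120)
response.raise_for_status()
result = response.json()

if result["success"]:
    print(result["data"])  # parsed JSON matching the schema
else:
    print(result["error"], result.get("raw_response"))
```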
api.py ADDED
@@ -0,0 +1,213 @@
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import Optional, Dict, Any
+ import json
+ import base64
+ from PIL import Image
+ from io import BytesIO
+ import uvicorn
+ from app import llm_client
+
+ # Create FastAPI application
+ api_app = FastAPI(
+     title="LLM Structured Output API",
+     description="API for generating structured responses from local GGUF models via llama-cpp-python",
+     version="1.0.0"
+ )
+
+ # Setup CORS
+ api_app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Data models for API
+ class StructuredOutputRequest(BaseModel):
+     prompt: str
+     json_schema: Dict[str, Any]
+     image_base64: Optional[str] = None
+
+ class StructuredOutputResponse(BaseModel):
+     success: bool
+     data: Optional[Dict[str, Any]] = None
+     error: Optional[str] = None
+     raw_response: Optional[str] = None
+
+ def decode_base64_image(base64_string: str) -> Image.Image:
+     """Decode base64 string to PIL Image"""
+     try:
+         image_data = base64.b64decode(base64_string)
+         image = Image.open(BytesIO(image_data))
+         return image
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=f"Image decoding error: {str(e)}")
+
+ @api_app.post("/generate", response_model=StructuredOutputResponse)
+ async def generate_structured_output(request: StructuredOutputRequest):
+     """
+     Main endpoint for generating structured response
+
+     Args:
+         request: Request containing prompt, JSON schema and optionally base64 image
+
+     Returns:
+         StructuredOutputResponse: Structured response or error
+     """
+     # Check model initialization
+     if llm_client is None:
+         raise HTTPException(
+             status_code=503,
+             detail="LLM model not initialized. Check server configuration."
+         )
+
+     try:
+         # Validate input data
+         if not request.prompt.strip():
+             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+         if not request.json_schema:
+             raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+         # Decode image if provided
+         image = None
+         if request.image_base64:
+             image = decode_base64_image(request.image_base64)
+
+         # Generate response
+         result = llm_client.generate_structured_response(
+             prompt=request.prompt,
+             json_schema=request.json_schema,
+             image=image
+         )
+
+         # Format response
+         if "error" in result:
+             return StructuredOutputResponse(
+                 success=False,
+                 error=result["error"],
+                 raw_response=result.get("raw_response")
+             )
+         else:
+             return StructuredOutputResponse(
+                 success=True,
+                 data=result.get("data"),
+                 raw_response=result.get("raw_response")
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ @api_app.post("/generate_with_file", response_model=StructuredOutputResponse)
+ async def generate_with_file(
+     prompt: str = Form(...),
+     json_schema: str = Form(...),
+     image: Optional[UploadFile] = File(None)
+ ):
+     """
+     Alternative endpoint for uploading image as file
+
+     Args:
+         prompt: Text prompt
+         json_schema: JSON schema as string
+         image: Uploaded image file
+
+     Returns:
+         StructuredOutputResponse: Structured response or error
+     """
+     # Check model initialization
+     if llm_client is None:
+         raise HTTPException(
+             status_code=503,
+             detail="LLM model not initialized. Check server configuration."
+         )
+
+     try:
+         # Validate input data
+         if not prompt.strip():
+             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+
+         if not json_schema.strip():
+             raise HTTPException(status_code=400, detail="JSON schema cannot be empty")
+
+         # Parse JSON schema
+         try:
+             parsed_schema = json.loads(json_schema)
+         except json.JSONDecodeError as e:
+             raise HTTPException(status_code=400, detail=f"Invalid JSON schema: {str(e)}")
+
+         # Process image if provided
+         pil_image = None
+         if image:
+             # Check file type
+             if not image.content_type.startswith('image/'):
+                 raise HTTPException(status_code=400, detail="Uploaded file must be an image")
+
+             # Read and convert image
+             image_data = await image.read()
+             pil_image = Image.open(BytesIO(image_data))
+
+         # Generate response
+         result = llm_client.generate_structured_response(
+             prompt=prompt,
+             json_schema=parsed_schema,
+             image=pil_image
+         )
+
+         # Format response
+         if "error" in result:
+             return StructuredOutputResponse(
+                 success=False,
+                 error=result["error"],
+                 raw_response=result.get("raw_response")
+             )
+         else:
+             return StructuredOutputResponse(
+                 success=True,
+                 data=result.get("data"),
+                 raw_response=result.get("raw_response")
+             )
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+ @api_app.get("/health")
+ async def health_check():
+     """API health check"""
+     model_status = "loaded" if llm_client is not None else "not_loaded"
+     return {
+         "status": "healthy" if llm_client is not None else "degraded",
+         "model_status": model_status,
+         "message": "API is working correctly" if llm_client is not None else "API is working, but model is not loaded"
+     }
+
+ @api_app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "LLM Structured Output API",
+         "version": "1.0.0",
+         "model_loaded": llm_client is not None,
+         "endpoints": {
+             "/generate": "POST - main endpoint for generating structured response",
+             "/generate_with_file": "POST - endpoint with image file upload",
+             "/health": "GET - health check",
+             "/docs": "GET - automatic Swagger documentation"
+         }
+     }
+
+ if __name__ == "__main__":
+     from config import Config
+     uvicorn.run(
+         "api:api_app",
+         host=Config.HOST,
+         port=Config.API_PORT,
+         reload=True
+     )
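The `/generate` endpoint above accepts an optional image as a base64 string. A hedged sketch of preparing that field on the client side follows; the file name and URL are placeholders, not part of this commit.

```python
# Sketch: encoding a local image for the image_base64 field of /generate.
# "photo.png" and the localhost URL are illustrative placeholders.
import base64
import requests

with open("photo.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

payload = {
    "prompt": "Describe this image",
    "json_schema": {"type": "object", "properties": {"description": {"type": "string"}}},
    "image_base64": image_b64,
}
print(requests.post("http://localhost:8000/generate", json=payload, timeout=120).json())
```

Note that `app.py` below currently logs a warning and ignores the image for this text-only GGUF model, so the field is effectively a placeholder for future multimodal support.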
app.py ADDED
@@ -0,0 +1,441 @@
+ import json
+ import os
+ import gradio as gr
+ from typing import Optional, Dict, Any, Union
+ from PIL import Image
+ from pydantic import BaseModel
+ import logging
+ from config import Config
+
+ # Try to import llama_cpp with fallback
+ try:
+     from llama_cpp import Llama
+     LLAMA_CPP_AVAILABLE = True
+ except ImportError as e:
+     print(f"Warning: llama-cpp-python not available: {e}")
+     LLAMA_CPP_AVAILABLE = False
+     Llama = None
+
+ # Try to import huggingface_hub
+ try:
+     from huggingface_hub import hf_hub_download
+     HUGGINGFACE_HUB_AVAILABLE = True
+ except ImportError as e:
+     print(f"Warning: huggingface_hub not available: {e}")
+     HUGGINGFACE_HUB_AVAILABLE = False
+     hf_hub_download = None
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class StructuredOutputRequest(BaseModel):
+     prompt: str
+     image: Optional[str] = None  # base64 encoded image
+     json_schema: Dict[str, Any]
+
+ class LLMClient:
+     def __init__(self):
+         """
+         Initialize client for working with local GGUF model via llama-cpp-python
+         """
+         self.model_path = Config.get_model_path()
+         logger.info(f"Using model: {self.model_path}")
+
+         self.llm = None
+
+         self._initialize_model()
+
+     def _download_model_if_needed(self) -> str:
+         """Download model from Hugging Face if it doesn't exist locally"""
+         if os.path.exists(self.model_path):
+             logger.info(f"Model already exists at: {self.model_path}")
+             return self.model_path
+
+         # If model doesn't exist and we're in production (Docker),
+         # it means the build process failed or model is in wrong location
+         if os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true':
+             # Let's check common locations where model might be
+             alternative_paths = [
+                 f"/app/models/{Config.MODEL_FILENAME}",
+                 f"./models/{Config.MODEL_FILENAME}",
+                 f"/models/{Config.MODEL_FILENAME}",
+                 f"/app/{Config.MODEL_FILENAME}"
+             ]
+
+             for alt_path in alternative_paths:
+                 if os.path.exists(alt_path):
+                     logger.info(f"Found model at alternative location: {alt_path}")
+                     return alt_path
+
+             # List what's actually in the models directory
+             models_dir = "/app/models"
+             if os.path.exists(models_dir):
+                 files = os.listdir(models_dir)
+                 logger.error(f"Contents of {models_dir}: {files}")
+             else:
+                 logger.error(f"Directory {models_dir} does not exist")
+
+             # Try to download as fallback
+             logger.warning("Model not found in expected locations, attempting download...")
+
+         if not HUGGINGFACE_HUB_AVAILABLE:
+             raise ImportError("huggingface_hub is not available. Please install it to download models.")
+
+         logger.info(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
+
+         # Create models directory if it doesn't exist
+         models_dir = Config.get_models_dir()
+         os.makedirs(models_dir, exist_ok=True)
+
+         try:
+             # Download model
+             model_path = hf_hub_download(
+                 repo_id=Config.MODEL_REPO,
+                 filename=Config.MODEL_FILENAME,
+                 local_dir=models_dir,
+                 token=Config.HUGGINGFACE_TOKEN if Config.HUGGINGFACE_TOKEN else None
+             )
+
+             logger.info(f"Model downloaded to: {model_path}")
+             return model_path
+         except Exception as e:
+             logger.error(f"Failed to download model: {e}")
+             raise
+
+     def _initialize_model(self):
+         """Initialize local GGUF model"""
+         try:
+             if not LLAMA_CPP_AVAILABLE:
+                 raise ImportError("llama-cpp-python is not available. Please check installation.")
+
+             logger.info("Loading local model...")
+
+             # Download model if needed
+             model_path = self._download_model_if_needed()
+
+             # Verify model file exists and is readable
+             if not os.path.exists(model_path):
+                 raise FileNotFoundError(f"Model file not found: {model_path}")
+
+             # Check file size to ensure it's not corrupted
+             file_size = os.path.getsize(model_path)
+             if file_size < 1024 * 1024:  # Less than 1MB is suspicious for GGUF model
+                 raise ValueError(f"Model file seems corrupted or incomplete. Size: {file_size} bytes")
+
+             logger.info(f"Model file verified. Size: {file_size / (1024**3):.2f} GB")
+
+             # Initialize Llama model with enhanced error handling
+             logger.info("Initializing Llama model...")
+             self.llm = Llama(
+                 model_path=model_path,
+                 n_ctx=Config.N_CTX,
+                 n_batch=Config.N_BATCH,
+                 n_gpu_layers=Config.N_GPU_LAYERS,
+                 use_mlock=Config.USE_MLOCK,
+                 use_mmap=Config.USE_MMAP,
+                 vocab_only=False,
+                 f16_kv=Config.F16_KV,
+                 logits_all=False,
+                 embedding=False,
+                 n_threads=Config.N_THREADS,
+                 last_n_tokens_size=64,
+                 lora_base=None,
+                 lora_path=None,
+                 seed=Config.SEED,
+                 verbose=True  # Enable verbose for debugging
+             )
+
+             logger.info("Model successfully loaded and initialized")
+
+             # Test model with a simple prompt to verify it's working
+             logger.info("Testing model with simple prompt...")
+             test_response = self.llm("Hello", max_tokens=1, temperature=0.1)
+             logger.info("Model test successful")
+
+         except Exception as e:
+             logger.error(f"Error initializing model: {e}")
+             # Provide more specific error information
+             if "Failed to load model from file" in str(e):
+                 logger.error("This error usually indicates:")
+                 logger.error("1. Model file is corrupted or incomplete")
+                 logger.error("2. llama-cpp-python version is incompatible with the model")
+                 logger.error("3. Insufficient memory to load the model")
+                 logger.error(f"4. Model path: {self.model_path}")
+             raise
+
+     def _validate_json_schema(self, schema: str) -> Dict[str, Any]:
+         """Validate and parse JSON schema"""
+         try:
+             parsed_schema = json.loads(schema)
+             return parsed_schema
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid JSON schema: {e}")
+
+     def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
+         """
+         Format prompt for structured output generation
+         """
+         schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
+
+         formatted_prompt = f"""User: {prompt}
+
+ Please respond in strict accordance with the following JSON schema:
+
+ ```json
+ {schema_str}
+ ```
+
+ Return ONLY valid JSON without additional comments or explanations."""
+
+         return formatted_prompt
+
+     def generate_structured_response(self,
+                                      prompt: str,
+                                      json_schema: Union[str, Dict[str, Any]],
+                                      image: Optional[Image.Image] = None) -> Dict[str, Any]:
+         """
+         Generate structured response from local GGUF model
+         """
+         try:
+             # Validate and parse JSON schema
+             if isinstance(json_schema, str):
+                 parsed_schema = self._validate_json_schema(json_schema)
+             else:
+                 parsed_schema = json_schema
+
+             # Format prompt
+             formatted_prompt = self._format_prompt_with_schema(prompt, parsed_schema)
+
+             # Warning about images (not supported in this implementation)
+             if image is not None:
+                 logger.warning("Image processing is not supported with this local model")
+
+             # Generate response
+             logger.info("Generating response...")
+
+             response = self.llm(
+                 formatted_prompt,
+                 max_tokens=Config.MAX_NEW_TOKENS,
+                 temperature=Config.TEMPERATURE,
+                 stop=["User:", "\n\n"],
+                 echo=False
+             )
+
+             # Extract generated text
+             generated_text = response['choices'][0]['text']
+
+             # Attempt to parse JSON response
+             try:
+                 # Find JSON in response
+                 json_start = generated_text.find('{')
+                 json_end = generated_text.rfind('}') + 1
+
+                 if json_start != -1 and json_end > json_start:
+                     json_str = generated_text[json_start:json_end]
+                     parsed_response = json.loads(json_str)
+                     return {
+                         "success": True,
+                         "data": parsed_response,
+                         "raw_response": generated_text
+                     }
+                 else:
+                     return {
+                         "error": "Could not find JSON in model response",
+                         "raw_response": generated_text
+                     }
+
+             except json.JSONDecodeError as e:
+                 return {
+                     "error": f"JSON parsing error: {e}",
+                     "raw_response": generated_text
+                 }
+
+         except Exception as e:
+             logger.error(f"Unexpected error: {e}")
+             return {
+                 "error": f"Generation error: {str(e)}"
+             }
+
+ # Initialize client
+ logger.info("Initializing LLM client...")
+ try:
+     llm_client = LLMClient()
+     logger.info("LLM client successfully initialized")
+ except Exception as e:
+     logger.error(f"Error initializing LLM client: {e}")
+     llm_client = None
+
+ def process_request(prompt: str,
+                     json_schema: str,
+                     image: Optional[Image.Image] = None) -> str:
+     """
+     Process request through Gradio interface
+     """
+     if llm_client is None:
+         return json.dumps({
+             "error": "LLM client not initialized",
+             "details": "Check logs for detailed error information"
+         }, ensure_ascii=False, indent=2)
+
+     if not prompt.strip():
+         return json.dumps({"error": "Prompt cannot be empty"}, ensure_ascii=False, indent=2)
+
+     if not json_schema.strip():
+         return json.dumps({"error": "JSON schema cannot be empty"}, ensure_ascii=False, indent=2)
+
+     result = llm_client.generate_structured_response(prompt, json_schema, image)
+     return json.dumps(result, ensure_ascii=False, indent=2)
+
+ # Examples for demonstration
+ example_schema = """{
+     "type": "object",
+     "properties": {
+         "summary": {
+             "type": "string",
+             "description": "Brief summary of the response"
+         },
+         "sentiment": {
+             "type": "string",
+             "enum": ["positive", "negative", "neutral"],
+             "description": "Emotional tone"
+         },
+         "confidence": {
+             "type": "number",
+             "minimum": 0,
+             "maximum": 1,
+             "description": "Confidence level in the response"
+         },
+         "keywords": {
+             "type": "array",
+             "items": {
+                 "type": "string"
+             },
+             "description": "Key words"
+         }
+     },
+     "required": ["summary", "sentiment", "confidence"]
+ }"""
+
+ example_prompt = "Analyze the following text and provide a structured assessment: 'The company's new product received enthusiastic user reviews. Sales exceeded all expectations by 150%.'"
+
+ def create_gradio_interface():
+     """Create Gradio interface"""
+
+     with gr.Blocks(title="LLM Structured Output", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🤖 LLM with Structured Output")
+         gr.Markdown(f"Application for generating structured responses using model **{Config.MODEL_REPO}/{Config.MODEL_FILENAME}**")
+
+         # Show model status
+         if llm_client is None:
+             gr.Markdown("⚠️ **Warning**: Model not loaded. Check configuration and restart the application.")
+         else:
+             gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
+
+         with gr.Row():
+             with gr.Column():
+                 prompt_input = gr.Textbox(
+                     label="Prompt for model",
+                     placeholder="Enter your request...",
+                     lines=5,
+                     value=example_prompt
+                 )
+
+                 image_input = gr.Image(
+                     label="Image (optional, for multimodal models)",
+                     type="pil"
+                 )
+
+                 schema_input = gr.Textbox(
+                     label="JSON schema for response structure",
+                     placeholder="Enter JSON schema...",
+                     lines=15,
+                     value=example_schema
+                 )
+
+                 submit_btn = gr.Button("Generate Response", variant="primary")
+
+             with gr.Column():
+                 output = gr.Textbox(
+                     label="Structured Response",
+                     lines=20,
+                     interactive=False
+                 )
+
+         submit_btn.click(
+             fn=process_request,
+             inputs=[prompt_input, schema_input, image_input],
+             outputs=output
+         )
+
+         # Examples
+         gr.Markdown("## 📋 Usage Examples")
+
+         examples = gr.Examples(
+             examples=[
+                 [
+                     "Describe today's weather in New York",
+                     """{
+     "type": "object",
+     "properties": {
+         "temperature": {"type": "number"},
+         "description": {"type": "string"},
+         "humidity": {"type": "number"}
+     }
+ }""",
+                     None
+                 ],
+                 [
+                     "Create a Python learning plan for one month",
+                     """{
+     "type": "object",
+     "properties": {
+         "weeks": {
+             "type": "array",
+             "items": {
+                 "type": "object",
+                 "properties": {
+                     "week_number": {"type": "integer"},
+                     "topics": {"type": "array", "items": {"type": "string"}},
+                     "practice_hours": {"type": "number"}
+                 }
+             }
+         },
+         "total_hours": {"type": "number"}
+     }
+ }""",
+                     None
+                 ]
+             ],
+             inputs=[prompt_input, schema_input, image_input]
+         )
+
+         # Model information
+         gr.Markdown(f"""
+ ## ℹ️ Model Information
+
+ - **Model**: {Config.MODEL_REPO}/{Config.MODEL_FILENAME}
+ - **Local path**: {Config.MODEL_PATH}
+ - **Context window**: {Config.N_CTX} tokens
+ - **Batch size**: {Config.N_BATCH}
+ - **GPU layers**: {Config.N_GPU_LAYERS if Config.N_GPU_LAYERS >= 0 else "All"}
+ - **CPU threads**: {Config.N_THREADS}
+ - **Maximum response length**: {Config.MAX_NEW_TOKENS} tokens
+ - **Temperature**: {Config.TEMPERATURE}
+ - **Memory lock**: {"Enabled" if Config.USE_MLOCK else "Disabled"}
+ - **Memory mapping**: {"Enabled" if Config.USE_MMAP else "Disabled"}
+
+ 💡 **Tip**: Use clear and specific JSON schemas for better results.
+ """)
+
+     return demo
+
+ if __name__ == "__main__":
+     # Create and launch Gradio interface
+     demo = create_gradio_interface()
+     demo.launch(
+         server_name=Config.HOST,
+         server_port=Config.GRADIO_PORT,
+         share=False,
+         debug=True
+     )
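Because `api.py` imports the module-level `llm_client` defined above, the same object can also be used directly from Python without going through HTTP or Gradio. A minimal sketch, assuming the GGUF model file is available so the client initialized successfully:

```python
# Sketch: calling the client from app.py directly, bypassing the web layers.
from app import llm_client

schema = {
    "type": "object",
    "properties": {
        "summary": {"type": "string"},
        "sentiment": {"type": "string", "enum": ["positive", "negative", "neutral"]},
    },
    "required": ["summary", "sentiment"],
}

if llm_client is not None:
    result = llm_client.generate_structured_response(
        prompt="The release went smoothly and users are happy.",
        json_schema=schema,  # accepts a dict or a JSON string
    )
    print(result)  # {"success": True, "data": {...}, "raw_response": "..."} or {"error": ...}
else:
    print("Model failed to load; see the startup logs.")
```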
config.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ from typing import Optional
+
+ class Config:
+     """Application configuration for working with local GGUF models"""
+
+     # Model settings - using Hugging Face downloaded model
+     MODEL_REPO: str = os.getenv("MODEL_REPO", "lmstudio-community/gemma-3n-E4B-it-text-GGUF")
+     MODEL_FILENAME: str = os.getenv("MODEL_FILENAME", "gemma-3n-E4B-it-Q8_0.gguf")
+     MODEL_PATH: str = os.getenv("MODEL_PATH", "/app/models/gemma-3n-E4B-it-Q8_0.gguf")
+     HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
+
+     # Model loading settings - optimized for Docker container
+     N_CTX: int = int(os.getenv("N_CTX", "4096"))  # Reduced context window for Docker
+     N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0"))  # CPU-only for Docker by default
+     N_THREADS: int = int(os.getenv("N_THREADS", "4"))  # Conservative thread count
+     N_BATCH: int = int(os.getenv("N_BATCH", "512"))  # Smaller batch size for Docker
+     USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true"  # Disabled for Docker
+     USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true"  # Keep memory mapping
+     F16_KV: bool = os.getenv("F16_KV", "true").lower() == "true"  # Use 16-bit keys and values
+     SEED: int = int(os.getenv("SEED", "42"))  # Random seed for reproducibility
+
+     # Server settings - Docker compatible
+     HOST: str = os.getenv("HOST", "0.0.0.0")
+     GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860"))  # Standard HuggingFace Spaces port
+     API_PORT: int = int(os.getenv("API_PORT", "8000"))
+
+     # Generation settings - optimized for Docker
+     MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256"))  # Reduced for faster response
+     TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.1"))
+
+     # File upload settings
+     MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # 10MB
+     ALLOWED_IMAGE_EXTENSIONS: set = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
+
+     @classmethod
+     def is_model_available(cls) -> bool:
+         """Check if local model file exists"""
+         return os.path.exists(cls.MODEL_PATH)
+
+     @classmethod
+     def get_model_path(cls) -> str:
+         """Get absolute path to model file"""
+         return os.path.abspath(cls.MODEL_PATH)
+
+     @classmethod
+     def get_models_dir(cls) -> str:
+         """Get models directory path"""
+         return os.path.dirname(cls.MODEL_PATH)
+
+     @classmethod
+     def load_from_env_file(cls, env_file: str = ".env") -> None:
+         """Load configuration from .env file"""
+         if os.path.exists(env_file):
+             with open(env_file, 'r') as f:
+                 for line in f:
+                     line = line.strip()
+                     if line and not line.startswith('#') and '=' in line:
+                         key, value = line.split('=', 1)
+                         os.environ[key.strip()] = value.strip()
+
+ # Automatically load from .env file on import
+ Config.load_from_env_file()
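Since the `Config` class attributes read environment variables at import time, overrides must be in place before the module is imported. A small sketch; the specific values are illustrative, not recommendations from this commit:

```python
# Sketch: overriding configuration before config.py is imported.
import os

os.environ["N_CTX"] = "2048"          # shrink the context window
os.environ["MAX_NEW_TOKENS"] = "128"  # shorter completions

from config import Config  # environment variables are read here, at import time

print(Config.N_CTX, Config.MAX_NEW_TOKENS, Config.MODEL_PATH)
```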
docker-compose.yml ADDED
@@ -0,0 +1,30 @@
+ version: '3.8'
+
+ services:
+   llm-app:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     ports:
+       - "7860:7860"
+     environment:
+       - MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+       - MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+       - N_CTX=4096
+       - N_GPU_LAYERS=0
+       - N_THREADS=4
+       - MAX_NEW_TOKENS=256
+       - TEMPERATURE=0.1
+     volumes:
+       # Optional: Mount models directory to persist downloaded models
+       - ./models:/app/models
+     restart: unless-stopped
+     mem_limit: 8g
+     # Uncomment below for GPU support
+     # deploy:
+     #   resources:
+     #     reservations:
+     #       devices:
+     #         - driver: nvidia
+     #           count: 1
+     #           capabilities: [gpu]
entrypoint.sh ADDED
@@ -0,0 +1,53 @@
+ #!/bin/bash
+
+ # Entrypoint script for LLM Structured Output Docker container
+
+ set -e
+
+ # Print environment info
+ echo "🐳 Starting LLM Structured Output Docker container"
+ echo "Python version: $(python --version)"
+ echo "Working directory: $(pwd)"
+ echo "User: $(whoami)"
+
+ # Create models directory if it doesn't exist
+ mkdir -p /app/models
+
+ # Check if musl libc symbolic link exists (required for llama-cpp-python)
+ if [ ! -e "/lib/libc.musl-x86_64.so.1" ]; then
+     echo "⚠️ Warning: musl libc symbolic link not found. Checking for available libc libraries..."
+     ls -la /usr/lib/x86_64-linux-* 2>/dev/null || echo "No musl libraries found"
+     ls -la /usr/lib/x86_64-linux-gnu/libc.so* 2>/dev/null || echo "No glibc libraries found"
+ fi
+
+ # Check available memory
+ echo "📊 System information:"
+ echo "Memory: $(cat /proc/meminfo | grep MemTotal)"
+ echo "CPU cores: $(nproc)"
+ echo "Disk space: $(df -h /app)"
+
+ # Set default values for key environment variables if not provided
+ export MODEL_REPO=${MODEL_REPO:-"lmstudio-community/gemma-3n-E4B-it-text-GGUF"}
+ export MODEL_FILENAME=${MODEL_FILENAME:-"gemma-3n-E4B-it-Q8_0.gguf"}
+ export N_CTX=${N_CTX:-"4096"}
+ export N_GPU_LAYERS=${N_GPU_LAYERS:-"0"}
+ export N_THREADS=${N_THREADS:-"4"}
+ export MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-"256"}
+
+ echo "🔧 Configuration:"
+ echo "Model: $MODEL_REPO/$MODEL_FILENAME"
+ echo "Context size: $N_CTX"
+ echo "GPU layers: $N_GPU_LAYERS"
+ echo "CPU threads: $N_THREADS"
+ echo "Max tokens: $MAX_NEW_TOKENS"
+
+ # Check if running in HuggingFace Spaces
+ if [ "$SPACE_ID" ]; then
+     echo "🤗 Running in HuggingFace Spaces: $SPACE_ID"
+     export HOST=0.0.0.0
+     export GRADIO_PORT=7860
+ fi
+
+ # Execute the main command
+ echo "🚀 Starting application..."
+ exec "$@"
main.py ADDED
@@ -0,0 +1,76 @@
+ #!/usr/bin/env python3
+ """
+ Main file for launching LLM Structured Output application in Docker
+ """
+
+ import argparse
+ import threading
+ import time
+ from config import Config
+
+ def run_gradio():
+     """Launch Gradio interface"""
+     from app import create_gradio_interface
+
+     print(f"🎨 Starting Gradio interface at http://{Config.HOST}:{Config.GRADIO_PORT}")
+     demo = create_gradio_interface()
+     demo.launch(
+         server_name=Config.HOST,
+         server_port=Config.GRADIO_PORT,
+         share=False,
+         debug=False  # Disabled debug for production
+     )
+
+ def run_api():
+     """Launch FastAPI server"""
+     import uvicorn
+     from api import api_app
+
+     print(f"🔌 Starting API at http://{Config.HOST}:{Config.API_PORT}")
+     uvicorn.run(
+         api_app,
+         host=Config.HOST,
+         port=Config.API_PORT,
+         log_level="info"
+     )
+
+ def run_both():
+     """Launch both services simultaneously"""
+     print("🚀 Starting LLM Structured Output application...")
+     print("=" * 60)
+     print(f"📊 Gradio interface: http://{Config.HOST}:{Config.GRADIO_PORT}")
+     print(f"🔌 API: http://{Config.HOST}:{Config.API_PORT}")
+     print(f"📖 API documentation: http://{Config.HOST}:{Config.API_PORT}/docs")
+     print("=" * 60)
+
+     # Start API in separate thread
+     api_thread = threading.Thread(target=run_api, daemon=True)
+     api_thread.start()
+
+     # Small delay for API startup
+     time.sleep(2)
+
+     # Start Gradio in main thread
+     run_gradio()
+
+ def main():
+     """Main function with command line arguments"""
+     parser = argparse.ArgumentParser(description="LLM Structured Output application")
+     parser.add_argument(
+         "--mode",
+         choices=["gradio", "api", "both"],
+         default="gradio",  # Default to gradio only for HuggingFace Spaces
+         help="Launch mode: gradio (interface only), api (API only), both (both services)"
+     )
+
+     args = parser.parse_args()
+
+     if args.mode == "gradio":
+         run_gradio()
+     elif args.mode == "api":
+         run_api()
+     else:
+         run_both()
+
+ if __name__ == "__main__":
+     main()
packages.txt ADDED
@@ -0,0 +1,12 @@
+ # System packages required for Docker build
+ # These are installed in the Dockerfile, not needed for HF Spaces with Docker SDK
+ # but keeping for reference
+
+ # build-essential
+ # cmake
+ # wget
+ # curl
+ # git
+ # pkg-config
+ # libopenblas-dev
+ # libssl-dev
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ huggingface_hub==0.25.2
+ # Core ML dependencies - updated for compatibility with gemma-3n-E4B model
+ llama-cpp-python>=0.3.4
+
+ # Web interface
+ gradio==4.44.1
+ fastapi>=0.100.0,<0.115.0
+ uvicorn[standard]>=0.20.0,<0.31.0
+
+ # Data processing
+ pillow>=9.0.0,<11.0.0
+ pydantic==2.10.6
+ numpy>=1.24.0,<2.0.0
+
+ # HTTP requests
+ requests>=2.28.0
+
+ # Additional dependencies for Docker environment
+ python-multipart
runtime.txt ADDED
@@ -0,0 +1 @@
+ 3.10
test.ipynb ADDED
@@ -0,0 +1,23 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c364ff11",
+    "metadata": {
+     "vscode": {
+      "languageId": "plaintext"
+     }
+    },
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }