lyangas committed on
Commit b9cb4a6 · 1 Parent(s): f2adbf5

wheel llama cpp was added
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.whl filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -15,7 +15,6 @@ lib64/
 parts/
 sdist/
 var/
-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
@@ -69,3 +68,4 @@ temp/
 # Test files
 test*
 test.ipynb
+logs.txt
BUILD_INSTRUCTIONS.md DELETED
@@ -1,89 +0,0 @@
# Instructions for building the Docker image with a pre-loaded model

## Overview of changes

The Dockerfile was modified to download the Hugging Face model in advance, while the image is being built. This provides:

- ✅ Fast deployment (the model is already inside the container)
- ✅ Reliability (no network dependency at startup)
- ✅ Consistency (a fixed model version)

## Building the image

### Basic build (for public models):

```bash
docker build -t llm-structured-output .
```

### Build with a Hugging Face token (for private models):

```bash
docker build --build-arg HUGGINGFACE_TOKEN=your_token_here -t llm-structured-output .
```

Or via an environment variable:

```bash
export HUGGINGFACE_TOKEN=your_token_here
docker build -t llm-structured-output .
```

## Running the container

```bash
docker run -p 7860:7860 llm-structured-output
```

The application will be available at: http://localhost:7860

## Running via docker-compose

```bash
docker-compose up --build
```

## Key changes

### 1. Dockerfile
- Added `git-lfs` for working with large files
- Added the `DOCKER_CONTAINER=true` variable
- Added a model pre-download stage
- The model is downloaded while the image is being built

### 2. app.py
- Added a check for the Docker environment (see the sketch following this file)
- If the model is not found inside the Docker container, an error is raised
- Model-loading logic is optimized for pre-loaded models

## Image size

The image is larger because the model is baked in, but this is offset by:
- Faster container startup
- No network dependencies
- Docker layer caching

## Configuring the model

To change the model, edit `config.py`:

```python
MODEL_REPO: str = "your-repo/your-model"
MODEL_FILENAME: str = "your-model.gguf"
```

Then rebuild the image.

## Debugging

To verify the model is present in the container:

```bash
docker run -it llm-structured-output ls -la /app/models/
```

To inspect the build logs:

```bash
docker build --no-cache -t llm-structured-output .
```
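For illustration, a minimal sketch of the Docker-environment check described in the deleted instructions above; the helper name and exact logic are assumptions, not the code from `app.py`:

```python
import os


def resolve_model_path(model_path: str) -> str:
    """Fail fast inside Docker if the pre-loaded model file is missing (hypothetical helper)."""
    in_docker = os.getenv("DOCKER_CONTAINER", "false").lower() == "true"
    if os.path.exists(model_path):
        return model_path
    if in_docker:
        # Inside the container the model must have been baked in at build time.
        raise FileNotFoundError(f"Pre-loaded model not found at {model_path}")
    # Outside Docker the caller may still download the model on demand.
    return model_path
```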
Dockerfile CHANGED
@@ -4,14 +4,17 @@ FROM python:3.10-slim
 # Set working directory
 WORKDIR /app
 
-# Install system dependencies required for runtime and git-lfs
+# Install system dependencies required for runtime and compilation
 RUN apt-get update && apt-get install -y \
     wget \
     curl \
     git \
     git-lfs \
+    build-essential \
+    cmake \
     libopenblas-dev \
     libssl-dev \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Initialize git-lfs
@@ -26,7 +29,9 @@ ENV DOCKER_CONTAINER=true
 # Create models directory
 RUN mkdir -p /app/models
 
-
+# Copy and install llama-cpp-python from local wheel
+COPY wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl /tmp/
+RUN pip install /tmp/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
 
 # Copy requirements first for better Docker layer caching
 COPY requirements.txt .
@@ -42,11 +47,7 @@ RUN python -c "import os; from huggingface_hub import hf_hub_download; from conf
 
 # Verify model file exists after build
 RUN ls -la /app/models/ && \
-    [ -f "/app/models/gemma-3n-E4B-it-Q8_0.gguf" ] || (echo "Model file not found!" && exit 1)
-
-# Copy and install llama-cpp-python from local wheel
-COPY wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl /tmp/
-RUN pip install /tmp/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
+    [ -n "$(ls /app/models/*.gguf 2>/dev/null)" ] || (echo "No .gguf model file found!" && exit 1)
 
 # Copy application files
 COPY . .
@@ -62,5 +63,5 @@ USER user
 EXPOSE 7860
 
 # Set entrypoint and default command
-ENTRYPOINT ["./entrypoint.sh"]
+# ENTRYPOINT ["./entrypoint.sh"]
 CMD ["python", "main.py", "--mode", "gradio"]
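The `RUN python -c "..."` pre-download step in the hunk header above is truncated in this view; a hypothetical expansion of such a step, assuming it uses `hf_hub_download` with the `MODEL_REPO`/`MODEL_FILENAME` values from `config.py` (the `local_dir` and the print are illustrative):

```python
# Hypothetical sketch of the model pre-download one-liner run during `docker build`.
import os
from huggingface_hub import hf_hub_download
from config import Config

token = os.getenv("HUGGINGFACE_TOKEN") or None
path = hf_hub_download(
    repo_id=Config.MODEL_REPO,
    filename=Config.MODEL_FILENAME,
    local_dir="/app/models",  # matches the directory checked by the verification RUN step
    token=token,
)
print(f"Model downloaded to {path}")
```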
GRAMMAR_CHANGES.md DELETED
@@ -1,100 +0,0 @@
# 🔗 Grammar Support Implementation

## 📋 Summary

Successfully integrated **Grammar-based Structured Output (GBNF)** support from the source project `/Users/ivan/Documents/Proging/free_llm_huggingface/free_llm_structure_output` into the current Docker project.

## 🔧 Changes Made

### 1. Core Grammar Implementation (`app.py`)
- ✅ Added `LlamaGrammar` import from `llama_cpp`
- ✅ Implemented `_json_schema_to_gbnf()` function for JSON Schema → GBNF conversion
- ✅ Added `use_grammar` parameter to `generate_structured_response()` method
- ✅ Enhanced generation logic with dual modes:
  - **Grammar Mode**: Uses GBNF constraints for strict JSON enforcement
  - **Schema Guidance Mode**: Uses prompt-based schema guidance
- ✅ Added `test_grammar_generation()` function for testing
- ✅ Updated `process_request()` to handle grammar parameter

### 2. Gradio Interface Enhancement
- ✅ Added "🔗 Use Grammar (GBNF) Mode" checkbox
- ✅ Updated submit button handler to pass grammar parameter
- ✅ Enhanced model information section with grammar features description

### 3. REST API Updates (`api.py`)
- ✅ Added `use_grammar: bool = True` to `StructuredOutputRequest` model
- ✅ Updated `/generate` endpoint to support grammar parameter
- ✅ Updated `/generate_with_file` endpoint with `use_grammar` form field
- ✅ Enhanced API documentation

### 4. Documentation Updates
- ✅ Updated `README.md` with comprehensive Grammar Mode section
- ✅ Added feature tags: `grammar`, `gbnf`
- ✅ Included usage examples for all interfaces
- ✅ Added mode comparison table
- ✅ Listed supported schema features

### 5. Testing
- ✅ Created `test_grammar_standalone.py` for validation
- ✅ Successfully tested grammar generation with multiple schema types:
  - Simple objects with required/optional properties
  - Nested objects with arrays
  - String enums support

## 🎯 Key Features Added

### Grammar Mode Benefits:
- **100% valid JSON** - No parsing errors
- **Schema compliance** - Guaranteed structure adherence
- **Consistent output** - Reliable format every time
- **Better performance** - Fewer retry attempts needed

### Supported Schema Features:
- ✅ Objects with required/optional properties
- ✅ Arrays with typed items
- ✅ String enums
- ✅ Numbers and integers
- ✅ Booleans
- ✅ Nested objects and arrays
- ⚠️ Complex conditionals (simplified)

## 🎛️ Usage Examples

### Gradio Interface:
- Toggle the "🔗 Use Grammar (GBNF) Mode" checkbox (enabled by default)

### REST API:
```json
{
  "prompt": "Analyze this data...",
  "json_schema": {
    "type": "object",
    "properties": {
      "result": {"type": "string"},
      "confidence": {"type": "number"}
    }
  },
  "use_grammar": true
}
```

### Python API:
```python
result = llm_client.generate_structured_response(
    prompt="Your prompt",
    json_schema=schema,
    use_grammar=True  # Enable grammar mode
)
```

## 🔍 Validation

All grammar generation functionality has been tested and validated:
- ✅ Grammar generation from JSON schemas works correctly
- ✅ GBNF output format is valid
- ✅ Enum support is functional
- ✅ Nested structures are handled properly

## 🚀 Ready for Production

The implementation is complete and ready for use in Docker environments. Grammar mode provides more reliable structured output generation while maintaining backward compatibility with the existing schema guidance approach.
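A self-contained sketch of how a GBNF grammar is applied with llama-cpp-python; the grammar text and model path here are illustrative and hand-written, not generated by `_json_schema_to_gbnf()`:

```python
from llama_cpp import Llama, LlamaGrammar

# Tiny GBNF grammar that constrains output to {"answer": "<string>"}.
GBNF = r'''
root   ::= "{" ws "\"answer\"" ws ":" ws string ws "}"
string ::= "\"" char* "\""
char   ::= [^"\\] | "\\" (["\\bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
ws     ::= [ \t\n]*
'''

grammar = LlamaGrammar.from_string(GBNF)
llm = Llama(model_path="/app/models/model.gguf", n_ctx=1024, verbose=False)  # illustrative path

out = llm("Answer in JSON: what is 2+2?", max_tokens=64, temperature=0.1, grammar=grammar)
print(out["choices"][0]["text"])
```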
app.py CHANGED
@@ -1,3 +1,9 @@
1
  import json
2
  import os
3
  import gradio as gr
@@ -9,7 +15,7 @@ from config import Config
9
 
10
  # Try to import llama_cpp with fallback
11
  try:
12
- from llama_cpp import Llama, LlamaGrammar
13
  LLAMA_CPP_AVAILABLE = True
14
  except ImportError as e:
15
  print(f"Warning: llama-cpp-python not available: {e}")
@@ -27,9 +33,14 @@ except ImportError as e:
27
  hf_hub_download = None
28
 
29
  # Setup logging
30
- logging.basicConfig(level=logging.INFO)
 
31
  logger = logging.getLogger(__name__)
32
 
33
  class StructuredOutputRequest(BaseModel):
34
  prompt: str
35
  image: Optional[str] = None # base64 encoded image
@@ -144,14 +155,19 @@ class LLMClient:
144
  lora_base=None,
145
  lora_path=None,
146
  seed=Config.SEED,
147
- verbose=True # Enable verbose for debugging
148
  )
 
 
149
 
150
  logger.info("Model successfully loaded and initialized")
151
 
152
  # Test model with a simple prompt to verify it's working
 
153
  logger.info("Testing model with simple prompt...")
154
- test_response = self.llm("Hello", max_tokens=1, temperature=0.1)
 
 
155
  logger.info("Model test successful")
156
 
157
  except Exception as e:
@@ -175,11 +191,13 @@ class LLMClient:
175
 
176
  def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
177
  """
178
- Format prompt for structured output generation
179
  """
180
  schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
181
 
182
- formatted_prompt = f"""User: {prompt}
 
 
183
 
184
  Please respond in strict accordance with the following JSON schema:
185
 
@@ -187,139 +205,72 @@ Please respond in strict accordance with the following JSON schema:
187
  {schema_str}
188
  ```
189
 
190
- Return ONLY valid JSON without additional comments or explanations."""
 
 
191
 
192
  return formatted_prompt
193
-
194
- def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str:
195
- """Convert JSON schema to GBNF (Backus-Naur Form) grammar for structured output"""
196
- rules = []
197
- rule_names = set() # Track rule names to avoid duplicates
198
-
199
- def add_rule(name: str, definition: str):
200
- if name not in rule_names:
201
- rules.append(f"{name} ::= {definition}")
202
- rule_names.add(name)
203
 
204
- def process_type(schema_part: Dict[str, Any], type_name: str = "value") -> str:
205
- if "type" not in schema_part:
206
- # Handle anyOf, oneOf, allOf cases - simplified to string for now
207
- return "string"
208
 
209
- schema_type = schema_part["type"]
 
 
 
 
210
 
211
- if schema_type == "object":
212
- # Handle object type
213
- properties = schema_part.get("properties", {})
214
- required = schema_part.get("required", [])
215
-
216
- if not properties:
217
- add_rule(type_name, '"{" ws "}"')
218
- return type_name
219
-
220
- # Separate required and optional parts
221
- required_parts = []
222
- optional_parts = []
223
-
224
- for prop_name, prop_schema in properties.items():
225
- prop_type_name = f"{type_name}_{prop_name}"
226
- prop_type = process_type(prop_schema, prop_type_name)
227
- prop_def = f'"\\"" "{prop_name}" "\\"" ws ":" ws {prop_type}'
228
-
229
- if prop_name in required:
230
- required_parts.append(prop_def)
231
- else:
232
- optional_parts.append(prop_def)
233
-
234
- # Build object structure - simplified approach
235
- if not required_parts and not optional_parts:
236
- object_def = '"{" ws "}"'
237
- else:
238
- # For simplicity, create a fixed structure based on required fields only
239
- # and treat optional fields as always present but with optional values
240
- if not required_parts:
241
- # Only optional fields - make the whole object optional content
242
- if len(optional_parts) == 1:
243
- object_def = f'"{" ws ({optional_parts[0]})? ws "}"'
244
- else:
245
- comma_separated = ' ws "," ws '.join(optional_parts)
246
- object_def = f'"{" ws ({comma_separated})? ws "}"'
247
- else:
248
- # Has required fields
249
- all_parts = required_parts.copy()
250
-
251
- # Add optional parts as truly optional (with optional commas)
252
- for opt_part in optional_parts:
253
- all_parts.append(f'(ws "," ws {opt_part})?')
254
-
255
- if len(all_parts) == 1:
256
- object_def = f'"{" ws {all_parts[0]} ws "}"'
257
- else:
258
- # Join required parts with commas, optional parts are already with optional commas
259
- required_with_commas = ' ws "," ws '.join(required_parts)
260
- optional_with_commas = ' '.join([f'(ws "," ws {opt})?' for opt in optional_parts])
261
-
262
- if optional_with_commas:
263
- object_def = f'"{{" ws {required_with_commas} {optional_with_commas} ws "}}"'
264
- else:
265
- object_def = f'"{{" ws {required_with_commas} ws "}}"'
266
-
267
- add_rule(type_name, object_def)
268
- return type_name
269
-
270
- elif schema_type == "array":
271
- # Handle array type
272
- items_schema = schema_part.get("items", {})
273
- items_type_name = f"{type_name}_items"
274
- item_type = process_type(items_schema, f"{type_name}_item")
275
-
276
- # Create array items rule
277
- add_rule(items_type_name, f"{item_type} (ws \",\" ws {item_type})*")
278
- add_rule(type_name, f'"[" ws ({items_type_name})? ws "]"')
279
- return type_name
280
-
281
- elif schema_type == "string":
282
- # Handle string type with enum support
283
- if "enum" in schema_part:
284
- enum_values = schema_part["enum"]
285
- enum_options = ' | '.join([f'"\\"" "{val}" "\\""' for val in enum_values])
286
- add_rule(type_name, enum_options)
287
- return type_name
288
- else:
289
- return "string"
290
-
291
- elif schema_type == "number" or schema_type == "integer":
292
- return "number"
293
 
294
- elif schema_type == "boolean":
295
- return "boolean"
296
 
297
- else:
298
- return "string" # fallback
299
-
300
- # Process root schema
301
- process_type(schema, root_name)
302
-
303
- # Basic GBNF rules for primitives
304
- basic_rules = [
305
- 'ws ::= [ \\t\\n]*',
306
- 'string ::= "\\"" char* "\\""',
307
- 'char ::= [^"\\\\] | "\\\\" (["\\\\bfnrt] | "u" hex hex hex hex)',
308
- 'hex ::= [0-9a-fA-F]',
309
- 'number ::= "-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?',
310
- 'boolean ::= "true" | "false"',
311
- 'null ::= "null"'
312
- ]
313
-
314
- # Add basic rules only if they haven't been added yet
315
- for rule in basic_rules:
316
- rule_name = rule.split(' ::= ')[0]
317
- if rule_name not in rule_names:
318
- rules.append(rule)
319
- rule_names.add(rule_name)
320
-
321
- return "\\n".join(rules)
322
-
323
  def generate_structured_response(self,
324
  prompt: str,
325
  json_schema: Union[str, Dict[str, Any]],
@@ -360,17 +311,21 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
360
  generation_params = {
361
  "max_tokens": Config.MAX_NEW_TOKENS,
362
  "temperature": Config.TEMPERATURE,
 
 
 
363
  "echo": False
364
  }
365
 
366
  # Add grammar or stop tokens based on mode
367
  if use_grammar and grammar is not None:
368
  generation_params["grammar"] = grammar
369
- # For grammar mode, use a simpler prompt without schema explanation
370
- simple_prompt = f"User: {prompt}\n\nAssistant:"
371
  response = self.llm(simple_prompt, **generation_params)
372
  else:
373
- generation_params["stop"] = ["User:", "\n\n", "Assistant:", "Human:"]
 
374
  response = self.llm(formatted_prompt, **generation_params)
375
 
376
  # Extract generated text
@@ -385,11 +340,7 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
385
  if json_start != -1 and json_end > json_start:
386
  json_str = generated_text[json_start:json_end]
387
  parsed_response = json.loads(json_str)
388
- return {
389
- "success": True,
390
- "data": parsed_response,
391
- "raw_response": generated_text
392
- }
393
  else:
394
  return {
395
  "error": "Could not find JSON in model response",
@@ -408,6 +359,99 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
408
  "error": f"Generation error: {str(e)}"
409
  }
410
 
411
  def test_grammar_generation(json_schema_str: str) -> Dict[str, Any]:
412
  """
413
  Test grammar generation without running the full model
@@ -457,6 +501,43 @@ def process_request(prompt: str,
457
  result = llm_client.generate_structured_response(prompt, json_schema, image, use_grammar)
458
  return json.dumps(result, ensure_ascii=False, indent=2)
459
 
460
  # Examples for demonstration
461
  example_schema = """{
462
  "type": "object",
@@ -502,89 +583,12 @@ def create_gradio_interface():
502
  else:
503
  gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
504
 
505
- with gr.Row():
506
- with gr.Column():
507
- prompt_input = gr.Textbox(
508
- label="Prompt for model",
509
- placeholder="Enter your request...",
510
- lines=5,
511
- value=example_prompt
512
- )
513
-
514
- image_input = gr.Image(
515
- label="Image (optional, for multimodal models)",
516
- type="pil"
517
- )
518
-
519
- schema_input = gr.Textbox(
520
- label="JSON schema for response structure",
521
- placeholder="Enter JSON schema...",
522
- lines=15,
523
- value=example_schema
524
- )
525
-
526
- grammar_checkbox = gr.Checkbox(
527
- label="🔗 Use Grammar (GBNF) Mode",
528
- value=True,
529
- info="Enable grammar-based structured output for more precise JSON generation"
530
- )
531
-
532
- submit_btn = gr.Button("Generate Response", variant="primary")
533
-
534
- with gr.Column():
535
- output = gr.Textbox(
536
- label="Structured Response",
537
- lines=20,
538
- interactive=False
539
- )
540
-
541
- submit_btn.click(
542
- fn=process_request,
543
- inputs=[prompt_input, schema_input, image_input, grammar_checkbox],
544
- outputs=output
545
- )
546
-
547
- # Examples
548
- gr.Markdown("## 📋 Usage Examples")
549
-
550
- examples = gr.Examples(
551
- examples=[
552
- [
553
- "Describe today's weather in New York",
554
- """{
555
- "type": "object",
556
- "properties": {
557
- "temperature": {"type": "number"},
558
- "description": {"type": "string"},
559
- "humidity": {"type": "number"}
560
- }
561
- }""",
562
- None
563
- ],
564
- [
565
- "Create a Python learning plan for one month",
566
- """{
567
- "type": "object",
568
- "properties": {
569
- "weeks": {
570
- "type": "array",
571
- "items": {
572
- "type": "object",
573
- "properties": {
574
- "week_number": {"type": "integer"},
575
- "topics": {"type": "array", "items": {"type": "string"}},
576
- "practice_hours": {"type": "number"}
577
- }
578
- }
579
- },
580
- "total_hours": {"type": "number"}
581
- }
582
- }""",
583
- None
584
- ]
585
- ],
586
- inputs=[prompt_input, schema_input, image_input]
587
- )
588
 
589
  # Model information
590
  gr.Markdown(f"""
@@ -612,10 +616,155 @@ def create_gradio_interface():
612
  - Strict enforcement of JSON structure during generation
613
  - Support for objects, arrays, strings, numbers, booleans, and enums
614
  - Improved consistency and reliability of structured outputs
 
 
 
 
 
 
615
  """)
616
 
617
  return demo
618
 
619
  if __name__ == "__main__":
620
  # Create and launch Gradio interface
621
  demo = create_gradio_interface()
@@ -623,5 +772,5 @@ if __name__ == "__main__":
623
  server_name=Config.HOST,
624
  server_port=Config.GRADIO_PORT,
625
  share=False,
626
- debug=True
627
  )
 
1
+ import os
2
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
3
+ os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
4
+ os.environ.setdefault("MKL_NUM_THREADS", "1")
5
+ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
6
+
7
  import json
8
  import os
9
  import gradio as gr
 
15
 
16
  # Try to import llama_cpp with fallback
17
  try:
18
+ from llama_cpp import Llama, LlamaGrammar, LlamaRAMCache
19
  LLAMA_CPP_AVAILABLE = True
20
  except ImportError as e:
21
  print(f"Warning: llama-cpp-python not available: {e}")
 
33
  hf_hub_download = None
34
 
35
  # Setup logging
36
+ log_level = getattr(logging, Config.LOG_LEVEL.upper())
37
+ logging.basicConfig(level=log_level)
38
  logger = logging.getLogger(__name__)
39
 
40
+ # Reduce llama-cpp-python verbosity
41
+ llama_logger = logging.getLogger('llama_cpp')
42
+ llama_logger.setLevel(logging.WARNING)
43
+
44
  class StructuredOutputRequest(BaseModel):
45
  prompt: str
46
  image: Optional[str] = None # base64 encoded image
 
155
  lora_base=None,
156
  lora_path=None,
157
  seed=Config.SEED,
158
+ verbose=False # Disable verbose to reduce log noise
159
  )
160
+ # cache = LlamaRAMCache()
161
+ # self.llm.set_cache(cache)
162
 
163
  logger.info("Model successfully loaded and initialized")
164
 
165
  # Test model with a simple prompt to verify it's working
166
+ from time import time
167
  logger.info("Testing model with simple prompt...")
168
+ start_time = time()
169
+ test_response = self.llm("Hello", max_tokens=1, temperature=1.0, top_k=64, top_p=0.95, min_p=0.0)
170
+ logger.info(f"Model test time: {time() - start_time:.2f} seconds, response: {test_response}")
171
  logger.info("Model test successful")
172
 
173
  except Exception as e:
 
191
 
192
  def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
193
  """
194
+ Format prompt for structured output generation using Gemma chat format
195
  """
196
  schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
197
 
198
+ # Use Gemma chat format with proper tokens
199
+ formatted_prompt = f"""<bos><start_of_turn>user
200
+ {prompt}
201
 
202
  Please respond in strict accordance with the following JSON schema:
203
 
 
205
  {schema_str}
206
  ```
207
 
208
+ Return ONLY valid JSON without additional comments or explanations.<end_of_turn>
209
+ <start_of_turn>model
210
+ """
211
 
212
  return formatted_prompt
213
 
214
+ def _format_gemma_chat(self, messages: list) -> str:
215
+ """
216
+ Format messages in Gemma chat format
 
217
 
218
+ Args:
219
+ messages: List of dicts with 'role' and 'content' keys
220
+ role can be 'user' or 'model'
221
+ """
222
+ formatted_parts = ["<bos>"]
223
 
224
+ for message in messages:
225
+ role = message.get('role', 'user')
226
+ content = message.get('content', '')
 
227
 
228
+ if role not in ['user', 'model']:
229
+ role = 'user' # fallback to user role
230
 
231
+ formatted_parts.append(f"<start_of_turn>{role}")
232
+ formatted_parts.append(content)
233
+ formatted_parts.append("<end_of_turn>")
234
+
235
+ # Add start of model response
236
+ formatted_parts.append("<start_of_turn>model")
237
+
238
+ return "\n".join(formatted_parts)
239
+
240
+ def generate_chat_response(self, messages: list, max_tokens: int = None) -> str:
241
+ """
242
+ Generate response using Gemma chat format
243
+
244
+ Args:
245
+ messages: List of message dicts with 'role' and 'content' keys
246
+ max_tokens: Maximum tokens for generation
247
+
248
+ Returns:
249
+ Generated response text
250
+ """
251
+ if not messages:
252
+ raise ValueError("Messages list cannot be empty")
253
+
254
+ # Format messages using Gemma chat format
255
+ formatted_prompt = self._format_gemma_chat(messages)
256
+
257
+ # Set generation parameters
258
+ generation_params = {
259
+ "max_tokens": max_tokens or Config.MAX_NEW_TOKENS,
260
+ "temperature": Config.TEMPERATURE,
261
+ "top_k": 64,
262
+ "top_p": 0.95,
263
+ "min_p": 0.0,
264
+ "echo": False,
265
+ "stop": ["<end_of_turn>", "<start_of_turn>", "<bos>"]
266
+ }
267
+
268
+ # Generate response
269
+ response = self.llm(formatted_prompt, **generation_params)
270
+ generated_text = response['choices'][0]['text'].strip()
271
+
272
+ return generated_text
273
+
274
  def generate_structured_response(self,
275
  prompt: str,
276
  json_schema: Union[str, Dict[str, Any]],
 
311
  generation_params = {
312
  "max_tokens": Config.MAX_NEW_TOKENS,
313
  "temperature": Config.TEMPERATURE,
314
+ "top_k": 64,
315
+ "top_p": 0.95,
316
+ "min_p": 0.0,
317
  "echo": False
318
  }
319
 
320
  # Add grammar or stop tokens based on mode
321
  if use_grammar and grammar is not None:
322
  generation_params["grammar"] = grammar
323
+ # For grammar mode, use a simpler prompt in Gemma format
324
+ simple_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
325
  response = self.llm(simple_prompt, **generation_params)
326
  else:
327
+ # Update stop tokens for Gemma format
328
+ generation_params["stop"] = ["<end_of_turn>", "<start_of_turn>", "<bos>"]
329
  response = self.llm(formatted_prompt, **generation_params)
330
 
331
  # Extract generated text
 
340
  if json_start != -1 and json_end > json_start:
341
  json_str = generated_text[json_start:json_end]
342
  parsed_response = json.loads(json_str)
343
+ return parsed_response
 
 
 
 
344
  else:
345
  return {
346
  "error": "Could not find JSON in model response",
 
359
  "error": f"Generation error: {str(e)}"
360
  }
361
 
362
+ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str:
363
+ """Convert JSON schema to GBNF (Backus-Naur Form) grammar for structured output"""
364
+ rules = {} # Use dict to maintain order and avoid duplicates
365
+
366
+ def add_rule(name: str, definition: str):
367
+ if name not in rules:
368
+ rules[name] = f"{name} ::= {definition}"
369
+
370
+ def process_type(schema_part: Dict[str, Any], type_name: str = "value") -> str:
371
+ if "type" not in schema_part:
372
+ # Handle anyOf, oneOf, allOf cases - simplified to string for now
373
+ return "string"
374
+
375
+ schema_type = schema_part["type"]
376
+
377
+ if schema_type == "object":
378
+ # Handle object type
379
+ properties = schema_part.get("properties", {})
380
+ required = schema_part.get("required", [])
381
+
382
+ if not properties:
383
+ add_rule(type_name, '"{" ws "}"')
384
+ return type_name
385
+
386
+ # Build object properties
387
+ property_rules = []
388
+
389
+ for prop_name, prop_schema in properties.items():
390
+ prop_type_name = f"{type_name}_{prop_name}"
391
+ prop_type = process_type(prop_schema, prop_type_name)
392
+ property_rules.append(f'"\\"" "{prop_name}" "\\"" ws ":" ws {prop_type}')
393
+
394
+ # Create a simplified object structure with all properties as required
395
+ # This avoids complex optional field handling that can cause parsing issues
396
+ if len(property_rules) == 1:
397
+ object_def = f'"{{" ws {property_rules[0]} ws "}}"'
398
+ else:
399
+ properties_joined = ' ws "," ws '.join(property_rules)
400
+ object_def = f'"{{" ws {properties_joined} ws "}}"'
401
+
402
+ add_rule(type_name, object_def)
403
+ return type_name
404
+
405
+ elif schema_type == "array":
406
+ # Handle array type
407
+ items_schema = schema_part.get("items", {})
408
+ items_type_name = f"{type_name}_items"
409
+ item_type = process_type(items_schema, f"{type_name}_item")
410
+
411
+ # Create array items rule
412
+ add_rule(items_type_name, f"{item_type} (ws \",\" ws {item_type})*")
413
+ add_rule(type_name, f'"[" ws ({items_type_name})? ws "]"')
414
+ return type_name
415
+
416
+ elif schema_type == "string":
417
+ # Handle string type with enum support
418
+ if "enum" in schema_part:
419
+ enum_values = schema_part["enum"]
420
+ enum_options = ' | '.join([f'"\\"" "{val}" "\\""' for val in enum_values])
421
+ add_rule(type_name, enum_options)
422
+ return type_name
423
+ else:
424
+ return "string"
425
+
426
+ elif schema_type == "number" or schema_type == "integer":
427
+ return "number"
428
+
429
+ elif schema_type == "boolean":
430
+ return "boolean"
431
+
432
+ else:
433
+ return "string" # fallback
434
+
435
+ # First add basic GBNF rules for primitives to ensure they come first
436
+ basic_rules_data = [
437
+ ('ws', '[ \\t\\n]*'),
438
+ ('string', '"\\"" char* "\\""'),
439
+ ('char', '[^"\\\\] | "\\\\" (["\\\\bfnrt] | "u" hex hex hex hex)'),
440
+ ('hex', '[0-9a-fA-F]'),
441
+ ('number', '"-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?'),
442
+ ('boolean', '"true" | "false"'),
443
+ ('null', '"null"')
444
+ ]
445
+
446
+ for rule_name, rule_def in basic_rules_data:
447
+ add_rule(rule_name, rule_def)
448
+
449
+ # Process root schema to build all custom rules
450
+ process_type(schema, root_name)
451
+
452
+ # Return rules in the order they were added
453
+ return "\n".join(rules.values())
454
+
455
  def test_grammar_generation(json_schema_str: str) -> Dict[str, Any]:
456
  """
457
  Test grammar generation without running the full model
 
501
  result = llm_client.generate_structured_response(prompt, json_schema, image, use_grammar)
502
  return json.dumps(result, ensure_ascii=False, indent=2)
503
 
504
+ def test_gemma_chat(messages_text: str) -> str:
505
+ """
506
+ Test Gemma chat format with example conversation
507
+ """
508
+ if llm_client is None:
509
+ return "Error: LLM client not initialized"
510
+
511
+ try:
512
+ # Parse messages from text (simple format: role:message per line)
513
+ messages = []
514
+ for line in messages_text.strip().split('\n'):
515
+ if ':' in line:
516
+ role, content = line.split(':', 1)
517
+ role = role.strip().lower()
518
+ content = content.strip()
519
+ if role in ['user', 'model']:
520
+ messages.append({"role": role, "content": content})
521
+
522
+ if not messages:
523
+ # Use default example if no valid messages provided
524
+ messages = [
525
+ {"role": "user", "content": "Hello!"},
526
+ {"role": "model", "content": "Hey there!"},
527
+ {"role": "user", "content": "What is 1+1?"}
528
+ ]
529
+
530
+ # Generate formatted prompt to show the structure
531
+ formatted_prompt = llm_client._format_gemma_chat(messages)
532
+
533
+ # Generate response
534
+ response = llm_client.generate_chat_response(messages, max_tokens=100)
535
+
536
+ return f"Formatted prompt:\n{formatted_prompt}\n\nGenerated response:\n{response}"
537
+
538
+ except Exception as e:
539
+ return f"Error: {str(e)}"
540
+
541
  # Examples for demonstration
542
  example_schema = """{
543
  "type": "object",
 
583
  else:
584
  gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
585
 
586
+ with gr.Tabs():
587
+ with gr.TabItem("🔧 Structured Output"):
588
+ create_structured_output_tab()
589
+
590
+ with gr.TabItem("💬 Gemma Chat Format"):
591
+ create_gemma_chat_tab()
 
592
 
593
  # Model information
594
  gr.Markdown(f"""
 
616
  - Strict enforcement of JSON structure during generation
617
  - Support for objects, arrays, strings, numbers, booleans, and enums
618
  - Improved consistency and reliability of structured outputs
619
+
620
+ 📝 **Gemma Format Features**:
621
+ - Uses proper Gemma chat tokens: `<bos>`, `<start_of_turn>`, `<end_of_turn>`
622
+ - Supports multi-turn conversations with user/model roles
623
+ - Compatible with Gemma model's expected input format
624
+ - Improved response quality with proper token structure
625
  """)
626
 
627
  return demo
628
 
629
+ def create_structured_output_tab():
630
+ """Create structured output tab"""
631
+ with gr.Row():
632
+ with gr.Column():
633
+ prompt_input = gr.Textbox(
634
+ label="Prompt for model",
635
+ placeholder="Enter your request...",
636
+ lines=5,
637
+ value=example_prompt
638
+ )
639
+
640
+ image_input = gr.Image(
641
+ label="Image (optional, for multimodal models)",
642
+ type="pil"
643
+ )
644
+
645
+ schema_input = gr.Textbox(
646
+ label="JSON schema for response structure",
647
+ placeholder="Enter JSON schema...",
648
+ lines=15,
649
+ value=example_schema
650
+ )
651
+
652
+ grammar_checkbox = gr.Checkbox(
653
+ label="🔗 Use Grammar (GBNF) Mode",
654
+ value=True,
655
+ info="Enable grammar-based structured output for more precise JSON generation"
656
+ )
657
+
658
+ submit_btn = gr.Button("Generate Response", variant="primary")
659
+
660
+ with gr.Column():
661
+ output = gr.Textbox(
662
+ label="Structured Response",
663
+ lines=20,
664
+ interactive=False
665
+ )
666
+
667
+ submit_btn.click(
668
+ fn=process_request,
669
+ inputs=[prompt_input, schema_input, image_input, grammar_checkbox],
670
+ outputs=output
671
+ )
672
+
673
+ # Examples
674
+ gr.Markdown("## 📋 Usage Examples")
675
+
676
+ examples = gr.Examples(
677
+ examples=[
678
+ [
679
+ "Describe today's weather in New York",
680
+ """{
681
+ "type": "object",
682
+ "properties": {
683
+ "temperature": {"type": "number"},
684
+ "description": {"type": "string"},
685
+ "humidity": {"type": "number"}
686
+ }
687
+ }""",
688
+ None
689
+ ],
690
+ [
691
+ "Create a Python learning plan for one month",
692
+ """{
693
+ "type": "object",
694
+ "properties": {
695
+ "weeks": {
696
+ "type": "array",
697
+ "items": {
698
+ "type": "object",
699
+ "properties": {
700
+ "week_number": {"type": "integer"},
701
+ "topics": {"type": "array", "items": {"type": "string"}},
702
+ "practice_hours": {"type": "number"}
703
+ }
704
+ }
705
+ },
706
+ "total_hours": {"type": "number"}
707
+ }
708
+ }""",
709
+ None
710
+ ]
711
+ ],
712
+ inputs=[prompt_input, schema_input, image_input]
713
+ )
714
+
715
+ def create_gemma_chat_tab():
716
+ """Create Gemma chat format demonstration tab"""
717
+ gr.Markdown("## 💬 Gemma Chat Format Demo")
718
+ gr.Markdown("This tab demonstrates the Gemma chat format with `<bos>`, `<start_of_turn>`, and `<end_of_turn>` tokens.")
719
+
720
+ with gr.Row():
721
+ with gr.Column():
722
+ messages_input = gr.Textbox(
723
+ label="Conversation Messages (format: role: message per line)",
724
+ placeholder="user: Hello!\nmodel: Hey there!\nuser: What is 1+1?",
725
+ lines=8,
726
+ value="user: Hello!\nmodel: Hey there!\nuser: What is 1+1?"
727
+ )
728
+
729
+ test_btn = gr.Button("Test Gemma Format", variant="primary")
730
+
731
+ with gr.Column():
732
+ chat_output = gr.Textbox(
733
+ label="Formatted Prompt and Response",
734
+ lines=15,
735
+ interactive=False
736
+ )
737
+
738
+ test_btn.click(
739
+ fn=test_gemma_chat,
740
+ inputs=messages_input,
741
+ outputs=chat_output
742
+ )
743
+
744
+ # Example explanation
745
+ gr.Markdown("""
746
+ ### 📝 Format Explanation
747
+
748
+ The Gemma chat format uses special tokens to structure conversations:
749
+ - `<bos>` - Beginning of sequence
750
+ - `<start_of_turn>user` - Start user message
751
+ - `<end_of_turn>` - End current message
752
+ - `<start_of_turn>model` - Start model response
753
+
754
+ **Example structure:**
755
+ ```
756
+ <bos><start_of_turn>user
757
+ Hello!<end_of_turn>
758
+ <start_of_turn>model
759
+ Hey there!<end_of_turn>
760
+ <start_of_turn>user
761
+ What is 1+1?<end_of_turn>
762
+ <start_of_turn>model
763
+ ```
764
+
765
+ This format is now used for both structured output and regular chat generation.
766
+ """)
767
+
768
  if __name__ == "__main__":
769
  # Create and launch Gradio interface
770
  demo = create_gradio_interface()
 
772
  server_name=Config.HOST,
773
  server_port=Config.GRADIO_PORT,
774
  share=False,
775
+ debug=False
776
  )
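A short usage sketch for the chat helpers added above, assuming the module-level `llm_client` from `app.py` has been initialized:

```python
# Build a multi-turn conversation in the role/content shape expected by the new helpers.
messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "model", "content": "Hey there!"},
    {"role": "user", "content": "What is 1+1?"},
]

# Inspect the Gemma-formatted prompt that will be sent to the model.
print(llm_client._format_gemma_chat(messages))

# Generate the next model turn (token-bounded to keep the demo fast).
reply = llm_client.generate_chat_response(messages, max_tokens=64)
print(reply)
```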
config.py CHANGED
@@ -5,19 +5,19 @@ class Config:
     """Application configuration for working with local GGUF models"""
 
     # Model settings - using Hugging Face downloaded model
-    MODEL_REPO: str = os.getenv("MODEL_REPO", "lmstudio-community/gemma-3n-E4B-it-text-GGUF")
-    MODEL_FILENAME: str = os.getenv("MODEL_FILENAME", "gemma-3n-E4B-it-Q8_0.gguf")
-    MODEL_PATH: str = os.getenv("MODEL_PATH", "/app/models/gemma-3n-E4B-it-Q8_0.gguf")
+    MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
+    MODEL_FILENAME = "gemma-3-270m-it-Q8_0.gguf"
+    MODEL_PATH = f"/app/models/{MODEL_FILENAME}"
     HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
 
     # Model loading settings - optimized for Docker container
-    N_CTX: int = int(os.getenv("N_CTX", "4096")) # Reduced context window for Docker
+    N_CTX: int = int(os.getenv("N_CTX", "1024")) # Reduced context window for Docker
     N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0")) # CPU-only for Docker by default
-    N_THREADS: int = int(os.getenv("N_THREADS", "4")) # Conservative thread count
+    N_THREADS: int = int(os.getenv("N_THREADS", "2")) # Conservative thread count
     N_BATCH: int = int(os.getenv("N_BATCH", "512")) # Smaller batch size for Docker
     USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true" # Disabled for Docker
     USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true" # Keep memory mapping
-    F16_KV: bool = os.getenv("F16_KV", "true").lower() == "true" # Use 16-bit keys and values
+    F16_KV: bool = os.getenv("F16_KV", "false").lower() == "true" # Use 16-bit keys and values
     SEED: int = int(os.getenv("SEED", "42")) # Random seed for reproducibility
 
     # Server settings - Docker compatible
@@ -25,9 +25,12 @@ class Config:
     GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860")) # Standard HuggingFace Spaces port
     API_PORT: int = int(os.getenv("API_PORT", "8000"))
 
+    # Logging settings
+    LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO") # INFO, WARNING, ERROR, DEBUG
+
     # Generation settings - optimized for Docker
     MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256")) # Reduced for faster response
-    TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.1"))
+    TEMPERATURE: float = 1.0
 
     # File upload settings
     MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760")) # 10MB
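Most of the Docker-tuned settings above still read from the environment, so they can be overridden without rebuilding the image; a minimal sketch (the values are examples only):

```python
# Environment overrides must be in place before config.py is imported,
# e.g. set them with `docker run -e N_CTX=2048 -e N_THREADS=4 ...`.
import os
os.environ["N_CTX"] = "2048"
os.environ["N_THREADS"] = "4"

from config import Config
print(Config.N_CTX, Config.N_THREADS)  # -> 2048 4
```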
requirements.txt CHANGED
@@ -1,6 +1,4 @@
 huggingface_hub==0.25.2
-# Core ML dependencies - updated for compatibility with gemma-3n-E4B model
-# https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp310-cp310-linux_x86_64.whl
 
 # Web interface
 gradio==4.44.1
test.ipynb DELETED
@@ -1,24 +0,0 @@
{
 "cells": [],
 "metadata": {
  "kernelspec": {
   "display_name": "py310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73ff502f10b7d2c985879796fc80ea212a71a9114bf26b90b7bd70c2842ba967
size 4259580