.gitignore CHANGED
@@ -1,7 +1,14 @@
1
  # Virtual environment
2
  myenv/
3
  venv/
4
- env/
 
 
 
5
 
6
  # Python
7
  __pycache__/
@@ -9,24 +16,29 @@ __pycache__/
9
  *$py.class
10
  *.so
11
  .Python
12
- *.egg
13
- *.egg-info/
14
- dist/
15
  build/
 
 
 
16
  eggs/
17
- *.egg-info/
18
  .eggs/
19
-
20
- # Models
21
- models/
22
- main/models/
 
 
 
 
 
23
 
24
  # IDE
25
- .vscode/
26
  .idea/
 
27
  *.swp
28
- *~
29
-
30
- # OS
31
  .DS_Store
32
- Thumbs.db
 
 
 
 
1
+ # Environment files
2
+ .env
3
+ .env.*
4
+
5
  # Virtual environment
6
  myenv/
7
  venv/
8
+ ENV/
9
+
10
+ # Model checkpoints
11
+ checkpoints/
12
 
13
  # Python
14
  __pycache__/
 
16
  *$py.class
17
  *.so
18
  .Python
 
 
 
19
  build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
  eggs/
 
24
  .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
 
35
  # IDE
 
36
  .idea/
37
+ .vscode/
38
  *.swp
39
+ *.swo
 
 
40
  .DS_Store
41
+
42
+ # Logs
43
+ *.log
44
+ logs/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/Inference-Server.iml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="JAVA_MODULE" version="4">
3
+ <component name="NewModuleRootManager" inherit-compiler-output="true">
4
+ <exclude-output />
5
+ <content url="file://$MODULE_DIR$" />
6
+ <orderEntry type="inheritedJdk" />
7
+ <orderEntry type="sourceFolder" forTests="false" />
8
+ </component>
9
+ </module>
.idea/misc.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK">
4
+ <output url="file://$PROJECT_DIR$/out" />
5
+ </component>
6
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Inference-Server.iml" filepath="$PROJECT_DIR$/.idea/Inference-Server.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
=0.45.0 DELETED
@@ -1,30 +0,0 @@
1
- Collecting bitsandbytes
2
- Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
3
- Requirement already satisfied: torch in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (2.5.1)
4
- Requirement already satisfied: numpy in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (2.2.1)
5
- Requirement already satisfied: typing_extensions>=4.8.0 in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (4.12.2)
6
- Requirement already satisfied: filelock in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.16.1)
7
- Requirement already satisfied: networkx in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.4.2)
8
- Requirement already satisfied: jinja2 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.1.5)
9
- Requirement already satisfied: fsspec in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (2024.12.0)
10
- Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
11
- Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
12
- Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
13
- Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (9.1.0.70)
14
- Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.5.8)
15
- Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (11.2.1.3)
16
- Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (10.3.5.147)
17
- Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (11.6.1.9)
18
- Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.3.1.170)
19
- Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (2.21.5)
20
- Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
21
- Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
22
- Requirement already satisfied: triton==3.1.0 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.1.0)
23
- Requirement already satisfied: setuptools in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (75.8.0)
24
- Requirement already satisfied: sympy==1.13.1 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (1.13.1)
25
- Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./myenv/lib/python3.12/site-packages (from sympy==1.13.1->torch->bitsandbytes) (1.3.0)
26
- Requirement already satisfied: MarkupSafe>=2.0 in ./myenv/lib/python3.12/site-packages (from jinja2->torch->bitsandbytes) (3.0.2)
27
- Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
28
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 69.1/69.1 MB 64.0 MB/s eta 0:00:00
29
- Installing collected packages: bitsandbytes
30
- Successfully installed bitsandbytes-0.45.0
Dockerfile CHANGED
@@ -1,19 +1,59 @@
1
- # Use Python 3.12 slim image as base
2
- FROM python:3.12-slim
3
-
4
- RUN useradd -m -u 1000 user
5
- USER user
6
- ENV PATH="/home/user/.local/bin:$PATH"
7
 
 
8
  WORKDIR /app
9
 
10
- COPY --chown=user ./requirements.txt requirements.txt
11
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
 
13
- COPY --chown=user . /app
14
- COPY --chown=user main/ /app/main
15
 
16
- EXPOSE 7860
 
 
 
17
 
18
- # We run the app object in the app.py file in the main folder.
19
- CMD ["uvicorn", "main.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # Use Python 3.10 as base image for better compatibility with ML libraries
2
+ FROM python:3.10-slim
 
 
 
 
3
 
4
+ # Set working directory
5
  WORKDIR /app
6
 
7
+ # Install git and required system dependencies
8
+ RUN apt-get update && \
9
+ apt-get install -y git && \
10
+ apt-get clean && \
11
+ rm -rf /var/lib/apt/lists/*
12
+
13
+ # Create cache directory and set permissions
14
+ RUN mkdir -p /app/.cache/huggingface && \
15
+ chmod 777 /app/.cache/huggingface
16
+
17
+ # Set environment variables for cache
18
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/hub
19
+ ENV HF_HOME=/app/.cache/huggingface
20
+
21
+ # Copy requirements first to leverage Docker cache
22
+ COPY requirements.txt .
23
+
24
+ # Install Python dependencies
25
+ RUN pip install --no-cache-dir -r requirements.txt
26
+
27
+ # Copy the rest of the application
28
+ COPY . .
29
+
30
+ # Create checkpoints directory with proper permissions
31
+ RUN mkdir -p /app/checkpoints && \
32
+ chmod 777 /app/checkpoints
33
+
34
+ # The token will be passed during build time
35
+ ARG HF_TOKEN
36
+ ENV HF_TOKEN=${HF_TOKEN}
37
+
38
+ # Download the Llama 2 model using litgpt
39
+ # Only proceed if HF_TOKEN is provided
40
+ RUN if [ -n "$HF_TOKEN" ]; then \
41
+ python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints')"; \
42
+ else \
43
+ echo "No Hugging Face token provided. Model will need to be downloaded separately."; \
44
+ fi
45
+
46
+ # Set environment variables
47
+ ENV LLM_ENGINE_HOST=0.0.0.0
48
+ ENV LLM_ENGINE_PORT=8001
49
 
50
+ # Update MODEL_PATH for the new model
51
+ ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
52
 
53
+ # Expose both ports:
54
+ # 8001 for FastAPI
55
+ # 7860 for Hugging Face Spaces
56
+ EXPOSE 8001 7860
57
 
58
+ # Command to run the application
59
+ CMD ["python", "main/main.py"]
README.md CHANGED
@@ -1,162 +1,28 @@
1
  ---
2
- title: LLMServer
3
- emoji: 👹
4
  colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- # LLM Server
11
 
12
- This repository contains a FastAPI-based server that serves open-source Large Language Models from Hugging Face.
13
 
14
- ## Getting Started
15
 
16
- These instructions will help you set up and run the project on your local machine.
17
 
18
- ### Prerequisites
19
 
20
- - Python 3.10 or higher
21
- - Git
22
 
23
- ### Cloning the Repository
24
-
25
- Choose one of the following methods to clone the repository:
26
-
27
- #### HTTPS
28
- ```bash
29
- git clone https://huggingface.co/spaces/TeamGenKI/LLMServer
30
- cd project-name
31
- ```
32
-
33
- #### SSH
34
- ```bash
35
- git clone [email protected]:spaces/TeamGenKI/LLMServer
36
- cd project-name
37
- ```
38
-
39
- ### Setting Up the Virtual Environment
40
-
41
- #### Windows
42
- ```bash
43
- # Create virtual environment
44
- python -m venv myenv
45
-
46
- # Activate virtual environment
47
- myenv\Scripts\activate
48
-
49
- # Install dependencies
50
- pip install -r requirements.txt
51
- ```
52
-
53
- #### Linux
54
- ```bash
55
- # Create virtual environment
56
- python -m venv myenv
57
-
58
- # Activate virtual environment
59
- source myenv/bin/activate
60
-
61
- # Install dependencies
62
- pip install -r requirements.txt
63
- ```
64
-
65
- #### macOS
66
- ```bash
67
- # Create virtual environment
68
- python3 -m venv myenv
69
-
70
- # Activate virtual environment
71
- source myenv/bin/activate
72
-
73
- # Install dependencies
74
- pip3 install -r requirements.txt
75
  ```
76
-
77
- ### Running the Application
78
-
79
- Once you have set up your environment and installed the dependencies, you can start the FastAPI application:
80
-
81
- ```bash
82
- uvicorn main.app:app --reload
83
- ```
84
-
85
- The API will be available at `http://localhost:8001`
86
-
87
- ### API Documentation
88
-
89
- Once the application is running, you can access:
90
- - Interactive API documentation (Swagger UI) at `http://localhost:8000/docs`
91
- - Alternative API documentation (ReDoc) at `http://localhost:8000/redoc`
92
-
93
- ### Deactivating the Virtual Environment
94
-
95
- When you're done working on the project, you can deactivate the virtual environment:
96
-
97
- ```bash
98
- deactivate
99
- ```
100
-
101
- ## Contributing
102
-
103
- [Add contributing guidelines here]
104
-
105
- ## License
106
-
107
- [Add license information here]
108
-
109
- ## Project Structure
110
-
111
- ```
112
- .
113
- ├── Dockerfile
114
- ├── main
115
- │ ├── api.py
116
- │ ├── app.py
117
- │ ├── config.yaml
118
- │ ├── env_template
119
- │ ├── __init__.py
120
- │ ├── logs
121
- │ │ └── llm_api.log
122
- │ ├── models
123
- │ ├── __pycache__
124
- │ │ ├── api.cpython-39.pyc
125
- │ │ ├── app.cpython-39.pyc
126
- │ │ ├── __init__.cpython-39.pyc
127
- │ │ └── routes.cpython-39.pyc
128
- │ ├── routes.py
129
- │ ├── test_locally.py
130
- │ └── utils
131
- │ ├── errors.py
132
- │ ├── helpers.py
133
- │ ├── __init__.py
134
- │ ├── logging.py
135
- │ ├── __pycache__
136
- │ │ ├── helpers.cpython-39.pyc
137
- │ │ ├── __init__.cpython-39.pyc
138
- │ │ ├── logging.cpython-39.pyc
139
- │ │ └── validation.cpython-39.pyc
140
- │ └── validation.py
141
- ├── README.md
142
- └── requirements.txt
143
- ```
144
-
145
- ERROR:
146
-
147
- INFO: 127.0.0.1:60874 - "POST /api/v1/model/download?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 200 OK
148
- 2025-01-13 16:18:45,409 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
149
- 2025-01-13 16:18:45,409 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
150
- 2025-01-13 16:18:45,412 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
151
- The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
152
- Could not find the bitsandbytes CUDA binary at PosixPath('/home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.13/site-packages/bitsandbytes/libbitsandbytes_cuda124.so')
153
- g++ (GCC) 14.2.1 20240910
154
- Copyright (C) 2024 Free Software Foundation, Inc.
155
- This is free software; see the source for copying conditions. There is NO
156
- warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
157
-
158
- 2025-01-13 16:18:45,982 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
159
- Dynamo is not supported on Python 3.13+
160
- 2025-01-13 16:18:45,982 - api_routes - ERROR - Error initializing model: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
161
- Dynamo is not supported on Python 3.13+
162
- INFO: 127.0.0.1:38330 - "POST /api/v1/model/initialize?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 500 Internal Server Error
 
1
  ---
2
+ title: LLM Engine
3
+ emoji: 🐨
4
  colorFrom: indigo
5
+ colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ short_description: LLM Engine for Team Gen KI (GPU goes here)
9
  ---
10
 
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
 
13
 
14
+ HTTP Clone:
15
 
16
+ git clone https://huggingface.co/spaces/TeamGenKI/LLM-Engine
17
 
18
+ SSH Clone:
19
 
20
+ git clone git@hf.co:spaces/TeamGenKI/LLM-Engine
 
21
 
22
+ ```mermaid
23
+ folders
24
+ LLM-Engine
25
+ Main
26
+ main.py
27
+ utils.py
 
28
  ```
main/.cache/hub/version.txt DELETED
@@ -1 +0,0 @@
1
- 1
 
 
main/api.py CHANGED
@@ -1,352 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from threading import Thread
4
- import torch
5
- from typing import Optional, List, AsyncIterator
6
- import asyncio
7
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
8
- from .utils.logging import setup_logger
9
-
10
- class LLMApi:
11
- def __init__(self, config: dict):
12
- """Initialize the LLM API with configuration."""
13
- self.logger = setup_logger(config, "llm_api")
14
- self.logger.info("Initializing LLM API")
15
-
16
- # Set up paths
17
- self.base_path = Path(config["model"]["base_path"])
18
- self.models_path = self.base_path / config["folders"]["models"]
19
- self.cache_path = self.base_path / config["folders"]["cache"]
20
-
21
- # Initialize model variables for both generation and embedding
22
- self.generation_model = None
23
- self.generation_model_name = None
24
- self.embedding_model = None
25
- self.embedding_model_name = None
26
- self.tokenizer = None
27
- self.embedding_tokenizer = None
28
-
29
- # Generation parameters from config
30
- gen_config = config["model"]["generation"]
31
- self.max_new_tokens = gen_config["max_new_tokens"]
32
- self.do_sample = gen_config["do_sample"]
33
- self.temperature = gen_config["temperature"]
34
- self.repetition_penalty = gen_config["repetition_penalty"]
35
-
36
- self.generation_config = {
37
- "max_new_tokens": self.max_new_tokens,
38
- "do_sample": self.do_sample,
39
- "temperature": self.temperature,
40
- "repetition_penalty": self.repetition_penalty,
41
- "eos_token_id": None,
42
- "pad_token_id": None
43
- }
44
-
45
- # Create necessary directories
46
- self.models_path.mkdir(parents=True, exist_ok=True)
47
- self.cache_path.mkdir(parents=True, exist_ok=True)
48
-
49
- # Set cache directory for transformers
50
- os.environ['HF_HOME'] = str(self.cache_path)
51
-
52
- self.logger.info("LLM API initialized successfully")
53
-
54
- def download_model(self, model_name: str) -> None:
55
- """
56
- Download a model and its tokenizer to the models directory.
57
-
58
- Args:
59
- model_name: The name of the model to download (e.g., "norallm/normistral-11b-warm")
60
- """
61
- self.logger.info(f"Starting download of model: {model_name}")
62
- try:
63
- model_path = self.models_path / model_name.split('/')[-1]
64
-
65
- # Download and save model
66
- self.logger.info(f"Enabling stdout logging for download")
67
- self.logger.enable_stream_to_logger()
68
- model = AutoModelForCausalLM.from_pretrained(model_name)
69
-
70
- # Download and save tokenizer
71
- tokenizer = AutoTokenizer.from_pretrained(model_name)
72
- self.logger.info(f"Disabling stdout logging")
73
- self.logger.disable_stream_to_logger()
74
-
75
- self.logger.info(f"Saving model to {model_path}")
76
- model.save_pretrained(model_path)
77
- tokenizer.save_pretrained(model_path)
78
-
79
- self.logger.info(f"Successfully downloaded model: {model_name}")
80
- except Exception as e:
81
- self.logger.error(f"Failed to download model {model_name}: {str(e)}")
82
- raise
83
-
84
- def initialize_model(self, model_name: str) -> None:
85
- """
86
- Initialize a model and tokenizer for text generation.
87
- Handles different platforms (CUDA, MPS, CPU) appropriately.
88
- """
89
- self.logger.info(f"Initializing generation model: {model_name}")
90
- try:
91
- self.generation_model_name = model_name
92
- local_model_path = self.models_path / model_name.split('/')[-1]
93
-
94
- # Check if model exists locally
95
- if local_model_path.exists():
96
- self.logger.info(f"Loading model from local path: {local_model_path}")
97
- model_path = local_model_path
98
- else:
99
- self.logger.info(f"Loading model from source: {model_name}")
100
- model_path = model_name
101
-
102
- # Check platform and set appropriate configuration
103
- if torch.cuda.is_available():
104
- self.logger.info("CUDA detected, using GPU with quantization")
105
- quantization_config = BitsAndBytesConfig(
106
- load_in_8bit=True,
107
- llm_int8_threshold=3.0
108
- )
109
- self.generation_model = AutoModelForCausalLM.from_pretrained(
110
- model_path,
111
- device_map="auto",
112
- quantization_config=quantization_config,
113
- torch_dtype=torch.float16
114
- )
115
- elif torch.backends.mps.is_available():
116
- self.logger.info("Apple Silicon detected, using MPS device")
117
- self.generation_model = AutoModelForCausalLM.from_pretrained(
118
- model_path,
119
- device_map="mps",
120
- torch_dtype=torch.float16
121
- )
122
- else:
123
- self.logger.info("No GPU detected, falling back to CPU")
124
- self.generation_model = AutoModelForCausalLM.from_pretrained(
125
- model_path,
126
- device_map="cpu",
127
- torch_dtype=torch.float32 # Use full precision for CPU
128
- )
129
-
130
- self.tokenizer = AutoTokenizer.from_pretrained(model_path)
131
-
132
- # Update generation config with tokenizer-specific values
133
- self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
134
- self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id
135
-
136
- self.logger.info(f"Successfully initialized generation model: {model_name}")
137
- except Exception as e:
138
- self.logger.error(f"Failed to initialize generation model {model_name}: {str(e)}")
139
- raise
140
-
141
- def initialize_embedding_model(self, model_name: str) -> None:
142
- """
143
- Initialize a model and tokenizer specifically for embeddings.
144
-
145
- Args:
146
- model_name: The name of the model to initialize for embeddings
147
- """
148
- self.logger.info(f"Initializing embedding model: {model_name}")
149
- try:
150
- self.embedding_model_name = model_name
151
- local_model_path = self.models_path / model_name.split('/')[-1]
152
-
153
- # Check if model exists locally
154
- if local_model_path.exists():
155
- self.logger.info(f"Loading embedding model from local path: {local_model_path}")
156
- model_path = local_model_path
157
- else:
158
- self.logger.info(f"Loading embedding model from source: {model_name}")
159
- model_path = model_name
160
-
161
- self.embedding_model = AutoModelForCausalLM.from_pretrained(
162
- model_path,
163
- device_map="auto",
164
- load_in_8bit=True,
165
- torch_dtype=torch.float16
166
- )
167
- self.embedding_tokenizer = AutoTokenizer.from_pretrained(model_path)
168
-
169
- self.logger.info(f"Successfully initialized embedding model: {model_name}")
170
- except Exception as e:
171
- self.logger.error(f"Failed to initialize embedding model {model_name}: {str(e)}")
172
- raise
173
-
174
- def has_chat_template(self) -> bool:
175
- """Check if the current model has a chat template."""
176
- try:
177
- self.tokenizer.apply_chat_template(
178
- [{"role": "user", "content": "test"}],
179
- tokenize=False,
180
- )
181
- return True
182
- except (ValueError, AttributeError):
183
- return False
184
-
185
- def _prepare_prompt(self, prompt: str, system_message: Optional[str] = None) -> str:
186
- """
187
- Prepare the prompt text, either using the model's chat template if available,
188
- or falling back to a simple OpenAI-style format.
189
- """
190
- try:
191
- messages = []
192
- if system_message:
193
- messages.append({"role": "system", "content": system_message})
194
- messages.append({"role": "user", "content": prompt})
195
-
196
- return self.tokenizer.apply_chat_template(
197
- messages,
198
- tokenize=False,
199
- add_generation_prompt=True
200
- )
201
- except (ValueError, AttributeError):
202
- template = ""
203
- if system_message:
204
- template += f"System: {system_message}\n\n"
205
- template += f"User: {prompt}\n\nAssistant: "
206
- return template
207
-
208
- def generate_response(
209
- self,
210
- prompt: str,
211
- system_message: Optional[str] = None,
212
- max_new_tokens: Optional[int] = None
213
- ) -> str:
214
- """
215
- Generate a complete response for the given prompt.
216
- """
217
- self.logger.debug(f"Generating response for prompt: {prompt[:50]}...")
218
-
219
- if self.generation_model is None:
220
- raise RuntimeError("Generation model not initialized. Call initialize_model first.")
221
-
222
- try:
223
- text = self._prepare_prompt(prompt, system_message)
224
- inputs = self.tokenizer([text], return_tensors="pt")
225
-
226
- # Remove token_type_ids if present
227
- model_inputs = {k: v.to(self.generation_model.device) for k, v in inputs.items()
228
- if k != 'token_type_ids'}
229
-
230
- generation_config = self.generation_config.copy()
231
- if max_new_tokens:
232
- generation_config["max_new_tokens"] = max_new_tokens
233
-
234
- generated_ids = self.generation_model.generate(
235
- **model_inputs,
236
- **generation_config
237
- )
238
-
239
- generated_ids = [
240
- output_ids[len(input_ids):]
241
- for input_ids, output_ids in zip(model_inputs['input_ids'], generated_ids)
242
- ]
243
-
244
- response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
245
- self.logger.debug(f"Generated response: {response[:50]}...")
246
- return response
247
-
248
- except Exception as e:
249
- self.logger.error(f"Error generating response: {str(e)}")
250
- raise
251
-
252
- async def generate_stream(
253
- self,
254
- prompt: str,
255
- system_message: Optional[str] = None,
256
- max_new_tokens: Optional[int] = None
257
- ) -> AsyncIterator[str]:
258
- """
259
- Generate a streaming response for the given prompt.
260
- """
261
- self.logger.debug(f"Starting streaming generation for prompt: {prompt[:50]}...")
262
-
263
- if self.generation_model is None:
264
- raise RuntimeError("Generation model not initialized. Call initialize_model first.")
265
-
266
- try:
267
- text = self._prepare_prompt(prompt, system_message)
268
- inputs = self.tokenizer([text], return_tensors="pt")
269
-
270
- # Remove token_type_ids if present
271
- model_inputs = {k: v.to(self.generation_model.device) for k, v in inputs.items()
272
- if k != 'token_type_ids'}
273
-
274
- # Configure generation
275
- generation_config = self.generation_config.copy()
276
- if max_new_tokens:
277
- generation_config["max_new_tokens"] = max_new_tokens
278
-
279
- # Set up streaming
280
- streamer = TextIteratorStreamer(self.tokenizer)
281
- generation_kwargs = dict(
282
- **model_inputs,
283
- **generation_config,
284
- streamer=streamer
285
- )
286
-
287
- # Create a thread to run the generation
288
- thread = Thread(target=self.generation_model.generate, kwargs=generation_kwargs)
289
- thread.start()
290
-
291
- # Use async generator to yield chunks
292
- for new_text in streamer:
293
- self.logger.debug(f"Generated chunk: {new_text[:50]}...")
294
- yield new_text
295
- # Add a small delay to allow other tasks to run
296
- await asyncio.sleep(0)
297
-
298
- except Exception as e:
299
- self.logger.error(f"Error in streaming generation: {str(e)}")
300
- raise
301
-
302
- def generate_embedding(self, text: str) -> List[float]:
303
- """
304
- Generate a single embedding vector for a chunk of text using the dedicated embedding model.
305
- Returns a list of floats representing the text embedding.
306
- """
307
- self.logger.debug(f"Generating embedding for text: {text[:50]}...")
308
-
309
- if self.embedding_model is None or self.embedding_tokenizer is None:
310
- raise RuntimeError("Embedding model not initialized. Call initialize_embedding_model first.")
311
-
312
- try:
313
- # Tokenize the input text and ensure input_ids are Long type
314
- inputs = self.embedding_tokenizer(text, return_tensors='pt')
315
- input_ids = inputs.input_ids.to(dtype=torch.long, device=self.embedding_model.device)
316
-
317
- # Get the model's dtype from its parameters for the attention mask
318
- model_dtype = next(self.embedding_model.parameters()).dtype
319
-
320
- # Create an attention mask with matching dtype
321
- attention_mask = torch.zeros(
322
- input_ids.size(0),
323
- 1,
324
- input_ids.size(1),
325
- input_ids.size(1),
326
- device=input_ids.device,
327
- dtype=model_dtype
328
- )
329
-
330
- # Get model outputs
331
- with torch.no_grad():
332
- outputs = self.embedding_model(
333
- input_ids=input_ids,
334
- attention_mask=attention_mask,
335
- output_hidden_states=True,
336
- return_dict=True
337
- )
338
-
339
- # Get the last hidden state
340
- last_hidden_state = outputs.hidden_states[-1]
341
-
342
- # Average the hidden state over all tokens (excluding padding)
343
- embedding = last_hidden_state[0].mean(dim=0)
344
-
345
- # Convert to regular Python list
346
- embedding_list = embedding.cpu().tolist()
347
- self.logger.debug(f"Generated embedding of length: {len(embedding_list)}")
348
- return embedding_list
349
-
350
- except Exception as e:
351
- self.logger.error(f"Error generating embedding: {str(e)}")
352
- raise
 
 
 
 
main/app.py DELETED
@@ -1,55 +0,0 @@
1
- import yaml
2
- from fastapi import FastAPI
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from .routes import router, init_router
5
- from .utils.logging import setup_logger
6
- from .utils.validation import validate_hf
7
- from .utils.helpers import load_config
8
-
9
-
10
- config = load_config()
11
- logger = setup_logger(config, "main")
12
-
13
- def create_app():
14
- global config
15
- global logger
16
-
17
- validate_hf(setup_logger, config)
18
- logger.info("Starting LLM API server")
19
-
20
- app = FastAPI(
21
- title="LLM API",
22
- description="API for Large Language Model operations",
23
- version=config["api"]["version"]
24
- )
25
-
26
- # Add CORS middleware
27
- app.add_middleware(
28
- CORSMiddleware,
29
- allow_origins=config["api"]["cors"]["origins"],
30
- allow_credentials=config["api"]["cors"]["credentials"],
31
- allow_methods=["*"],
32
- allow_headers=["*"],
33
- )
34
-
35
- # Initialize routes with config
36
- init_router(config)
37
-
38
- app.include_router(router, prefix=f"{config['api']['prefix']}/{config['api']['version']}")
39
-
40
- logger.info("FastAPI application created successfully")
41
- return app
42
-
43
- app = create_app()
44
-
45
- if __name__ == "__main__":
46
- host = config["server"]["host"]
47
- port = config["server"]["port"]
48
- import uvicorn
49
- uvicorn.run(
50
- app,
51
- host=host,
52
- port=port,
53
- log_level=config["logging"]["level"].lower()
54
- )
55
- logger.info(f"LLM API server started on {host}:{port}")
 
main/env_template DELETED
@@ -1,26 +0,0 @@
1
- # Hugging Face Authentication
2
- HF_TOKEN=your_token_here
3
-
4
- # CUDA Device Configuration
5
- CUDA_VISIBLE_DEVICES=0,1 # Specify GPUs to use (e.g., 0 for first GPU, 0,1 for first two)
6
-
7
- # Memory Management
8
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
9
- CUDA_LAUNCH_BLOCKING=1 # Set to 1 for debugging
10
- CUDA_AUTO_BOOST=0 # Disable auto boost for consistent performance
11
-
12
- # Cache Paths
13
- CUDA_CACHE_PATH=/path/to/cuda/cache
14
- TRANSFORMERS_CACHE=/path/to/transformers/cache
15
-
16
- # Performance Settings
17
- TF_ENABLE_ONEDNN_OPTS=1
18
- TF_GPU_ALLOCATOR=cuda_malloc_async
19
-
20
- # Model Settings
21
- TRANSFORMERS_OFFLINE=0 # Set to 1 for offline mode
22
-
23
- # Logging
24
- LOG_LEVEL=INFO # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
25
-
26
- # Add any additional environment-specific variables below
main/logs/llm_api.log DELETED
@@ -1,703 +0,0 @@
1
- 2025-01-09 15:54:08,215 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
2
- 2025-01-09 15:54:08,215 - hf_validation - ERROR - No HF_TOKEN found in environment variables
3
- 2025-01-09 15:54:08,215 - main - INFO - Starting LLM API server
4
- 2025-01-09 15:54:08,216 - llm_api - INFO - Initializing LLM API
5
- 2025-01-09 15:54:08,216 - llm_api - INFO - LLM API initialized successfully
6
- 2025-01-09 15:54:08,216 - api_routes - INFO - Router initialized with LLM API instance
7
- 2025-01-09 15:54:08,218 - main - INFO - FastAPI application created successfully
8
- 2025-01-09 16:46:10,118 - api_routes - INFO - Received request to download model: microsoft/phi-4
9
- 2025-01-09 16:46:10,118 - llm_api - INFO - Starting download of model: microsoft/phi-4
10
- 2025-01-09 16:46:10,118 - llm_api - INFO - Enabling stdout logging for download
11
- 2025-01-09 17:00:32,400 - llm_api - INFO - Disabling stdout logging
12
- 2025-01-09 17:00:32,400 - llm_api - INFO - Saving model to main/models/phi-4
13
- 2025-01-09 17:02:39,928 - llm_api - INFO - Successfully downloaded model: microsoft/phi-4
14
- 2025-01-09 17:02:41,075 - api_routes - INFO - Successfully downloaded model: microsoft/phi-4
15
- 2025-01-09 17:02:41,080 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
16
- 2025-01-09 17:02:41,080 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
17
- 2025-01-09 17:02:41,081 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
18
- 2025-01-09 17:02:41,377 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`
19
- 2025-01-09 17:02:41,377 - api_routes - ERROR - Error initializing model: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`
20
- 2025-01-09 17:11:25,843 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
21
- 2025-01-09 17:11:25,843 - hf_validation - ERROR - No HF_TOKEN found in environment variables
22
- 2025-01-09 17:11:25,843 - main - INFO - Starting LLM API server
23
- 2025-01-09 17:11:25,843 - llm_api - INFO - Initializing LLM API
24
- 2025-01-09 17:11:25,844 - llm_api - INFO - LLM API initialized successfully
25
- 2025-01-09 17:11:25,844 - api_routes - INFO - Router initialized with LLM API instance
26
- 2025-01-09 17:11:25,846 - main - INFO - FastAPI application created successfully
27
- 2025-01-09 17:11:38,299 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
28
- 2025-01-09 17:11:38,299 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
29
- 2025-01-09 17:11:38,299 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
30
- 2025-01-09 17:11:38,487 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
31
- 2025-01-09 17:11:38,487 - api_routes - ERROR - Error initializing model: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
32
- 2025-01-09 17:12:48,606 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
33
- 2025-01-09 17:12:48,606 - hf_validation - ERROR - No HF_TOKEN found in environment variables
34
- 2025-01-09 17:12:48,606 - main - INFO - Starting LLM API server
35
- 2025-01-09 17:12:48,606 - llm_api - INFO - Initializing LLM API
36
- 2025-01-09 17:12:48,606 - llm_api - INFO - LLM API initialized successfully
37
- 2025-01-09 17:12:48,606 - api_routes - INFO - Router initialized with LLM API instance
38
- 2025-01-09 17:12:48,608 - main - INFO - FastAPI application created successfully
39
- 2025-01-09 17:12:59,453 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
40
- 2025-01-09 17:12:59,453 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
41
- 2025-01-09 17:12:59,453 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
42
- 2025-01-09 17:12:59,628 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
43
- 2025-01-09 17:12:59,628 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
44
- 2025-01-09 17:14:44,390 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
45
- 2025-01-09 17:14:44,390 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
46
- 2025-01-09 17:14:44,390 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
47
- 2025-01-09 17:14:53,032 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
48
- 2025-01-09 17:14:53,032 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
49
- 2025-01-09 17:15:14,956 - api_routes - INFO - Received request to initialize model: microsoft/phi-4
50
- 2025-01-09 17:15:14,956 - llm_api - INFO - Initializing generation model: microsoft/phi-4
51
- 2025-01-09 17:15:14,956 - llm_api - INFO - Loading model from local path: main/models/phi-4
52
- 2025-01-09 17:15:14,965 - llm_api - ERROR - Failed to initialize generation model microsoft/phi-4: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
53
- 2025-01-09 17:15:14,965 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
54
- 2025-01-13 16:04:32,247 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
55
- 2025-01-13 16:04:32,247 - hf_validation - ERROR - No HF_TOKEN found in environment variables
56
- 2025-01-13 16:04:32,247 - main - INFO - Starting LLM API server
57
- 2025-01-13 16:04:32,248 - llm_api - INFO - Initializing LLM API
58
- 2025-01-13 16:04:32,248 - llm_api - INFO - LLM API initialized successfully
59
- 2025-01-13 16:04:32,248 - api_routes - INFO - Router initialized with LLM API instance
60
- 2025-01-13 16:04:32,252 - main - INFO - FastAPI application created successfully
61
- 2025-01-13 16:05:27,996 - api_routes - INFO - Received request to download model: microsoft/Phi-3.5-mini-instruct
62
- 2025-01-13 16:05:27,996 - llm_api - INFO - Starting download of model: microsoft/Phi-3.5-mini-instruct
63
- 2025-01-13 16:05:27,996 - llm_api - INFO - Enabling stdout logging for download
64
- 2025-01-13 16:08:46,773 - llm_api - INFO - Disabling stdout logging
65
- 2025-01-13 16:08:46,773 - llm_api - INFO - Saving model to main/models/Phi-3.5-mini-instruct
66
- 2025-01-13 16:10:23,543 - llm_api - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
67
- 2025-01-13 16:10:24,432 - api_routes - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
68
- 2025-01-13 16:18:45,409 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
69
- 2025-01-13 16:18:45,409 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
70
- 2025-01-13 16:18:45,412 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
71
- 2025-01-13 16:18:45,982 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
72
- Dynamo is not supported on Python 3.13+
73
- 2025-01-13 16:18:45,982 - api_routes - ERROR - Error initializing model: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
74
- Dynamo is not supported on Python 3.13+
75
- 2025-01-14 11:41:25,502 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
76
- 2025-01-14 11:41:25,502 - hf_validation - ERROR - No HF_TOKEN found in environment variables
77
- 2025-01-14 11:41:25,502 - main - INFO - Starting LLM API server
78
- 2025-01-14 11:41:25,503 - llm_api - INFO - Initializing LLM API
79
- 2025-01-14 11:41:25,503 - llm_api - INFO - LLM API initialized successfully
80
- 2025-01-14 11:41:25,503 - api_routes - INFO - Router initialized with LLM API instance
81
- 2025-01-14 11:41:25,509 - main - INFO - FastAPI application created successfully
82
- 2025-01-14 11:48:33,807 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
83
- 2025-01-14 11:48:33,807 - hf_validation - ERROR - No HF_TOKEN found in environment variables
84
- 2025-01-14 11:48:33,807 - main - INFO - Starting LLM API server
85
- 2025-01-14 11:48:33,807 - llm_api - INFO - Initializing LLM API
86
- 2025-01-14 11:48:33,807 - llm_api - INFO - LLM API initialized successfully
87
- 2025-01-14 11:48:33,807 - api_routes - INFO - Router initialized with LLM API instance
88
- 2025-01-14 11:48:33,812 - main - INFO - FastAPI application created successfully
89
- 2025-01-14 11:53:20,777 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
90
- 2025-01-14 11:53:20,777 - hf_validation - ERROR - No HF_TOKEN found in environment variables
91
- 2025-01-14 11:53:20,777 - main - INFO - Starting LLM API server
92
- 2025-01-14 11:53:20,777 - llm_api - INFO - Initializing LLM API
93
- 2025-01-14 11:53:20,778 - llm_api - INFO - LLM API initialized successfully
94
- 2025-01-14 11:53:20,778 - api_routes - INFO - Router initialized with LLM API instance
95
- 2025-01-14 11:53:20,783 - main - INFO - FastAPI application created successfully
96
- 2025-01-14 11:54:28,143 - api_routes - INFO - Received request to download model: microsoft/Phi-3.5-mini-instruct
97
- 2025-01-14 11:54:28,143 - llm_api - INFO - Starting download of model: microsoft/Phi-3.5-mini-instruct
98
- 2025-01-14 11:54:28,143 - llm_api - INFO - Enabling stdout logging for download
99
- 2025-01-14 11:54:47,061 - llm_api - INFO - Disabling stdout logging
100
- 2025-01-14 11:54:47,061 - llm_api - INFO - Saving model to main/models/Phi-3.5-mini-instruct
101
- 2025-01-14 11:56:40,600 - llm_api - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
102
- 2025-01-14 11:56:41,266 - api_routes - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
103
- 2025-01-14 11:56:41,364 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
104
- 2025-01-14 11:56:41,365 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
105
- 2025-01-14 11:56:41,367 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
106
- 2025-01-14 11:56:45,322 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: /home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.12/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cget_col_row_stats
107
- 2025-01-14 11:56:45,322 - api_routes - ERROR - Error initializing model: /home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.12/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cget_col_row_stats
108
- 2025-01-14 12:29:54,971 - main - INFO - LLM API server started on 0.0.0.0:8001
109
- 2025-01-14 12:30:01,275 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
110
- 2025-01-14 12:30:01,275 - hf_validation - ERROR - No HF_TOKEN found in environment variables
111
- 2025-01-14 12:30:01,275 - main - INFO - Starting LLM API server
112
- 2025-01-14 12:30:01,275 - llm_api - INFO - Initializing LLM API
113
- 2025-01-14 12:30:01,275 - llm_api - INFO - LLM API initialized successfully
114
- 2025-01-14 12:30:01,276 - api_routes - INFO - Router initialized with LLM API instance
115
- 2025-01-14 12:30:01,280 - main - INFO - FastAPI application created successfully
116
- 2025-01-14 12:31:15,345 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
117
- 2025-01-14 12:31:15,345 - hf_validation - ERROR - No HF_TOKEN found in environment variables
118
- 2025-01-14 12:31:15,345 - main - INFO - Starting LLM API server
119
- 2025-01-14 12:31:15,345 - llm_api - INFO - Initializing LLM API
120
- 2025-01-14 12:31:15,346 - llm_api - INFO - LLM API initialized successfully
121
- 2025-01-14 12:31:15,346 - api_routes - INFO - Router initialized with LLM API instance
122
- 2025-01-14 12:31:15,350 - main - INFO - FastAPI application created successfully
123
- 2025-01-14 12:31:43,376 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
124
- 2025-01-14 12:31:43,376 - hf_validation - ERROR - No HF_TOKEN found in environment variables
125
- 2025-01-14 12:31:43,376 - main - INFO - Starting LLM API server
126
- 2025-01-14 12:31:43,377 - llm_api - INFO - Initializing LLM API
127
- 2025-01-14 12:31:43,377 - llm_api - INFO - LLM API initialized successfully
128
- 2025-01-14 12:31:43,377 - api_routes - INFO - Router initialized with LLM API instance
129
- 2025-01-14 12:31:43,381 - main - INFO - FastAPI application created successfully
130
- 2025-01-14 12:31:51,142 - llm_api - INFO - INFO: 127.0.0.1:52554 - "GET /docs HTTP/1.1" 200 OK
131
- 2025-01-14 12:31:51,311 - llm_api - INFO - INFO: 127.0.0.1:52554 - "GET /openapi.json HTTP/1.1" 200 OK
132
- 2025-01-14 12:32:10,756 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
133
- 2025-01-14 12:32:10,757 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
134
- 2025-01-14 12:32:10,757 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
135
- 2025-01-14 12:32:26,447 - llm_api - INFO - Successfully initialized generation model: microsoft/Phi-3.5-mini-instruct
136
- 2025-01-14 12:32:26,448 - api_routes - INFO - Successfully initialized model: microsoft/Phi-3.5-mini-instruct
137
- 2025-01-14 12:32:26,448 - llm_api - INFO - INFO: 127.0.0.1:34282 - "POST /api/v1/model/initialize?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 200 OK
138
- 2025-01-14 12:33:13,272 - api_routes - INFO - Received generation request for prompt: Tell me about yourself, and your capabilities...
139
- 2025-01-14 12:33:13,272 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself, and your capabilities...
140
- 2025-01-14 12:33:46,448 - llm_api - DEBUG - Generated response: I am Phi, an AI language model developed by Micros...
141
- 2025-01-14 12:33:46,448 - api_routes - INFO - Successfully generated response
142
- 2025-01-14 12:33:46,448 - llm_api - INFO - INFO: 127.0.0.1:57442 - "POST /api/v1/generate HTTP/1.1" 200 OK
143
- 2025-01-14 12:34:41,588 - api_routes - INFO - Received streaming generation request for prompt: Tell me about yourself, and your capabilities...
144
- 2025-01-14 12:34:41,588 - llm_api - DEBUG - Starting streaming generation for prompt: Tell me about yourself, and your capabilities...
145
- 2025-01-14 12:34:41,608 - llm_api - DEBUG - Generated chunk: <|system|> You are a helpful assistant<|end|><|use...
146
- 2025-01-14 12:34:41,689 - llm_api - DEBUG - Generated chunk: capabilities<|end|><|assistant|> ...
147
- 2025-01-14 12:34:41,757 - llm_api - DEBUG - Generated chunk: I ...
148
- 2025-01-14 12:34:41,827 - llm_api - DEBUG - Generated chunk: am ...
149
- 2025-01-14 12:34:41,895 - llm_api - DEBUG - Generated chunk: ...
150
- 2025-01-14 12:34:41,965 - llm_api - DEBUG - Generated chunk: ...
151
- 2025-01-14 12:34:42,033 - llm_api - DEBUG - Generated chunk: Phi, ...
152
- 2025-01-14 12:34:42,102 - llm_api - DEBUG - Generated chunk: an ...
153
- 2025-01-14 12:34:42,172 - llm_api - DEBUG - Generated chunk: ...
154
- 2025-01-14 12:34:42,241 - llm_api - DEBUG - Generated chunk: AI ...
155
- 2025-01-14 12:34:42,309 - llm_api - DEBUG - Generated chunk: language ...
156
- 2025-01-14 12:34:42,377 - llm_api - DEBUG - Generated chunk: model ...
157
- 2025-01-14 12:34:42,448 - llm_api - DEBUG - Generated chunk: created ...
158
- 2025-01-14 12:34:42,521 - llm_api - DEBUG - Generated chunk: by ...
159
- 2025-01-14 12:34:42,590 - llm_api - DEBUG - Generated chunk: ...
160
- 2025-01-14 12:34:42,656 - llm_api - DEBUG - Generated chunk: Microsoft. ...
161
- 2025-01-14 12:34:42,721 - llm_api - DEBUG - Generated chunk: While ...
162
- 2025-01-14 12:34:42,788 - llm_api - DEBUG - Generated chunk: I ...
163
- 2025-01-14 12:34:42,854 - llm_api - DEBUG - Generated chunk: ...
164
- 2025-01-14 12:34:42,925 - llm_api - DEBUG - Generated chunk: ...
165
- 2025-01-14 12:34:42,991 - llm_api - DEBUG - Generated chunk: don't ...
166
- 2025-01-14 12:34:43,063 - llm_api - DEBUG - Generated chunk: have ...
167
- 2025-01-14 12:34:43,131 - llm_api - DEBUG - Generated chunk: personal ...
168
- 2025-01-14 12:34:43,201 - llm_api - DEBUG - Generated chunk: experiences ...
169
- 2025-01-14 12:34:43,267 - llm_api - DEBUG - Generated chunk: or ...
170
- 2025-01-14 12:34:43,334 - llm_api - DEBUG - Generated chunk: feelings ...
171
- 2025-01-14 12:34:43,402 - llm_api - DEBUG - Generated chunk: like ...
172
- 2025-01-14 12:34:43,472 - llm_api - DEBUG - Generated chunk: humans ...
173
- 2025-01-14 12:34:43,537 - llm_api - DEBUG - Generated chunk: ...
174
- 2025-01-14 12:34:43,600 - llm_api - DEBUG - Generated chunk: do, ...
175
- 2025-01-14 12:34:43,663 - llm_api - DEBUG - Generated chunk: let ...
176
- 2025-01-14 12:34:43,729 - llm_api - DEBUG - Generated chunk: me ...
177
- 2025-01-14 12:34:43,793 - llm_api - DEBUG - Generated chunk: tell ...
178
- 2025-01-14 12:34:43,859 - llm_api - DEBUG - Generated chunk: you ...
179
- 2025-01-14 12:34:43,924 - llm_api - DEBUG - Generated chunk: more ...
180
- 2025-01-14 12:34:43,989 - llm_api - DEBUG - Generated chunk: about ...
181
- 2025-01-14 12:34:44,053 - llm_api - DEBUG - Generated chunk: my ...
182
- 2025-01-14 12:34:44,119 - llm_api - DEBUG - Generated chunk: ...
183
- 2025-01-14 12:34:44,183 - llm_api - DEBUG - Generated chunk: ...
184
- 2025-01-14 12:34:44,247 - llm_api - DEBUG - Generated chunk: functionalities:
185
- ...
186
- 2025-01-14 12:34:44,312 - llm_api - DEBUG - Generated chunk:
187
- ...
188
- 2025-01-14 12:34:44,375 - llm_api - DEBUG - Generated chunk: ...
189
- 2025-01-14 12:34:44,440 - llm_api - DEBUG - Generated chunk: ...
190
- 2025-01-14 12:34:44,505 - llm_api - DEBUG - Generated chunk: 1. ...
191
- 2025-01-14 12:34:44,567 - llm_api - DEBUG - Generated chunk: ...
192
- 2025-01-14 12:34:44,632 - llm_api - DEBUG - Generated chunk: **Language ...
193
- 2025-01-14 12:34:44,698 - llm_api - DEBUG - Generated chunk: ...
194
- 2025-01-14 12:34:44,763 - llm_api - DEBUG - Generated chunk: ...
195
- 2025-01-14 12:34:44,828 - llm_api - DEBUG - Generated chunk: ...
196
- 2025-01-14 12:34:44,892 - llm_api - DEBUG - Generated chunk: Understanding**: ...
197
- 2025-01-14 12:34:44,956 - llm_api - DEBUG - Generated chunk: My ...
198
- 2025-01-14 12:34:45,020 - llm_api - DEBUG - Generated chunk: primary ...
199
- 2025-01-14 12:34:45,085 - llm_api - DEBUG - Generated chunk: ...
200
- 2025-01-14 12:34:45,148 - llm_api - DEBUG - Generated chunk: capability ...
201
- 2025-01-14 12:34:45,211 - llm_api - DEBUG - Generated chunk: is ...
202
- 2025-01-14 12:34:45,275 - llm_api - DEBUG - Generated chunk: understanding ...
203
- 2025-01-14 12:34:45,338 - llm_api - DEBUG - Generated chunk: natural ...
204
- 2025-01-14 12:34:45,401 - llm_api - DEBUG - Generated chunk: human ...
205
- 2025-01-14 12:34:45,465 - llm_api - DEBUG - Generated chunk: languages ...
206
- 2025-01-14 12:34:45,529 - llm_api - DEBUG - Generated chunk: to ...
207
- 2025-01-14 12:34:45,592 - llm_api - DEBUG - Generated chunk: the ...
208
- 2025-01-14 12:34:45,658 - llm_api - DEBUG - Generated chunk: best ...
209
- 2025-01-14 12:34:45,728 - llm_api - DEBUG - Generated chunk: of ...
210
- 2025-01-14 12:34:45,804 - llm_api - DEBUG - Generated chunk: our ...
211
- 2025-01-14 12:34:45,871 - llm_api - DEBUG - Generated chunk: current ...
212
- 2025-01-14 12:34:45,938 - llm_api - DEBUG - Generated chunk: technology ...
213
- 2025-01-14 12:34:46,004 - llm_api - DEBUG - Generated chunk: allows ...
214
- 2025-01-14 12:34:46,072 - llm_api - DEBUG - Generated chunk: ...
215
- 2025-01-14 12:34:46,139 - llm_api - DEBUG - Generated chunk: it. ...
216
- 2025-01-14 12:34:46,207 - llm_api - DEBUG - Generated chunk: This ...
217
- 2025-01-14 12:34:46,273 - llm_api - DEBUG - Generated chunk: includes ...
218
- 2025-01-14 12:34:46,341 - llm_api - DEBUG - Generated chunk: ...
219
- 2025-01-14 12:34:46,408 - llm_api - DEBUG - Generated chunk: interpreting ...
220
- 2025-01-14 12:34:46,473 - llm_api - DEBUG - Generated chunk: text ...
221
- 2025-01-14 12:34:46,539 - llm_api - DEBUG - Generated chunk: inputs ...
222
- 2025-01-14 12:34:46,605 - llm_api - DEBUG - Generated chunk: from ...
223
- 2025-01-14 12:34:46,670 - llm_api - DEBUG - Generated chunk: various ...
224
- 2025-01-14 12:34:46,735 - llm_api - DEBUG - Generated chunk: sources ...
225
- 2025-01-14 12:34:46,801 - llm_api - DEBUG - Generated chunk: such ...
226
- 2025-01-14 12:34:46,867 - llm_api - DEBUG - Generated chunk: as ...
227
- 2025-01-14 12:34:46,935 - llm_api - DEBUG - Generated chunk: ...
228
- 2025-01-14 12:34:47,005 - llm_api - DEBUG - Generated chunk: websites, ...
229
- 2025-01-14 12:34:47,073 - llm_api - DEBUG - Generated chunk: ...
230
- 2025-01-14 12:34:47,142 - llm_api - DEBUG - Generated chunk: books, ...
231
- 2025-01-14 12:34:47,208 - llm_api - DEBUG - Generated chunk: articles ...
232
- 2025-01-14 12:34:47,275 - llm_api - DEBUG - Generated chunk: ...
233
- 2025-01-14 12:34:47,341 - llm_api - DEBUG - Generated chunk: etc., ...
234
- 2025-01-14 12:34:47,408 - llm_api - DEBUG - Generated chunk: in ...
235
- 2025-01-14 12:34:47,472 - llm_api - DEBUG - Generated chunk: multiple ...
236
- 2025-01-14 12:34:47,536 - llm_api - DEBUG - Generated chunk: formats ...
237
- 2025-01-14 12:34:47,600 - llm_api - DEBUG - Generated chunk: including ...
238
- 2025-01-14 12:34:47,674 - llm_api - DEBUG - Generated chunk: ...
239
- 2025-01-14 12:34:47,744 - llm_api - DEBUG - Generated chunk: English, ...
240
- 2025-01-14 12:34:47,814 - llm_api - DEBUG - Generated chunk: ...
241
- 2025-01-14 12:34:47,901 - llm_api - DEBUG - Generated chunk: Spanish, ...
242
- 2025-01-14 12:34:47,991 - llm_api - DEBUG - Generated chunk: French ...
243
- 2025-01-14 12:34:48,066 - llm_api - DEBUG - Generated chunk: among ...
244
- 2025-01-14 12:34:48,131 - llm_api - DEBUG - Generated chunk: ...
245
- 2025-01-14 12:34:48,194 - llm_api - DEBUG - Generated chunk: others. ...
246
- 2025-01-14 12:34:48,259 - llm_api - DEBUG - Generated chunk:
247
- ...
248
- 2025-01-14 12:34:48,325 - llm_api - DEBUG - Generated chunk:
249
- ...
250
- 2025-01-14 12:34:48,390 - llm_api - DEBUG - Generated chunk: ...
251
- 2025-01-14 12:34:48,454 - llm_api - DEBUG - Generated chunk: ...
252
- 2025-01-14 12:34:48,519 - llm_api - DEBUG - Generated chunk: 2. ...
253
- 2025-01-14 12:34:48,583 - llm_api - DEBUG - Generated chunk: ...
254
- 2025-01-14 12:34:48,647 - llm_api - DEBUG - Generated chunk: **Text ...
255
- 2025-01-14 12:34:48,711 - llm_api - DEBUG - Generated chunk: Generation ...
256
- 2025-01-14 12:34:48,773 - llm_api - DEBUG - Generated chunk: & ...
257
- 2025-01-14 12:34:48,837 - llm_api - DEBUG - Generated chunk: ...
258
- 2025-01-14 12:34:48,902 - llm_api - DEBUG - Generated chunk: ...
259
- 2025-01-14 12:34:48,965 - llm_api - DEBUG - Generated chunk: ...
260
- 2025-01-14 12:34:49,031 - llm_api - DEBUG - Generated chunk: ...
261
- 2025-01-14 12:34:49,096 - llm_api - DEBUG - Generated chunk: ...
262
- 2025-01-14 12:34:49,160 - llm_api - DEBUG - Generated chunk: Comprehension**: ...
263
- 2025-01-14 12:34:49,224 - llm_api - DEBUG - Generated chunk: Based ...
264
- 2025-01-14 12:34:49,289 - llm_api - DEBUG - Generated chunk: on ...
265
- 2025-01-14 12:34:49,353 - llm_api - DEBUG - Generated chunk: patterns ...
266
- 2025-01-14 12:34:49,419 - llm_api - DEBUG - Generated chunk: learned ...
267
- 2025-01-14 12:34:49,488 - llm_api - DEBUG - Generated chunk: during ...
268
- 2025-01-14 12:34:49,555 - llm_api - DEBUG - Generated chunk: training ...
269
- 2025-01-14 12:34:49,621 - llm_api - DEBUG - Generated chunk: with ...
270
- 2025-01-14 12:34:49,686 - llm_api - DEBUG - Generated chunk: diverse ...
271
- 2025-01-14 12:34:49,750 - llm_api - DEBUG - Generated chunk: internet ...
272
- 2025-01-14 12:34:49,814 - llm_api - DEBUG - Generated chunk: texts ...
273
- 2025-01-14 12:34:49,879 - llm_api - DEBUG - Generated chunk: data ...
274
- 2025-01-14 12:34:49,944 - llm_api - DEBUG - Generated chunk: ...
275
- 2025-01-14 12:34:50,009 - llm_api - DEBUG - Generated chunk: sets, ...
276
- 2025-01-14 12:34:50,074 - llm_api - DEBUG - Generated chunk: I ...
277
- 2025-01-14 12:34:50,140 - llm_api - DEBUG - Generated chunk: can ...
278
- 2025-01-14 12:34:50,204 - llm_api - DEBUG - Generated chunk: generate ...
279
- 2025-01-14 12:34:50,269 - llm_api - DEBUG - Generated chunk: ...
280
- 2025-01-14 12:34:50,336 - llm_api - DEBUG - Generated chunk: ...
281
- 2025-01-14 12:34:50,401 - llm_api - DEBUG - Generated chunk: coherent ...
282
- 2025-01-14 12:34:50,466 - llm_api - DEBUG - Generated chunk: responses ...
283
- 2025-01-14 12:34:50,530 - llm_api - DEBUG - Generated chunk: that ...
284
- 2025-01-14 12:34:50,595 - llm_api - DEBUG - Generated chunk: ...
285
- 2025-01-14 12:34:50,658 - llm_api - DEBUG - Generated chunk: ...
286
- 2025-01-14 12:34:50,722 - llm_api - DEBUG - Generated chunk: mimic ...
287
- 2025-01-14 12:34:50,787 - llm_api - DEBUG - Generated chunk: how ...
288
- 2025-01-14 12:34:50,852 - llm_api - DEBUG - Generated chunk: real ...
289
- 2025-01-14 12:34:50,916 - llm_api - DEBUG - Generated chunk: people ...
290
- 2025-01-14 12:34:50,980 - llm_api - DEBUG - Generated chunk: ...
291
- 2025-01-14 12:34:51,043 - llm_api - DEBUG - Generated chunk: write. ...
292
- 2025-01-14 12:34:51,107 - llm_api - DEBUG - Generated chunk: ...
293
- 2025-01-14 12:34:51,171 - llm_api - DEBUG - Generated chunk: However, ...
294
- 2025-01-14 12:34:51,236 - llm_api - DEBUG - Generated chunk: please ...
295
- 2025-01-14 12:34:51,299 - llm_api - DEBUG - Generated chunk: note ...
296
- 2025-01-14 12:34:51,364 - llm_api - DEBUG - Generated chunk: these ...
297
- 2025-01-14 12:34:51,428 - llm_api - DEBUG - Generated chunk: generated ...
298
- 2025-01-14 12:34:51,491 - llm_api - DEBUG - Generated chunk: outputs ...
299
- 2025-01-14 12:34:51,556 - llm_api - DEBUG - Generated chunk: ...
300
- 2025-01-14 12:34:51,620 - llm_api - DEBUG - Generated chunk: ...
301
- 2025-01-14 12:34:51,685 - llm_api - DEBUG - Generated chunk: aren’t ...
302
- 2025-01-14 12:34:51,748 - llm_api - DEBUG - Generated chunk: perfect ...
303
- 2025-01-14 12:34:51,813 - llm_api - DEBUG - Generated chunk: nor ...
304
- 2025-01-14 12:34:51,877 - llm_api - DEBUG - Generated chunk: fully ...
305
- 2025-01-14 12:34:51,940 - llm_api - DEBUG - Generated chunk: accurate ...
306
- 2025-01-14 12:34:52,005 - llm_api - DEBUG - Generated chunk: but ...
307
- 2025-01-14 12:34:52,068 - llm_api - DEBUG - Generated chunk: they ...
308
- 2025-01-14 12:34:52,131 - llm_api - DEBUG - Generated chunk: often ...
309
- 2025-01-14 12:34:52,196 - llm_api - DEBUG - Generated chunk: make ...
310
- 2025-01-14 12:34:52,260 - llm_api - DEBUG - Generated chunk: sense ...
311
- 2025-01-14 12:34:52,324 - llm_api - DEBUG - Generated chunk: ...
312
- 2025-01-14 12:34:52,388 - llm_api - DEBUG - Generated chunk: contextually ...
313
- 2025-01-14 12:34:52,451 - llm_api - DEBUG - Generated chunk: within ...
314
- 2025-01-14 12:34:52,516 - llm_api - DEBUG - Generated chunk: given ...
315
- 2025-01-14 12:34:52,579 - llm_api - DEBUG - Generated chunk: ...
316
- 2025-01-14 12:34:52,643 - llm_api - DEBUG - Generated chunk: ...
317
- 2025-01-14 12:34:52,707 - llm_api - DEBUG - Generated chunk: prompts.
318
- ...
319
- 2025-01-14 12:34:52,771 - llm_api - DEBUG - Generated chunk:
320
- ...
321
- 2025-01-14 12:34:52,835 - llm_api - DEBUG - Generated chunk: ...
322
- 2025-01-14 12:34:52,899 - llm_api - DEBUG - Generated chunk: ...
323
- 2025-01-14 12:34:52,963 - llm_api - DEBUG - Generated chunk: 3. ...
324
- 2025-01-14 12:34:53,026 - llm_api - DEBUG - Generated chunk: ...
325
- 2025-01-14 12:34:53,095 - llm_api - DEBUG - Generated chunk: ...
326
- 2025-01-14 12:34:53,168 - llm_api - DEBUG - Generated chunk: ...
327
- 2025-01-14 12:34:53,237 - llm_api - DEBUG - Generated chunk: **Knowledge ...
328
- 2025-01-14 12:34:53,302 - llm_api - DEBUG - Generated chunk: Base ...
329
- 2025-01-14 12:34:53,371 - llm_api - DEBUG - Generated chunk: ...
330
- 2025-01-14 12:34:53,440 - llm_api - DEBUG - Generated chunk: ...
331
- 2025-01-14 12:34:53,508 - llm_api - DEBUG - Generated chunk: ...
332
- 2025-01-14 12:34:53,576 - llm_api - DEBUG - Generated chunk: Accessing**: ...
333
- 2025-01-14 12:34:53,642 - llm_api - DEBUG - Generated chunk: Although ...
334
- 2025-01-14 12:34:53,710 - llm_api - DEBUG - Generated chunk: not ...
335
- 2025-01-14 12:34:53,775 - llm_api - DEBUG - Generated chunk: connected ...
336
- 2025-01-14 12:34:53,838 - llm_api - DEBUG - Generated chunk: live ...
337
- 2025-01-14 12:34:53,903 - llm_api - DEBUG - Generated chunk: for ...
338
- 2025-01-14 12:34:53,968 - llm_api - DEBUG - Generated chunk: ...
339
- 2025-01-14 12:34:54,032 - llm_api - DEBUG - Generated chunk: browsing ...
340
- 2025-01-14 12:34:54,096 - llm_api - DEBUG - Generated chunk: external ...
341
- 2025-01-14 12:34:54,159 - llm_api - DEBUG - Generated chunk: databases ...
342
- 2025-01-14 12:34:54,224 - llm_api - DEBUG - Generated chunk: at ...
343
- 2025-01-14 12:34:54,287 - llm_api - DEBUG - Generated chunk: this ...
344
- 2025-01-14 12:34:54,351 - llm_api - DEBUG - Generated chunk: ...
345
- 2025-01-14 12:34:54,414 - llm_api - DEBUG - Generated chunk: moment, ...
346
- 2025-01-14 12:34:54,478 - llm_api - DEBUG - Generated chunk: information ...
347
- 2025-01-14 12:34:54,542 - llm_api - DEBUG - Generated chunk: up ...
348
- 2025-01-14 12:34:54,606 - llm_api - DEBUG - Generated chunk: until ...
349
- 2025-01-14 12:34:54,670 - llm_api - DEBUG - Generated chunk: September ...
350
- 2025-01-14 12:34:54,735 - llm_api - DEBUG - Generated chunk: ...
351
- 2025-01-14 12:34:54,799 - llm_api - DEBUG - Generated chunk: ...
352
- 2025-01-14 12:34:54,864 - llm_api - DEBUG - Generated chunk: ...
353
- 2025-01-14 12:34:54,928 - llm_api - DEBUG - Generated chunk: ...
354
- 2025-01-14 12:34:54,992 - llm_api - DEBUG - Generated chunk: 2021 ...
355
- 2025-01-14 12:34:55,056 - llm_api - DEBUG - Generated chunk: has ...
356
- 2025-01-14 12:34:55,120 - llm_api - DEBUG - Generated chunk: been ...
357
- 2025-01-14 12:34:55,184 - llm_api - DEBUG - Generated chunk: used ...
358
- 2025-01-14 12:34:55,249 - llm_api - DEBUG - Generated chunk: when ...
359
- 2025-01-14 12:34:55,313 - llm_api - DEBUG - Generated chunk: generating ...
360
- 2025-01-14 12:34:55,376 - llm_api - DEBUG - Generated chunk: answers ...
361
- 2025-01-14 12:34:55,441 - llm_api - DEBUG - Generated chunk: based ...
362
- 2025-01-14 12:34:55,504 - llm_api - DEBUG - Generated chunk: upon ...
363
- 2025-01-14 12:34:55,568 - llm_api - DEBUG - Generated chunk: extensive ...
364
- 2025-01-14 12:34:55,632 - llm_api - DEBUG - Generated chunk: datasets ...
365
- 2025-01-14 12:34:55,696 - llm_api - DEBUG - Generated chunk: which ...
366
- 2025-01-14 12:34:55,760 - llm_api - DEBUG - Generated chunk: include ...
367
- 2025-01-14 12:34:55,825 - llm_api - DEBUG - Generated chunk: facts ...
368
- 2025-01-14 12:34:55,889 - llm_api - DEBUG - Generated chunk: known ...
369
- 2025-01-14 12:34:55,953 - llm_api - DEBUG - Generated chunk: till ...
370
- 2025-01-14 12:34:56,017 - llm_api - DEBUG - Generated chunk: then ...
371
- 2025-01-14 12:34:56,082 - llm_api - DEBUG - Generated chunk: across ...
372
- 2025-01-14 12:34:56,146 - llm_api - DEBUG - Generated chunk: numerous ...
373
- 2025-01-14 12:34:56,210 - llm_api - DEBUG - Generated chunk: topics ...
374
- 2025-01-14 12:34:56,281 - llm_api - DEBUG - Generated chunk: ...
375
- 2025-01-14 12:34:56,346 - llm_api - DEBUG - Generated chunk: ranging ...
376
- 2025-01-14 12:34:56,410 - llm_api - DEBUG - Generated chunk: from ...
377
- 2025-01-14 12:34:56,474 - llm_api - DEBUG - Generated chunk: science ...
378
- 2025-01-14 12:34:56,537 - llm_api - DEBUG - Generated chunk: to ...
379
- 2025-01-14 12:34:56,631 - llm_api - DEBUG - Generated chunk: ...
380
- 2025-01-14 12:34:56,730 - llm_api - DEBUG - Generated chunk: arts, ...
381
- 2025-01-14 12:34:56,818 - llm_api - DEBUG - Generated chunk: ...
382
- 2025-01-14 12:34:56,886 - llm_api - DEBUG - Generated chunk: history, ...
383
- 2025-01-14 12:34:56,951 - llm_api - DEBUG - Generated chunk: ...
384
- 2025-01-14 12:34:57,016 - llm_api - DEBUG - Generated chunk: culture, ...
385
- 2025-01-14 12:34:57,080 - llm_api - DEBUG - Generated chunk: sports ...
386
- 2025-01-14 12:34:57,145 - llm_api - DEBUG - Generated chunk: amongst ...
387
- 2025-01-14 12:34:57,209 - llm_api - DEBUG - Generated chunk: many ...
388
- 2025-01-14 12:34:57,273 - llm_api - DEBUG - Generated chunk: other ...
389
- 2025-01-14 12:34:57,337 - llm_api - DEBUG - Generated chunk: ...
390
- 2025-01-14 12:34:57,401 - llm_api - DEBUG - Generated chunk: fields.
391
- ...
392
- 2025-01-14 12:34:57,467 - llm_api - DEBUG - Generated chunk:
393
- ...
394
- 2025-01-14 12:34:57,531 - llm_api - DEBUG - Generated chunk: ...
395
- 2025-01-14 12:34:57,595 - llm_api - DEBUG - Generated chunk: ...
396
- 2025-01-14 12:34:57,659 - llm_api - DEBUG - Generated chunk: 4. ...
397
- 2025-01-14 12:34:57,723 - llm_api - DEBUG - Generated chunk: ...
398
- 2025-01-14 12:34:57,786 - llm_api - DEBUG - Generated chunk: **Problem ...
399
- 2025-01-14 12:34:57,850 - llm_api - DEBUG - Generated chunk: ...
400
- 2025-01-14 12:34:57,914 - llm_api - DEBUG - Generated chunk: Solving ...
401
- 2025-01-14 12:34:57,978 - llm_api - DEBUG - Generated chunk: ...
402
- 2025-01-14 12:34:58,042 - llm_api - DEBUG - Generated chunk: ...
403
- 2025-01-14 12:34:58,106 - llm_api - DEBUG - Generated chunk: ...
404
- 2025-01-14 12:34:58,168 - llm_api - DEBUG - Generated chunk: Skills**: ...
405
- 2025-01-14 12:34:58,231 - llm_api - DEBUG - Generated chunk: In ...
406
- 2025-01-14 12:34:58,294 - llm_api - DEBUG - Generated chunk: certain ...
407
- 2025-01-14 12:34:58,357 - llm_api - DEBUG - Generated chunk: scenarios ...
408
- 2025-01-14 12:34:58,421 - llm_api - DEBUG - Generated chunk: where ...
409
- 2025-01-14 12:34:58,484 - llm_api - DEBUG - Generated chunk: logical ...
410
- 2025-01-14 12:34:58,549 - llm_api - DEBUG - Generated chunk: reasoning ...
411
- 2025-01-14 12:34:58,612 - llm_api - DEBUG - Generated chunk: might ...
412
- 2025-01-14 12:34:58,676 - llm_api - DEBUG - Generated chunk: be ...
413
- 2025-01-14 12:34:58,739 - llm_api - DEBUG - Generated chunk: required ...
414
- 2025-01-14 12:34:58,802 - llm_api - DEBUG - Generated chunk: ...
415
- 2025-01-14 12:34:58,867 - llm_api - DEBUG - Generated chunk: (like ...
416
- 2025-01-14 12:34:58,930 - llm_api - DEBUG - Generated chunk: math ...
417
- 2025-01-14 12:34:58,994 - llm_api - DEBUG - Generated chunk: ...
418
- 2025-01-14 12:34:59,057 - llm_api - DEBUG - Generated chunk: problems), ...
419
- 2025-01-14 12:34:59,120 - llm_api - DEBUG - Generated chunk: algorithms ...
420
- 2025-01-14 12:34:59,184 - llm_api - DEBUG - Generated chunk: enable ...
421
- 2025-01-14 12:34:59,248 - llm_api - DEBUG - Generated chunk: ...
422
- 2025-01-14 12:34:59,312 - llm_api - DEBUG - Generated chunk: ...
423
- 2025-01-14 12:34:59,376 - llm_api - DEBUG - Generated chunk: ...
424
- 2025-01-14 12:34:59,439 - llm_api - DEBUG - Generated chunk: problem-solving ...
425
- 2025-01-14 12:34:59,503 - llm_api - DEBUG - Generated chunk: ...
426
- 2025-01-14 12:34:59,567 - llm_api - DEBUG - Generated chunk: abilities ...
427
- 2025-01-14 12:34:59,631 - llm_api - DEBUG - Generated chunk: similar ...
428
- 2025-01-14 12:34:59,695 - llm_api - DEBUG - Generated chunk: those ...
429
- 2025-01-14 12:34:59,758 - llm_api - DEBUG - Generated chunk: found ...
430
- 2025-01-14 12:34:59,822 - llm_api - DEBUG - Generated chunk: commonly ...
431
- 2025-01-14 12:34:59,884 - llm_api - DEBUG - Generated chunk: seen ...
432
- 2025-01-14 12:34:59,948 - llm_api - DEBUG - Generated chunk: in ...
433
- 2025-01-14 12:35:00,011 - llm_api - DEBUG - Generated chunk: ...
434
- 2025-01-14 12:35:00,075 - llm_api - DEBUG - Generated chunk: calculators ...
435
- 2025-01-14 12:35:00,139 - llm_api - DEBUG - Generated chunk: yet ...
436
- 2025-01-14 12:35:00,202 - llm_api - DEBUG - Generated chunk: without ...
437
- 2025-01-14 12:35:00,266 - llm_api - DEBUG - Generated chunk: any ...
438
- 2025-01-14 12:35:00,329 - llm_api - DEBUG - Generated chunk: direct ...
439
- 2025-01-14 12:35:00,393 - llm_api - DEBUG - Generated chunk: interaction ...
440
- 2025-01-14 12:35:00,458 - llm_api - DEBUG - Generated chunk: beyond ...
441
- 2025-01-14 12:35:00,523 - llm_api - DEBUG - Generated chunk: what ...
442
- 2025-01-14 12:35:00,590 - llm_api - DEBUG - Generated chunk: was ...
443
- 2025-01-14 12:35:00,656 - llm_api - DEBUG - Generated chunk: provided ...
444
- 2025-01-14 12:35:00,721 - llm_api - DEBUG - Generated chunk: initially ...
445
- 2025-01-14 12:35:00,786 - llm_api - DEBUG - Generated chunk: into ...
446
- 2025-01-14 12:35:00,850 - llm_api - DEBUG - Generated chunk: them ...
447
- 2025-01-14 12:35:00,916 - llm_api - DEBUG - Generated chunk: - ...
448
- 2025-01-14 12:35:00,980 - llm_api - DEBUG - Generated chunk: no ...
449
- 2025-01-14 12:35:01,045 - llm_api - DEBUG - Generated chunk: memory ...
450
- 2025-01-14 12:35:01,109 - llm_api - DEBUG - Generated chunk: ...
451
- 2025-01-14 12:35:01,173 - llm_api - DEBUG - Generated chunk: retention ...
452
- 2025-01-14 12:35:01,239 - llm_api - DEBUG - Generated chunk: after ...
453
- 2025-01-14 12:35:01,305 - llm_api - DEBUG - Generated chunk: each ...
454
- 2025-01-14 12:35:01,372 - llm_api - DEBUG - Generated chunk: session ...
455
- 2025-01-14 12:35:01,438 - llm_api - DEBUG - Generated chunk: ends ...
456
- 2025-01-14 12:35:01,501 - llm_api - DEBUG - Generated chunk: due ...
457
- 2025-01-14 12:35:01,567 - llm_api - DEBUG - Generated chunk: to ...
458
- 2025-01-14 12:35:01,630 - llm_api - DEBUG - Generated chunk: design ...
459
- 2025-01-14 12:35:01,694 - llm_api - DEBUG - Generated chunk: ...
460
- 2025-01-14 12:35:01,759 - llm_api - DEBUG - Generated chunk: considerations ...
461
- 2025-01-14 12:35:01,823 - llm_api - DEBUG - Generated chunk: around ...
462
- 2025-01-14 12:35:01,887 - llm_api - DEBUG - Generated chunk: ...
463
- 2025-01-14 12:35:01,951 - llm_api - DEBUG - Generated chunk: privacy ...
464
- 2025-01-14 12:35:02,015 - llm_api - DEBUG - Generated chunk: protection ...
465
- 2025-01-14 12:35:02,079 - llm_api - DEBUG - Generated chunk: policies ...
466
- 2025-01-14 12:35:02,142 - llm_api - DEBUG - Generated chunk: followed ...
467
- 2025-01-14 12:35:02,207 - llm_api - DEBUG - Generated chunk: strictly ...
468
- 2025-01-14 12:35:02,271 - llm_api - DEBUG - Generated chunk: ...
469
- 2025-01-14 12:35:02,334 - llm_api - DEBUG - Generated chunk: ...
470
- 2025-01-14 12:35:02,399 - llm_api - DEBUG - Generated chunk: adhered ...
471
- 2025-01-14 12:35:02,462 - llm_api - DEBUG - Generated chunk: ...
472
- 2025-01-14 12:35:02,527 - llm_api - DEBUG - Generated chunk: too.
473
- ...
474
- 2025-01-14 12:35:02,592 - llm_api - DEBUG - Generated chunk:
475
- ...
476
- 2025-01-14 12:35:02,654 - llm_api - DEBUG - Generated chunk: ...
477
- 2025-01-14 12:35:02,716 - llm_api - DEBUG - Generated chunk: ...
478
- 2025-01-14 12:35:02,781 - llm_api - DEBUG - Generated chunk: 5. ...
479
- 2025-01-14 12:35:02,844 - llm_api - DEBUG - Generated chunk: ...
480
- 2025-01-14 12:35:02,908 - llm_api - DEBUG - Generated chunk: ...
481
- 2025-01-14 12:35:02,974 - llm_api - DEBUG - Generated chunk: ...
482
- 2025-01-14 12:35:03,043 - llm_api - DEBUG - Generated chunk: **Learning ...
483
- 2025-01-14 12:35:03,108 - llm_api - DEBUG - Generated chunk: ...
484
- 2025-01-14 12:35:03,173 - llm_api - DEBUG - Generated chunk: ...
485
- 2025-01-14 12:35:03,239 - llm_api - DEBUG - Generated chunk: ...
486
- 2025-01-14 12:35:03,303 - llm_api - DEBUG - Generated chunk: Capabilities**: ...
487
- 2025-01-14 12:35:03,369 - llm_api - DEBUG - Generated chunk: It ...
488
- 2025-01-14 12:35:03,433 - llm_api - DEBUG - Generated chunk: should ...
489
- 2025-01-14 12:35:03,498 - llm_api - DEBUG - Generated chunk: also ...
490
- 2025-01-14 12:35:03,562 - llm_api - DEBUG - Generated chunk: be ...
491
- 2025-01-14 12:35:03,626 - llm_api - DEBUG - Generated chunk: noted ...
492
- 2025-01-14 12:35:03,690 - llm_api - DEBUG - Generated chunk: though ...
493
- 2025-01-14 12:35:03,755 - llm_api - DEBUG - Generated chunk: there ...
494
- 2025-01-14 12:35:03,819 - llm_api - DEBUG - Generated chunk: ...
495
- 2025-01-14 12:35:03,884 - llm_api - DEBUG - Generated chunk: ...
496
- 2025-01-14 12:35:03,949 - llm_api - DEBUG - Generated chunk: isn't ...
497
- 2025-01-14 12:35:04,013 - llm_api - DEBUG - Generated chunk: learning ...
498
- 2025-01-14 12:35:04,078 - llm_api - DEBUG - Generated chunk: happening ...
499
- 2025-01-14 12:35:04,142 - llm_api - DEBUG - Generated chunk: per ...
500
- 2025-01-14 12:35:04,206 - llm_api - DEBUG - Generated chunk: ...
501
- 2025-01-14 12:35:04,270 - llm_api - DEBUG - Generated chunk: se; ...
502
- 2025-01-14 12:35:04,334 - llm_api - DEBUG - Generated chunk: continuous ...
503
- 2025-01-14 12:35:04,399 - llm_api - DEBUG - Generated chunk: improvements ...
504
- 2025-01-14 12:35:04,463 - llm_api - DEBUG - Generated chunk: occur ...
505
- 2025-01-14 12:35:04,528 - llm_api - DEBUG - Generated chunk: through ...
506
- 2025-01-14 12:35:04,593 - llm_api - DEBUG - Generated chunk: updates ...
507
- 2025-01-14 12:35:04,659 - llm_api - DEBUG - Generated chunk: made ...
508
- 2025-01-14 12:35:04,724 - llm_api - DEBUG - Generated chunk: ...
509
- 2025-01-14 12:35:04,790 - llm_api - DEBUG - Generated chunk: periodically ...
510
- 2025-01-14 12:35:04,855 - llm_api - DEBUG - Generated chunk: ...
511
- 2025-01-14 12:35:04,923 - llm_api - DEBUG - Generated chunk: reflective ...
512
- 2025-01-14 12:35:04,993 - llm_api - DEBUG - Generated chunk: ...
513
- 2025-01-14 12:35:05,061 - llm_api - DEBUG - Generated chunk: ...
514
- 2025-01-14 12:35:05,129 - llm_api - DEBUG - Generated chunk: advancements ...
515
- 2025-01-14 12:35:05,197 - llm_api - DEBUG - Generated chunk: achieved ...
516
- 2025-01-14 12:35:05,265 - llm_api - DEBUG - Generated chunk: over ...
517
- 2025-01-14 12:35:05,333 - llm_api - DEBUG - Generated chunk: time ...
518
- 2025-01-14 12:35:05,400 - llm_api - DEBUG - Generated chunk: via ...
519
- 2025-01-14 12:35:05,467 - llm_api - DEBUG - Generated chunk: machine ...
520
- 2025-01-14 12:35:05,535 - llm_api - DEBUG - Generated chunk: learning ...
521
- 2025-01-14 12:35:05,603 - llm_api - DEBUG - Generated chunk: techniques ...
522
- 2025-01-14 12:35:05,672 - llm_api - DEBUG - Generated chunk: applied ...
523
- 2025-01-14 12:35:05,742 - llm_api - DEBUG - Generated chunk: ...
524
- 2025-01-14 12:35:05,812 - llm_api - DEBUG - Generated chunk: systematically ...
525
- 2025-01-14 12:35:05,881 - llm_api - DEBUG - Generated chunk: throughout ...
526
- 2025-01-14 12:35:05,951 - llm_api - DEBUG - Generated chunk: development ...
527
- 2025-01-14 12:35:06,022 - llm_api - DEBUG - Generated chunk: phases ...
528
- 2025-01-14 12:35:06,091 - llm_api - DEBUG - Generated chunk: ...
529
- 2025-01-14 12:35:06,162 - llm_api - DEBUG - Generated chunk: aimed ...
530
- 2025-01-14 12:35:06,231 - llm_api - DEBUG - Generated chunk: towards ...
531
- 2025-01-14 12:35:06,301 - llm_api - DEBUG - Generated chunk: ...
532
- 2025-01-14 12:35:06,370 - llm_api - DEBUG - Generated chunk: ...
533
- 2025-01-14 12:35:06,439 - llm_api - DEBUG - Generated chunk: enhancing ...
534
- 2025-01-14 12:35:06,505 - llm_api - DEBUG - Generated chunk: performance ...
535
- 2025-01-14 12:35:06,569 - llm_api - DEBUG - Generated chunk: ...
536
- 2025-01-14 12:35:06,634 - llm_api - DEBUG - Generated chunk: consistently ...
537
- 2025-01-14 12:35:06,697 - llm_api - DEBUG - Generated chunk: while ...
538
- 2025-01-14 12:35:06,761 - llm_api - DEBUG - Generated chunk: ...
539
- 2025-01-14 12:35:06,825 - llm_api - DEBUG - Generated chunk: maintaining ...
540
- 2025-01-14 12:35:06,888 - llm_api - DEBUG - Generated chunk: user ...
541
- 2025-01-14 12:35:06,952 - llm_api - DEBUG - Generated chunk: trust ...
542
- 2025-01-14 12:35:07,017 - llm_api - DEBUG - Generated chunk: simultaneously ...
543
- 2025-01-14 12:35:07,081 - llm_api - DEBUG - Generated chunk: ...
544
- 2025-01-14 12:35:07,145 - llm_api - DEBUG - Generated chunk: ensuring ...
545
- 2025-01-14 12:35:07,210 - llm_api - DEBUG - Generated chunk: ...
546
- 2025-01-14 12:35:07,273 - llm_api - DEBUG - Generated chunk: ethical ...
547
- 2025-01-14 12:35:07,339 - llm_api - DEBUG - Generated chunk: ...
548
- 2025-01-14 12:35:07,406 - llm_api - DEBUG - Generated chunk: ...
549
- 2025-01-14 12:35:07,472 - llm_api - DEBUG - Generated chunk: guidelines ...
550
- 2025-01-14 12:35:07,539 - llm_api - DEBUG - Generated chunk: remain ...
551
- 2025-01-14 12:35:07,606 - llm_api - DEBUG - Generated chunk: ...
552
- 2025-01-14 12:35:07,673 - llm_api - DEBUG - Generated chunk: ...
553
- 2025-01-14 12:35:07,740 - llm_api - DEBUG - Generated chunk: ...
554
- 2025-01-14 12:35:07,806 - llm_api - DEBUG - Generated chunk: uncompromised ...
555
- 2025-01-14 12:35:07,871 - llm_api - DEBUG - Generated chunk: always ...
556
- 2025-01-14 12:35:07,938 - llm_api - DEBUG - Generated chunk: ...
557
- 2025-01-14 12:35:08,004 - llm_api - DEBUG - Generated chunk: ...
558
- 2025-01-14 12:35:08,070 - llm_api - DEBUG - Generated chunk: prioritized ...
559
- 2025-01-14 12:35:08,136 - llm_api - DEBUG - Generated chunk: above ...
560
- 2025-01-14 12:35:08,203 - llm_api - DEBUG - Generated chunk: all ...
561
- 2025-01-14 12:35:08,270 - llm_api - DEBUG - Generated chunk: ...
562
- 2025-01-14 12:35:08,337 - llm_api - DEBUG - Generated chunk: else!
563
- ...
564
- 2025-01-14 12:35:08,404 - llm_api - DEBUG - Generated chunk:
565
- ...
566
- 2025-01-14 12:35:08,470 - llm_api - DEBUG - Generated chunk: ...
567
- 2025-01-14 12:35:08,535 - llm_api - DEBUG - Generated chunk: ...
568
- 2025-01-14 12:35:08,602 - llm_api - DEBUG - Generated chunk: 6. ...
569
- 2025-01-14 12:35:08,667 - llm_api - DEBUG - Generated chunk: ...
570
- 2025-01-14 12:35:08,731 - llm_api - DEBUG - Generated chunk: ...
571
- 2025-01-14 12:35:08,796 - llm_api - DEBUG - Generated chunk: ...
572
- 2025-01-14 12:35:08,860 - llm_api - DEBUG - Generated chunk: **Multilingual ...
573
- 2025-01-14 12:35:08,927 - llm_api - DEBUG - Generated chunk: ...
574
- 2025-01-14 12:35:08,990 - llm_api - DEBUG - Generated chunk: ...
575
- 2025-01-14 12:35:09,054 - llm_api - DEBUG - Generated chunk: Support**: ...
576
- 2025-01-14 12:35:09,119 - llm_api - DEBUG - Generated chunk: As ...
577
- 2025-01-14 12:35:09,184 - llm_api - DEBUG - Generated chunk: mentioned ...
578
- 2025-01-14 12:35:09,248 - llm_api - DEBUG - Generated chunk: earlier ...
579
- 2025-01-14 12:35:09,311 - llm_api - DEBUG - Generated chunk: regarding ...
580
- 2025-01-14 12:35:09,375 - llm_api - DEBUG - Generated chunk: Language ...
581
- 2025-01-14 12:35:09,439 - llm_api - DEBUG - Generated chunk: ...
582
- 2025-01-14 12:35:09,503 - llm_api - DEBUG - Generated chunk: comprehension ...
583
- 2025-01-14 12:35:09,569 - llm_api - DEBUG - Generated chunk: skills ...
584
- 2025-01-14 12:35:09,633 - llm_api - DEBUG - Generated chunk: – ...
585
- 2025-01-14 12:35:09,698 - llm_api - DEBUG - Generated chunk: one ...
586
- 2025-01-14 12:35:09,762 - llm_api - DEBUG - Generated chunk: significant ...
587
- 2025-01-14 12:35:09,825 - llm_api - DEBUG - Generated chunk: advantage ...
588
- 2025-01-14 12:35:09,890 - llm_api - DEBUG - Generated chunk: here ...
589
- 2025-01-14 12:35:09,953 - llm_api - DEBUG - Generated chunk: lies ...
590
- 2025-01-14 12:35:10,019 - llm_api - DEBUG - Generated chunk: ...
591
- 2025-01-14 12:35:10,084 - llm_api - DEBUG - Generated chunk: ...
592
- 2025-01-14 12:35:10,149 - llm_api - DEBUG - Generated chunk: multilanguage ...
593
- 2025-01-14 12:35:10,213 - llm_api - DEBUG - Generated chunk: support ...
594
- 2025-01-14 12:35:10,278 - llm_api - DEBUG - Generated chunk: allowing ...
595
- 2025-01-14 12:35:10,342 - llm_api - DEBUG - Generated chunk: users ...
596
- 2025-01-14 12:35:10,408 - llm_api - DEBUG - Generated chunk: ...
597
- 2025-01-14 12:35:10,472 - llm_api - DEBUG - Generated chunk: worldwide ...
598
- 2025-01-14 12:35:10,537 - llm_api - DEBUG - Generated chunk: ...
599
- 2025-01-14 12:35:10,603 - llm_api - DEBUG - Generated chunk: ...
600
- 2025-01-14 12:35:10,669 - llm_api - DEBUG - Generated chunk: irrespective ...
601
- 2025-01-14 12:35:10,735 - llm_api - DEBUG - Generated chunk: ...
602
- 2025-01-14 12:35:10,801 - llm_api - DEBUG - Generated chunk: geographical ...
603
- 2025-01-14 12:35:10,866 - llm_api - DEBUG - Generated chunk: location ...
604
- 2025-01-14 12:35:10,931 - llm_api - DEBUG - Generated chunk: ...
605
- 2025-01-14 12:35:10,996 - llm_api - DEBUG - Generated chunk: accessibility ...
606
- 2025-01-14 12:35:11,061 - llm_api - DEBUG - Generated chunk: ...
607
- 2025-01-14 12:35:11,125 - llm_api - DEBUG - Generated chunk: facilitated ...
608
- 2025-01-14 12:35:11,190 - llm_api - DEBUG - Generated chunk: smooth ...
609
- 2025-01-14 12:35:11,255 - llm_api - DEBUG - Generated chunk: communication ...
610
- 2025-01-14 12:35:11,327 - llm_api - DEBUG - Generated chunk: experience ...
611
- 2025-01-14 12:35:11,393 - llm_api - DEBUG - Generated chunk: ...
612
- 2025-01-14 12:35:11,460 - llm_api - DEBUG - Generated chunk: fostering ...
613
- 2025-01-14 12:35:11,527 - llm_api - DEBUG - Generated chunk: ...
614
- 2025-01-14 12:35:11,593 - llm_api - DEBUG - Generated chunk: inclusivity ...
615
- 2025-01-14 12:35:11,699 - llm_api - DEBUG - Generated chunk: ...
616
- 2025-01-14 12:35:11,794 - llm_api - DEBUG - Generated chunk: globally ...
617
- 2025-01-14 12:35:11,887 - llm_api - DEBUG - Generated chunk: ...
618
- 2025-01-14 12:35:11,960 - llm_api - DEBUG - Generated chunk: promoting ...
619
- 2025-01-14 12:35:12,034 - llm_api - DEBUG - Generated chunk: cross ...
620
- 2025-01-14 12:35:12,113 - llm_api - DEBUG - Generated chunk: cultural ...
621
- 2025-01-14 12:35:12,184 - llm_api - DEBUG - Generated chunk: exchange ...
622
- 2025-01-14 12:35:12,252 - llm_api - DEBUG - Generated chunk: effectively ...
623
- 2025-01-14 12:35:12,331 - llm_api - DEBUG - Generated chunk: ...
624
- 2025-01-14 12:35:12,401 - llm_api - DEBUG - Generated chunk: bridging ...
625
- 2025-01-14 12:35:12,467 - llm_api - DEBUG - Generated chunk: ...
626
- 2025-01-14 12:35:12,532 - llm_api - DEBUG - Generated chunk: gaps ...
627
- 2025-01-14 12:35:12,598 - llm_api - DEBUG - Generated chunk: between ...
628
- 2025-01-14 12:35:12,662 - llm_api - DEBUG - Generated chunk: different ...
629
- 2025-01-14 12:35:12,726 - llm_api - DEBUG - Generated chunk: communities ...
630
- 2025-01-14 12:35:12,792 - llm_api - DEBUG - Generated chunk: everywhere ...
631
- 2025-01-14 12:35:12,857 - llm_api - DEBUG - Generated chunk: creating ...
632
- 2025-01-14 12:35:12,922 - llm_api - DEBUG - Generated chunk: ...
633
- 2025-01-14 12:35:12,986 - llm_api - DEBUG - Generated chunk: opportunities ...
634
- 2025-01-14 12:35:13,053 - llm_api - DEBUG - Generated chunk: connecting ...
635
- 2025-01-14 12:35:13,120 - llm_api - DEBUG - Generated chunk: hearts ...
636
- 2025-01-14 12:35:13,186 - llm_api - DEBUG - Generated chunk: minds ...
637
- 2025-01-14 12:35:13,252 - llm_api - DEBUG - Generated chunk: together ...
638
- 2025-01-14 12:35:13,319 - llm_api - DEBUG - Generated chunk: ...
639
- 2025-01-14 12:35:13,385 - llm_api - DEBUG - Generated chunk: ...
640
- 2025-01-14 12:35:13,450 - llm_api - DEBUG - Generated chunk: harmoniously ...
641
- 2025-01-14 12:35:13,516 - llm_api - DEBUG - Generated chunk: ...
642
- 2025-01-14 12:35:13,583 - llm_api - DEBUG - Generated chunk: ...
643
- 2025-01-14 12:35:13,648 - llm_api - DEBUG - Generated chunk: transcending ...
644
- 2025-01-14 12:35:13,713 - llm_api - DEBUG - Generated chunk: boundaries ...
645
- 2025-01-14 12:35:13,779 - llm_api - DEBUG - Generated chunk: ...
646
- 2025-01-14 12:35:13,844 - llm_api - DEBUG - Generated chunk: effortlessly ...
647
- 2025-01-14 12:35:13,909 - llm_api - DEBUG - Generated chunk: breaking ...
648
- 2025-01-14 12:35:13,975 - llm_api - DEBUG - Generated chunk: down ...
649
- 2025-01-14 12:35:14,042 - llm_api - DEBUG - Generated chunk: walls ...
650
- 2025-01-14 12:35:14,111 - llm_api - DEBUG - Generated chunk: ...
651
- 2025-01-14 12:35:14,176 - llm_api - DEBUG - Generated chunk: silencing ...
652
- 2025-01-14 12:35:14,242 - llm_api - DEBUG - Generated chunk: voices ...
653
- 2025-01-14 12:35:14,307 - llm_api - DEBUG - Generated chunk: ...
654
- 2025-01-14 12:35:14,372 - llm_api - DEBUG - Generated chunk: suppressions ...
655
- 2025-01-14 12:35:14,438 - llm_api - DEBUG - Generated chunk: fear ...
656
- 2025-01-14 12:35:14,503 - llm_api - DEBUG - Generated chunk: ...
657
- 2025-01-14 12:35:14,569 - llm_api - DEBUG - Generated chunk: dividing ...
658
- 2025-01-14 12:35:14,570 - llm_api - DEBUG - Generated chunk: us...
659
- 2025-01-14 12:35:14,570 - llm_api - INFO - INFO: 127.0.0.1:37118 - "POST /api/v1/generate/stream HTTP/1.1" 200 OK
660
- 2025-01-14 12:47:53,791 - api_routes - INFO - Received generation request for prompt: Tell me about yourself and your capabilities...
661
- 2025-01-14 12:47:53,791 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself and your capabilities...
662
- 2025-01-14 12:48:27,656 - llm_api - DEBUG - Generated response: I'm Phi, an AI developed by Microsoft. While I don...
663
- 2025-01-14 12:48:27,656 - api_routes - INFO - Successfully generated response
664
- 2025-01-14 12:48:27,656 - llm_api - INFO - INFO: 127.0.0.1:43528 - "POST /api/v1/generate HTTP/1.1" 200 OK
665
- 2025-01-14 12:50:17,735 - api_routes - INFO - Received generation request for prompt: Please analyze this query and create a JSON respon...
666
- 2025-01-14 12:50:17,735 - llm_api - DEBUG - Generating response for prompt: Please analyze this query and create a JSON respon...
667
- 2025-01-14 12:50:31,906 - llm_api - DEBUG - Generated response: ```json
668
- {
669
- "original_query": "Who is Djengis Khan...
670
- 2025-01-14 12:50:31,906 - api_routes - INFO - Successfully generated response
671
- 2025-01-14 12:50:31,906 - llm_api - INFO - INFO: 127.0.0.1:50042 - "POST /api/v1/generate HTTP/1.1" 200 OK
672
- 2025-01-14 13:08:38,951 - api_routes - INFO - Received request to download model: PowerInfer/SmallThinker-3B-Preview
673
- 2025-01-14 13:08:38,951 - llm_api - INFO - Starting download of model: PowerInfer/SmallThinker-3B-Preview
674
- 2025-01-14 13:08:38,951 - llm_api - INFO - Enabling stdout logging for download
675
- 2025-01-14 13:11:52,350 - llm_api - INFO - Disabling stdout logging
676
- 2025-01-14 13:11:52,350 - llm_api - INFO - Saving model to main/models/SmallThinker-3B-Preview
677
- 2025-01-14 13:13:04,420 - llm_api - INFO - Successfully downloaded model: PowerInfer/SmallThinker-3B-Preview
678
- 2025-01-14 13:13:05,175 - api_routes - INFO - Successfully downloaded model: PowerInfer/SmallThinker-3B-Preview
679
- 2025-01-14 13:13:31,469 - api_routes - INFO - Received request to initialize model: PowerInfer/SmallThinker-3B-Preview
680
- 2025-01-14 13:13:31,469 - llm_api - INFO - Initializing generation model: PowerInfer/SmallThinker-3B-Preview
681
- 2025-01-14 13:13:31,472 - llm_api - INFO - Loading model from local path: main/models/SmallThinker-3B-Preview
682
- 2025-01-14 13:13:31,909 - llm_api - ERROR - Failed to initialize generation model PowerInfer/SmallThinker-3B-Preview: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
683
- 2025-01-14 13:13:31,909 - api_routes - ERROR - Error initializing model: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
684
- 2025-01-14 13:14:36,924 - main - INFO - LLM API server started on 0.0.0.0:8001
685
- 2025-01-14 13:14:49,486 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
686
- 2025-01-14 13:14:49,486 - hf_validation - ERROR - No HF_TOKEN found in environment variables
687
- 2025-01-14 13:14:49,486 - main - INFO - Starting LLM API server
688
- 2025-01-14 13:14:49,486 - llm_api - INFO - Initializing LLM API
689
- 2025-01-14 13:14:49,486 - llm_api - INFO - LLM API initialized successfully
690
- 2025-01-14 13:14:49,486 - api_routes - INFO - Router initialized with LLM API instance
691
- 2025-01-14 13:14:49,490 - main - INFO - FastAPI application created successfully
692
- 2025-01-14 13:14:56,382 - api_routes - INFO - Received request to initialize model: PowerInfer/SmallThinker-3B-Preview
693
- 2025-01-14 13:14:56,383 - llm_api - INFO - Initializing generation model: PowerInfer/SmallThinker-3B-Preview
694
- 2025-01-14 13:14:56,383 - llm_api - INFO - Loading model from local path: main/models/SmallThinker-3B-Preview
695
- 2025-01-14 13:15:07,065 - llm_api - INFO - Successfully initialized generation model: PowerInfer/SmallThinker-3B-Preview
696
- 2025-01-14 13:15:07,065 - api_routes - INFO - Successfully initialized model: PowerInfer/SmallThinker-3B-Preview
697
- 2025-01-14 13:15:07,065 - llm_api - INFO - INFO: 127.0.0.1:40472 - "POST /api/v1/model/initialize?model_name=PowerInfer%2FSmallThinker-3B-Preview HTTP/1.1" 200 OK
698
- 2025-01-14 13:16:09,874 - api_routes - INFO - Received generation request for prompt: Tell me about yourself and your capabilities...
699
- 2025-01-14 13:16:09,874 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself and your capabilities...
700
- 2025-01-14 13:17:03,595 - llm_api - DEBUG - Generated response: I'm an AI developed by Alibaba, designed to assist...
701
- 2025-01-14 13:17:03,595 - api_routes - INFO - Successfully generated response
702
- 2025-01-14 13:17:03,595 - llm_api - INFO - INFO: 127.0.0.1:44786 - "POST /api/v1/generate HTTP/1.1" 200 OK
703
- 2025-01-14 13:18:04,891 - main - INFO - LLM API server started on 0.0.0.0:8001
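
Note on the initialization failure logged at 13:13:31 above: bitsandbytes raises that error when an 8-bit quantized model does not fully fit in GPU memory, and the message itself names the fix, namely setting `llm_int8_enable_fp32_cpu_offload=True` and passing a `device_map` to `from_pretrained`. The retry after the 13:14:36 restart happened to succeed, but a minimal sketch of the suggested workaround follows. This is an assumption about how the load call could be written with transformers, not code from this repository; the model path is taken from the log and the rest is illustrative.

# Sketch only: illustrative workaround for the bitsandbytes offload error logged above.
# Assumes transformers + bitsandbytes are installed; not part of this repo's code.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # keep modules that spill to CPU in fp32
)

model = AutoModelForCausalLM.from_pretrained(
    "main/models/SmallThinker-3B-Preview",  # local path seen in the log above
    quantization_config=quant_config,
    device_map="auto",  # or a custom dict pinning specific layers to "cpu"
)
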
main/main.py ADDED
@@ -0,0 +1,179 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import Optional, Dict, Any, Union
4
+ import torch
5
+ import logging
6
+ from pathlib import Path
7
+ from litgpt.api import LLM
8
+ import os
9
+ import uvicorn
10
+
11
+ # Set up logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ app = FastAPI(title="LLM Engine Service")
16
+
17
+ # Global variable to store the LLM instance
18
+ llm_instance = None
19
+
20
+ class InitializeRequest(BaseModel):
21
+ """
22
+ Configuration for model initialization including model path
23
+ """
24
+ mode: str = "cpu"
25
+ precision: Optional[str] = None
26
+ quantize: Optional[str] = None
27
+ gpu_count: Union[str, int] = "auto"
28
+ model_path: str
29
+
30
+ class GenerateRequest(BaseModel):
31
+ prompt: str
32
+ max_new_tokens: int = 50
33
+ temperature: float = 1.0
34
+ top_k: Optional[int] = None
35
+ top_p: float = 1.0
36
+ return_as_token_ids: bool = False
37
+ stream: bool = False
38
+
39
+ @app.post("/initialize")
40
+ async def initialize_model(request: InitializeRequest):
41
+ """
42
+ Initialize the LLM model with specified configuration.
43
+ """
44
+ global llm_instance
45
+
46
+ try:
47
+ if request.precision is None and request.quantize is None:
48
+ # Use auto distribution from load when no specific precision or quantization is set
49
+ llm_instance = LLM.load(
50
+ model=request.model_path,
51
+ distribute="auto" # Let the load function handle distribution automatically
52
+ )
53
+
54
+ logger.info(
55
+ f"Model initialized with auto settings:\n"
56
+ f"Model Path: {request.model_path}\n"
57
+ f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
58
+ f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
59
+ )
60
+ else:
61
+ # Original initialization path for when specific settings are requested
62
+ llm_instance = LLM.load(
63
+ model=request.model_path,
64
+ distribute=None # We'll distribute manually
65
+ )
66
+
67
+ # Distribute the model according to the configuration
68
+ llm_instance.distribute(
69
+ accelerator="cuda" if request.mode == "gpu" else "cpu",
70
+ devices=request.gpu_count,
71
+ precision=request.precision,
72
+ quantize=request.quantize
73
+ )
74
+
75
+ logger.info(
76
+ f"Model initialized successfully with config:\n"
77
+ f"Mode: {request.mode}\n"
78
+ f"Precision: {request.precision}\n"
79
+ f"Quantize: {request.quantize}\n"
80
+ f"GPU Count: {request.gpu_count}\n"
81
+ f"Model Path: {request.model_path}\n"
82
+ f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
83
+ f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
84
+ )
85
+
86
+ return {"success": True, "message": "Model initialized successfully"}
87
+
88
+ except Exception as e:
89
+ logger.error(f"Error initializing model: {str(e)}")
90
+ # Print detailed memory statistics on failure
91
+ logger.error(f"GPU Memory Stats:\n"
92
+ f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
93
+ f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
94
+ f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
95
+ raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
96
+
97
+ @app.post("/generate")
98
+ async def generate(request: GenerateRequest):
99
+ """
100
+ Generate text using the initialized model.
101
+ """
102
+ global llm_instance
103
+
104
+ if llm_instance is None:
105
+ raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
106
+
107
+ try:
108
+ if request.stream:
109
+ # For streaming responses, we need to handle differently
110
+ # This is a placeholder as the actual streaming implementation
111
+ # would need to use StreamingResponse from FastAPI
112
+ raise HTTPException(
113
+ status_code=400,
114
+ detail="Streaming is not currently supported through the API"
115
+ )
116
+
117
+ generated_text = llm_instance.generate(
118
+ prompt=request.prompt,
119
+ max_new_tokens=request.max_new_tokens,
120
+ temperature=request.temperature,
121
+ top_k=request.top_k,
122
+ top_p=request.top_p,
123
+ return_as_token_ids=request.return_as_token_ids,
124
+ stream=False # Force stream to False for now
125
+ )
126
+
127
+ response = {
128
+ "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
129
+ "metadata": {
130
+ "prompt": request.prompt,
131
+ "max_new_tokens": request.max_new_tokens,
132
+ "temperature": request.temperature,
133
+ "top_k": request.top_k,
134
+ "top_p": request.top_p
135
+ }
136
+ }
137
+
138
+ return response
139
+
140
+ except Exception as e:
141
+ logger.error(f"Error generating text: {str(e)}")
142
+ raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
143
+
144
+ @app.get("/health")
145
+ async def health_check():
146
+ """
147
+ Check if the service is running and model is loaded.
148
+ """
149
+ global llm_instance
150
+
151
+ status = {
152
+ "status": "healthy",
153
+ "model_loaded": llm_instance is not None,
154
+ }
155
+
156
+ if llm_instance is not None:
157
+ status["model_info"] = {
158
+ "model_path": llm_instance.config.name,
159
+ "device": str(next(llm_instance.model.parameters()).device)
160
+ }
161
+
162
+ return status
163
+
164
+ def main():
165
+ # Load environment variables or configuration here
166
+ host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
167
+ port = int(os.getenv("LLM_ENGINE_PORT", "8001"))
168
+
169
+ # Start the server
170
+ uvicorn.run(
171
+ app,
172
+ host=host,
173
+ port=port,
174
+ log_level="info",
175
+ reload=False
176
+ )
177
+
178
+ if __name__ == "__main__":
179
+ main()
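
The added service above exposes /initialize, /generate, and /health. A minimal client sketch follows, assuming the server is listening on localhost:8001 as in main(); the checkpoint path is hypothetical and only illustrates the expected request shapes.

# Sketch only: exercising the endpoints defined in main/main.py above.
# Assumes the service is running on localhost:8001; the checkpoint path is hypothetical.
import requests

BASE = "http://localhost:8001"

# With precision and quantize left unset, the server takes the LLM.load(distribute="auto") path.
print(requests.post(f"{BASE}/initialize", json={
    "model_path": "checkpoints/my-model",  # hypothetical litgpt checkpoint directory
}).json())

# Plain (non-streaming) generation; the endpoint rejects stream=True for now.
resp = requests.post(f"{BASE}/generate", json={
    "prompt": "Tell me about yourself and your capabilities",
    "max_new_tokens": 100,
    "temperature": 0.7,
})
print(resp.json()["generated_text"])

# Health check reports whether a model is currently loaded.
print(requests.get(f"{BASE}/health").json())
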
main/resources/config.yaml DELETED
@@ -1,31 +0,0 @@
1
- server:
2
- host: "0.0.0.0"
3
- port: 8001
4
-
5
- model:
6
- base_path: "."
7
- generation:
8
- max_new_tokens: 500
9
- do_sample: true
10
- temperature: 0.2
11
- repetition_penalty: 1.1
12
- defaults:
13
- #model_name: "huihui-ai/Llama-3.2-3B-Instruct-abliterated"
14
- model_name: "microsoft/Phi-3.5-mini-instruct"
15
-
16
- folders:
17
- models: "main/models"
18
- cache: "main/.cache"
19
- logs: "main/logs"
20
-
21
- logging:
22
- level: "DEBUG" # DEBUG, INFO, WARNING, ERROR, CRITICAL
23
- format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
24
- file: "llm_api.log"
25
-
26
- api:
27
- version: "v1"
28
- prefix: ""
29
- cors:
30
- origins: ["*"]
31
- credentials: true
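
The removed config above drove logging, folder layout, and the default model for the old routes. A small sketch of how such a file could be read and applied follows; it assumes PyYAML, and the helper name is an illustration of the `load_config` referenced elsewhere in the removed code, not its actual implementation.

# Sketch only: loading a YAML config shaped like the one removed above.
# Assumes PyYAML is installed; the helper name and path handling are hypothetical.
import logging
import yaml

def load_config(path: str = "main/resources/config.yaml") -> dict:
    with open(path, "r") as f:
        return yaml.safe_load(f)

config = load_config()
logging.basicConfig(
    level=getattr(logging, config["logging"]["level"]),
    format=config["logging"]["format"],
)
model_name = config["model"]["defaults"]["model_name"]
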
main/routes.py DELETED
@@ -1,391 +0,0 @@
1
- # routes.py for the LLM Engine.
2
- # This file contains the FastAPI routes for the LLM Engine API.
3
- # It includes routes for generating text, generating embeddings, checking system status, and validating system configuration.
4
-
5
- from fastapi import APIRouter, HTTPException
6
- from fastapi.responses import StreamingResponse
7
- from pydantic import BaseModel
8
- from typing import Optional, List, Dict, Union
9
- from .api import LLMApi
10
- from .utils.logging import setup_logger
11
- from .utils.helpers import get_system_info, format_memory_size
12
- from .utils.validation import validate_model_path
13
- import psutil
14
- from pathlib import Path
15
-
16
- router = APIRouter()
17
- logger = None
18
- api = None
19
- config = None
20
-
21
- def init_router(config_dict: dict):
22
- """Initialize router with config and LLM API instance"""
23
- global logger, api, config
24
- config = config_dict
25
- logger = setup_logger(config, "api_routes")
26
- api = LLMApi(config)
27
- logger.info("Router initialized with LLM API instance")
28
-
29
- class GenerateRequest(BaseModel):
30
- prompt: str
31
- system_message: Optional[str] = None
32
- max_new_tokens: Optional[int] = None
33
-
34
- class EmbeddingRequest(BaseModel):
35
- text: str
36
-
37
- class EmbeddingResponse(BaseModel):
38
- embedding: List[float]
39
- dimension: int
40
-
41
- class SystemStatusResponse(BaseModel):
42
- """Pydantic model for system status response"""
43
- cpu: Optional[Dict[str, Union[float, str]]] = None
44
- memory: Optional[Dict[str, Union[float, str]]] = None
45
- gpu: Optional[Dict[str, Union[bool, str, float]]] = None
46
- storage: Optional[Dict[str, str]] = None
47
- model: Optional[Dict[str, Union[bool, str]]] = None
48
-
49
- class ValidationResponse(BaseModel):
50
- config_validation: Dict[str, bool]
51
- model_validation: Dict[str, bool]
52
- folder_validation: Dict[str, bool]
53
- overall_status: str
54
- issues: List[str]
55
-
56
- @router.get("/system/validate",
57
- response_model=ValidationResponse,
58
- summary="Validate System Configuration",
59
- description="Validates system configuration, folders, and model setup for both generation and embedding models")
60
- async def validate_system():
61
- """
62
- Validates:
63
- - Configuration parameters
64
- - Model setup for both generation and embedding models
65
- - Folder structure
66
- - Required permissions
67
- """
68
- logger.info("Starting system validation")
69
- issues = []
70
-
71
- # Validate configuration
72
- try:
73
- config_status = {
74
- "has_required_fields": True, # Check if all required config fields exist
75
- "valid_paths": True, # Check if paths are valid
76
- "valid_parameters": True # Check if parameters are within acceptable ranges
77
- }
78
-
79
- # Example validation checks
80
- if not api.models_path.exists():
81
- config_status["valid_paths"] = False
82
- issues.append("Models directory does not exist")
83
-
84
- if api.temperature < 0 or api.temperature > 2:
85
- config_status["valid_parameters"] = False
86
- issues.append("Temperature parameter out of valid range (0-2)")
87
-
88
- except Exception as e:
89
- logger.error(f"Configuration validation failed: {str(e)}")
90
- config_status = {"error": str(e)}
91
- issues.append(f"Config validation error: {str(e)}")
92
-
93
- # Validate model setup
94
- try:
95
- model_status = {
96
- "generation_model_files_exist": False,
97
- "generation_model_loadable": False,
98
- "embedding_model_files_exist": False,
99
- "embedding_model_loadable": False,
100
- "tokenizer_valid": False
101
- }
102
-
103
- if api.generation_model_name:
104
- gen_model_path = api.models_path / api.generation_model_name.split('/')[-1]
105
- model_status["generation_model_files_exist"] = validate_model_path(gen_model_path)
106
- model_status["generation_model_loadable"] = api.generation_model is not None
107
-
108
- if api.embedding_model_name:
109
- emb_model_path = api.models_path / api.embedding_model_name.split('/')[-1]
110
- model_status["embedding_model_files_exist"] = validate_model_path(emb_model_path)
111
- model_status["embedding_model_loadable"] = api.embedding_model is not None
112
-
113
- model_status["tokenizer_valid"] = (
114
- api.tokenizer is not None and api.embedding_tokenizer is not None
115
- )
116
-
117
- if not model_status["generation_model_files_exist"]:
118
- issues.append("Generation model files are missing or incomplete")
119
- if not model_status["embedding_model_files_exist"]:
120
- issues.append("Embedding model files are missing or incomplete")
121
-
122
- except Exception as e:
123
- logger.error(f"Model validation failed: {str(e)}")
124
- model_status = {"error": str(e)}
125
- issues.append(f"Model validation error: {str(e)}")
126
-
127
- # Validate folder structure and permissions
128
- try:
129
- folder_status = {
130
- "models_folder": api.models_path.exists(),
131
- "cache_folder": api.cache_path.exists(),
132
- "logs_folder": Path(api.base_path / "logs").exists(),
133
- "write_permissions": False
134
- }
135
-
136
- # Test write permissions by attempting to create a test file
137
- test_file = api.models_path / ".test_write"
138
- try:
139
- test_file.touch()
140
- test_file.unlink()
141
- folder_status["write_permissions"] = True
142
- except:
143
- folder_status["write_permissions"] = False
144
- issues.append("Insufficient write permissions in models directory")
145
-
146
- except Exception as e:
147
- logger.error(f"Folder validation failed: {str(e)}")
148
- folder_status = {"error": str(e)}
149
- issues.append(f"Folder validation error: {str(e)}")
150
-
151
- # Determine overall status
152
- if not issues:
153
- overall_status = "valid"
154
- elif len(issues) < 3:
155
- overall_status = "warning"
156
- else:
157
- overall_status = "invalid"
158
-
159
- validation_response = ValidationResponse(
160
- config_validation=config_status,
161
- model_validation=model_status,
162
- folder_validation=folder_status,
163
- overall_status=overall_status,
164
- issues=issues
165
- )
166
-
167
- logger.info(f"System validation completed with status: {overall_status}")
168
- return validation_response
169
-
170
- @router.get("/system/status",
171
- response_model=SystemStatusResponse,
172
- summary="Check System Status",
173
- description="Returns comprehensive system status including CPU, Memory, GPU, Storage, and Model information")
174
- async def check_system():
175
- """
176
- Get system status including:
177
- - CPU usage
178
- - Memory usage
179
- - GPU availability and usage
180
- - Storage status for model and cache directories
181
- - Current model status
182
- """
183
- logger.info("Checking system status")
184
- status = SystemStatusResponse()
185
- system_info = None
186
-
187
- # Check CPU and Memory
188
- try:
189
- system_info = get_system_info()
190
- status.cpu = {
191
- "usage_percent": system_info["cpu_percent"],
192
- "status": "healthy" if system_info["cpu_percent"] < 90 else "high"
193
- }
194
- logger.debug(f"CPU status retrieved: {status.cpu}")
195
- except Exception as e:
196
- logger.error(f"Failed to get CPU info: {str(e)}")
197
- status.cpu = {"status": "error", "message": str(e)}
198
-
199
- # Check Memory
200
- try:
201
- if not system_info:
202
- system_info = get_system_info()
203
- status.memory = {
204
- "usage_percent": system_info["memory_percent"],
205
- "status": "healthy" if system_info["memory_percent"] < 90 else "critical",
206
- "available": format_memory_size(psutil.virtual_memory().available)
207
- }
208
- logger.debug(f"Memory status retrieved: {status.memory}")
209
- except Exception as e:
210
- logger.error(f"Failed to get memory info: {str(e)}")
211
- status.memory = {"status": "error", "message": str(e)}
212
-
213
- # Check GPU
214
- try:
215
- if not system_info:
216
- system_info = get_system_info()
217
- status.gpu = {
218
- "available": system_info["gpu_available"],
219
- "memory_used": format_memory_size(system_info["gpu_memory_used"]),
220
- "memory_total": format_memory_size(system_info["gpu_memory_total"]),
221
- "utilization_percent": system_info["gpu_memory_used"] / system_info["gpu_memory_total"] * 100 if system_info["gpu_available"] else 0
222
- }
223
- logger.debug(f"GPU status retrieved: {status.gpu}")
224
- except Exception as e:
225
- logger.error(f"Failed to get GPU info: {str(e)}")
226
- status.gpu = {"status": "error", "message": str(e)}
227
-
228
- # Check Storage
229
- try:
230
- models_path = Path(api.models_path)
231
- cache_path = Path(api.cache_path)
232
- status.storage = {
233
- "models_directory": str(models_path),
234
- "models_size": format_memory_size(sum(f.stat().st_size for f in models_path.glob('**/*') if f.is_file())),
235
- "cache_directory": str(cache_path),
236
- "cache_size": format_memory_size(sum(f.stat().st_size for f in cache_path.glob('**/*') if f.is_file()))
237
- }
238
- logger.debug(f"Storage status retrieved: {status.storage}")
239
- except Exception as e:
240
- logger.error(f"Failed to get storage info: {str(e)}")
241
- status.storage = {"status": "error", "message": str(e)}
242
-
243
- # Check Model Status
244
- try:
245
- status.model = {
246
- "generation_model": {
247
- "is_loaded": api.generation_model is not None,
248
- "current_model": api.generation_model_name,
249
- "has_chat_template": api.has_chat_template() if api.generation_model else False
250
- },
251
- "embedding_model": {
252
- "is_loaded": api.embedding_model is not None,
253
- "current_model": api.embedding_model_name
254
- }
255
- }
256
- logger.debug(f"Model status retrieved: {status.model}")
257
- except Exception as e:
258
- logger.error(f"Failed to get model status: {str(e)}")
259
- status.model = {"status": "error", "message": str(e)}
260
-
261
- logger.info("System status check completed")
262
- return status
263
-
264
- @router.post("/generate")
265
- async def generate_text(request: GenerateRequest):
266
- """Generate text response from prompt"""
267
- logger.info(f"Received generation request for prompt: {request.prompt[:50]}...")
268
- try:
269
- response = api.generate_response(
270
- prompt=request.prompt,
271
- system_message=request.system_message,
272
- max_new_tokens=request.max_new_tokens or api.max_new_tokens
273
- )
274
- logger.info("Successfully generated response")
275
- return {"generated_text": response}
276
- except Exception as e:
277
- logger.error(f"Error in generate_text endpoint: {str(e)}")
278
- raise HTTPException(status_code=500, detail=str(e))
279
-
280
- @router.post("/generate/stream")
281
- async def generate_stream(request: GenerateRequest):
282
- """Generate streaming text response from prompt"""
283
- logger.info(f"Received streaming generation request for prompt: {request.prompt[:50]}...")
284
- try:
285
- async def event_generator():
286
- async for chunk in api.generate_stream(
287
- prompt=request.prompt,
288
- system_message=request.system_message,
289
- max_new_tokens=request.max_new_tokens or api.max_new_tokens
290
- ):
291
- yield f"data: {chunk}\n\n"
292
- yield "data: [DONE]\n\n"
293
-
294
- return StreamingResponse(
295
- event_generator(),
296
- media_type="text/event-stream",
297
- headers={
298
- "Cache-Control": "no-cache",
299
- "Connection": "keep-alive",
300
- }
301
- )
302
- except Exception as e:
303
- logger.error(f"Error in generate_stream endpoint: {str(e)}")
304
- raise HTTPException(status_code=500, detail=str(e))
305
-
306
- @router.post("/embedding", response_model=EmbeddingResponse)
307
- async def generate_embedding(request: EmbeddingRequest):
308
- """Generate embedding vector from text"""
309
- logger.info(f"Received embedding request for text: {request.text[:50]}...")
310
- try:
311
- embedding = api.generate_embedding(request.text)
312
- logger.info(f"Successfully generated embedding of dimension {len(embedding)}")
313
- return EmbeddingResponse(
314
- embedding=embedding,
315
- dimension=len(embedding)
316
- )
317
- except Exception as e:
318
- logger.error(f"Error in generate_embedding endpoint: {str(e)}")
319
- raise HTTPException(status_code=500, detail=str(e))
320
-
321
- @router.post("/model/download",
322
- summary="Download default or specified model",
323
- description="Downloads model files. Uses default model from config if none specified.")
324
- async def download_model(model_name: Optional[str] = None):
325
- """Download model files to local storage"""
326
- try:
327
- # Use model name from config if none provided
328
- model_to_download = model_name or config["model"]["defaults"]["model_name"]
329
- logger.info(f"Received request to download model: {model_to_download}")
330
-
331
- api.download_model(model_to_download)
332
- logger.info(f"Successfully downloaded model: {model_to_download}")
333
-
334
- return {
335
- "status": "success",
336
- "message": f"Model {model_to_download} downloaded",
337
- "model_name": model_to_download
338
- }
339
- except Exception as e:
340
- logger.error(f"Error downloading model: {str(e)}")
341
- raise HTTPException(status_code=500, detail=str(e))
342
-
343
- @router.post("/model/initialize",
344
- summary="Initialize default or specified model",
345
- description="Initialize model for use. Uses default model from config if none specified.")
346
- async def initialize_model(model_name: Optional[str] = None):
347
- """Initialize a model for use"""
348
- try:
349
- # Use model name from config if none provided
350
- model_to_init = model_name or config["model"]["defaults"]["model_name"]
351
- logger.info(f"Received request to initialize model: {model_to_init}")
352
-
353
- api.initialize_model(model_to_init)
354
- logger.info(f"Successfully initialized model: {model_to_init}")
355
-
356
- return {
357
- "status": "success",
358
- "message": f"Model {model_to_init} initialized",
359
- "model_name": model_to_init
360
- }
361
- except Exception as e:
362
- logger.error(f"Error initializing model: {str(e)}")
363
- raise HTTPException(status_code=500, detail=str(e))
364
-
365
- @router.post("/model/initialize/embedding",
366
- summary="Initialize embedding model",
367
- description="Initialize a separate model specifically for generating embeddings")
368
- async def initialize_embedding_model(model_name: Optional[str] = None):
369
- """Initialize a model specifically for embeddings"""
370
- try:
371
- # Use model name from config if none provided
372
- embedding_model = model_name or config["model"]["defaults"].get("embedding_model_name")
373
- if not embedding_model:
374
- raise HTTPException(
375
- status_code=400,
376
- detail="No embedding model specified and no default found in config"
377
- )
378
-
379
- logger.info(f"Received request to initialize embedding model: {embedding_model}")
380
-
381
- api.initialize_embedding_model(embedding_model)
382
- logger.info(f"Successfully initialized embedding model: {embedding_model}")
383
-
384
- return {
385
- "status": "success",
386
- "message": f"Embedding model {embedding_model} initialized",
387
- "model_name": embedding_model
388
- }
389
- except Exception as e:
390
- logger.error(f"Error initializing embedding model: {str(e)}")
391
- raise HTTPException(status_code=500, detail=str(e))
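
Note on the removed /generate/stream endpoint above: it emits Server-Sent Events, one "data: ..." line per chunk, followed by a final "data: [DONE]" marker. Below is a minimal client sketch for consuming that format. The base URL and the assumption that the router was mounted at the application root are illustrative only and are not taken from this diff.

import requests

BASE_URL = "http://localhost:8000"  # placeholder host/port, not specified anywhere in this change

def stream_generation(prompt: str, system_message: str = "You are a helpful assistant.") -> None:
    """Print chunks from the (removed) /generate/stream SSE endpoint as they arrive."""
    payload = {"prompt": prompt, "system_message": system_message}
    with requests.post(f"{BASE_URL}/generate/stream", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip the blank separator lines between SSE events
            data = line[len("data: "):]
            if data == "[DONE]":
                break  # the endpoint signals completion with this sentinel
            print(data, end="", flush=True)

if __name__ == "__main__":
    stream_generation("Tell me what happens in a nuclear reactor.")

Using requests with stream=True keeps the connection open so tokens are printed as they are produced rather than after the full generation finishes.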
main/test_locally.py DELETED
@@ -1,56 +0,0 @@
-
-
- def test_locally(load_config, setup_logger, LLMApi):
-     """Run local tests for development and debugging"""
-     config = load_config()
-     logger = setup_logger(config, "test")
-     logger.info("Starting local tests")
-
-     api = LLMApi(config)
-     model_name = config["model"]["defaults"]["model_name"]
-
-     logger.info(f"Testing with model: {model_name}")
-
-     # Test download
-     logger.info("Testing model download...")
-     api.download_model(model_name)
-     logger.info("Download complete")
-
-     # Test initialization
-     logger.info("Initializing model...")
-     api.initialize_model(model_name)
-     logger.info("Model initialized")
-
-     # Test embedding
-     test_text = "Dette er en test av embeddings generering fra en teknisk tekst om HMS rutiner på arbeidsplassen."
-     logger.info("Testing embedding generation...")
-     embedding = api.generate_embedding(test_text)
-     logger.info(f"Generated embedding of length: {len(embedding)}")
-     logger.info(f"First few values: {embedding[:5]}")
-
-     # Test generation
-     test_prompts = [
-         "Tell me what happens in a nuclear reactor.",
-     ]
-
-     # Test regular generation
-     logger.info("Testing regular generation:")
-     for prompt in test_prompts:
-         logger.info(f"Prompt: {prompt}")
-         response = api.generate_response(
-             prompt=prompt,
-             system_message="You are a helpful assistant."
-         )
-         logger.info(f"Response: {response}")
-
-     # Test streaming generation
-     logger.info("Testing streaming generation:")
-     logger.info(f"Prompt: {test_prompts[0]}")
-     for chunk in api.generate_stream(
-         prompt=test_prompts[0],
-         system_message="You are a helpful assistant."
-     ):
-         print(chunk, end="", flush=True)
-     print("\n")
-
-     logger.info("Local tests completed")
main/utils/__init__.py DELETED
File without changes
main/utils/errors.py DELETED
@@ -1,94 +0,0 @@
- class ModelNotFoundError(Exception):
-     """Error raised when a model cannot be found or accessed"""
-     def __init__(self, model_name: str, original_error: Exception = None):
-         self.model_name = model_name
-         self.original_error = original_error
-
-         message = (
-             f"Could not find or access model: '{model_name}'\n\n"
-             f"This could be because:\n"
-             f"1. The model name is misspelled - double check the name\n"
-             f"2. The model requires authentication - you need to:\n"
-             f"   - Log in to Hugging Face (huggingface.co)\n"
-             f"   - Accept the model's terms of use on its page\n"
-             f"   - Create an access token in your HF account settings\n"
-             f"   - Set the token as an environment variable: export HUGGING_FACE_HUB_TOKEN=your_token\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class ModelLoadError(Exception):
-     """Error raised when a model fails to load"""
-     def __init__(self, model_name: str, load_type: str, original_error: Exception = None):
-         self.model_name = model_name
-         self.load_type = load_type
-         self.original_error = original_error
-
-         message = (
-             f"Failed to load model: '{model_name}' using {load_type} precision\n\n"
-             f"Common reasons:\n"
-             f"1. Not enough GPU memory - This model requires more VRAM than available\n"
-             f"   - Try using 8-bit quantization (load_in_8bit=True)\n"
-             f"   - Try using 4-bit quantization (load_in_4bit=True)\n"
-             f"   - Or use a smaller model\n"
-             f"2. Incorrect model parameters - Check the model card for correct loading parameters\n"
-             f"3. Corrupted model files - Try removing the model folder and downloading again\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class InvalidConfigurationError(Exception):
-     """Error raised when configuration is invalid"""
-     def __init__(self, param_name: str, current_value: any, expected_value: str, original_error: Exception = None):
-         self.param_name = param_name
-         self.current_value = current_value
-         self.expected_value = expected_value
-         self.original_error = original_error
-
-         message = (
-             f"Invalid configuration parameter: '{param_name}'\n\n"
-             f"Current value: {current_value}\n"
-             f"Expected value: {expected_value}\n\n"
-             f"Please update your config.yaml file with the correct value\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class GenerationError(Exception):
-     """Error raised when text generation fails"""
-     def __init__(self, stage: str, original_error: Exception = None):
-         self.stage = stage
-         self.original_error = original_error
-
-         message = (
-             f"Text generation failed during {stage}\n\n"
-             f"This could be because:\n"
-             f"1. The model ran out of memory during generation\n"
-             f"   - Try reducing max_new_tokens\n"
-             f"   - Try reducing the input text length\n"
-             f"2. The input prompt might be too complex or long\n"
-             f"3. The model might be in an inconsistent state\n"
-             f"   - Try reinitializing the model\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- # Usage examples:
- """
- # When model not found:
- raise ModelNotFoundError("mistralai/Mistral-7B-v0.1", original_error=e)
-
- # When model fails to load:
- raise ModelLoadError("mistralai/Mistral-7B-v0.1", "8-bit quantization", original_error=e)
-
- # When config is invalid:
- raise InvalidConfigurationError(
-     "temperature",
-     2.5,
-     "a value between 0.0 and 2.0",
-     original_error=e
- )
-
- # When generation fails:
- raise GenerationError("token generation", original_error=e)
- """
main/utils/helpers.py DELETED
@@ -1,44 +0,0 @@
- import psutil
- import torch
- from pathlib import Path
- from typing import Dict, Any
-
- import yaml
-
-
- def get_system_info() -> Dict[str, Any]:
-     """Get system resource information"""
-     return {
-         "cpu_percent": psutil.cpu_percent(),
-         "memory_percent": psutil.virtual_memory().percent,
-         "gpu_available": torch.cuda.is_available(),
-         "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
-         "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
-     }
-
- def calculate_optimal_batch_size(model_size: int, available_memory: int) -> int:
-     """Calculate optimal batch size based on model size and available memory"""
-     memory_per_sample = model_size * 1.5  # Rough estimate including overhead
-     return max(1, available_memory // memory_per_sample)
-
- def ensure_folder_structure(config: Dict) -> None:
-     """Ensure all necessary folders exist"""
-     folders = [
-         Path(config["folders"]["models"]),
-         Path(config["folders"]["cache"]),
-         Path(config["folders"]["logs"])
-     ]
-     for folder in folders:
-         folder.mkdir(parents=True, exist_ok=True)
-
- def format_memory_size(size_bytes: int) -> str:
-     """Format memory size to human readable format"""
-     for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
-         if size_bytes < 1024:
-             return f"{size_bytes:.2f}{unit}"
-         size_bytes /= 1024
-
- def load_config():
-     """Load configuration from yaml file"""
-     with open("main/resources/config.yaml", "r") as f:
-         return yaml.safe_load(f)
main/utils/logging.py DELETED
@@ -1,65 +0,0 @@
- import logging
- import sys
- from pathlib import Path
-
- class StreamToLogger:
-     """
-     Fake file-like stream object that redirects writes to a logger instance.
-     """
-     def __init__(self, logger, log_level=logging.INFO):
-         self.logger = logger
-         self.log_level = log_level
-         self.linebuf = ''
-         self.enabled = True
-
-     def write(self, buf):
-         if self.enabled:
-             for line in buf.rstrip().splitlines():
-                 self.logger.log(self.log_level, line.rstrip())
-
-     def flush(self):
-         pass
-
-     def enable(self):
-         self.enabled = True
-
-     def isatty(self):
-         return False
-
-     def disable(self):
-         self.enabled = False
-
- def setup_logger(config: dict, name: str = None) -> logging.Logger:
-     """Set up logger with configuration from config file."""
-     logger = logging.getLogger(name or __name__)
-
-     # Set level from config
-     level = getattr(logging, config["logging"]["level"].upper())
-     logger.setLevel(level)
-
-     # Create logs directory if it doesn't exist
-     log_path = Path(config["folders"]["logs"])
-     log_path.mkdir(exist_ok=True)
-
-     # Create handlers
-     file_handler = logging.FileHandler(log_path / config["logging"]["file"])
-     console_handler = logging.StreamHandler()
-
-     # Create formatter
-     formatter = logging.Formatter(config["logging"]["format"])
-     file_handler.setFormatter(formatter)
-     console_handler.setFormatter(formatter)
-
-     # Add handlers
-     logger.addHandler(file_handler)
-     logger.addHandler(console_handler)
-
-     # Redirect stdout to logger
-     stream_to_logger = StreamToLogger(logger, logging.INFO)
-     sys.stdout = stream_to_logger
-
-     # Add methods to enable/disable StreamToLogger
-     logger.enable_stream_to_logger = stream_to_logger.enable
-     logger.disable_stream_to_logger = stream_to_logger.disable
-
-     return logger
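
The removed setup_logger above takes all of its settings from the shared config dictionary: logging.level, logging.file and logging.format for the handlers, and folders.logs for the log directory. A minimal sketch of the shape it expects follows; the concrete values are placeholders, since the repository's config.yaml is not shown in this diff.

example_config = {
    "logging": {
        "level": "INFO",  # resolved via getattr(logging, level.upper())
        "file": "server.log",  # created inside the folders.logs directory
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    },
    "folders": {
        "logs": "logs",  # created with mkdir(exist_ok=True) if missing
    },
}

# logger = setup_logger(example_config, "test")  # usage as seen in the removed main/test_locally.py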
main/utils/validation.py DELETED
@@ -1,56 +0,0 @@
- from typing import Dict, Any
- from pathlib import Path
- from dotenv import load_dotenv
- from huggingface_hub import login
- import os
-
- def validate_model_path(model_path: Path) -> bool:
-     """Validate that a model path exists and contains necessary files"""
-     if not model_path.exists():
-         return False
-     required_files = ['config.json', 'pytorch_model.bin']
-     return all((model_path / file).exists() for file in required_files)
-
- def validate_generation_params(params: Dict[str, Any]) -> Dict[str, Any]:
-     """Validate and normalize generation parameters"""
-     validated = params.copy()
-
-     # Ensure temperature is within bounds
-     if 'temperature' in validated:
-         validated['temperature'] = max(0.0, min(2.0, validated['temperature']))
-
-     # Ensure max_new_tokens is reasonable
-     if 'max_new_tokens' in validated:
-         validated['max_new_tokens'] = max(1, min(4096, validated['max_new_tokens']))
-
-     return validated
-
- def validate_hf(setup_logger, config):
-     """
-     Validate Hugging Face authentication.
-     Checks for .env file, loads environment variables, and attempts HF login if token exists.
-     """
-     logger = setup_logger(config, "hf_validation")
-
-     # Check for .env file
-     env_path = Path('.env')
-     if env_path.exists():
-         logger.info("Found .env file, loading environment variables")
-         load_dotenv()
-     else:
-         logger.warning("No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.")
-
-     # Check for HF token
-     hf_token = os.getenv('HF_TOKEN')
-     if not hf_token:
-         logger.error("No HF_TOKEN found in environment variables")
-         return False
-
-     try:
-         # Attempt login
-         login(token=hf_token)
-         logger.info("Successfully authenticated with Hugging Face")
-         return True
-     except Exception as e:
-         logger.error(f"Failed to authenticate with Hugging Face: {str(e)}")
-         return False
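
For reference, the removed validate_generation_params clamps sampling parameters into fixed ranges rather than rejecting out-of-range requests: temperature to [0.0, 2.0] and max_new_tokens to [1, 4096]. A small worked example with made-up inputs:

raw_params = {"temperature": 2.5, "max_new_tokens": 10000}

# After clamping, the dictionary returned by the removed function would be:
# {"temperature": 2.0, "max_new_tokens": 4096}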
requirements.txt CHANGED
@@ -1,51 +1,7 @@
- accelerate==1.2.1
- annotated-types==0.7.0
- anyio==4.8.0
- bitsandbytes==0.45.0
- certifi==2024.12.14
- charset-normalizer==3.4.1
- click==8.1.8
- fastapi==0.115.6
- filelock==3.16.1
- fsspec==2024.12.0
- h11==0.14.0
- huggingface-hub==0.27.1
- idna==3.10
- Jinja2==3.1.5
- MarkupSafe==3.0.2
- mpmath==1.3.0
- networkx==3.4.2
- numpy==2.2.1
- nvidia-cublas-cu12==12.4.5.8
- nvidia-cuda-cupti-cu12==12.4.127
- nvidia-cuda-nvrtc-cu12==12.4.127
- nvidia-cuda-runtime-cu12==12.4.127
- nvidia-cudnn-cu12==9.1.0.70
- nvidia-cufft-cu12==11.2.1.3
- nvidia-curand-cu12==10.3.5.147
- nvidia-cusolver-cu12==11.6.1.9
- nvidia-cusparse-cu12==12.3.1.170
- nvidia-nccl-cu12==2.21.5
- nvidia-nvjitlink-cu12==12.4.127
- nvidia-nvtx-cu12==12.4.127
- packaging==24.2
- psutil==6.1.1
- pydantic==2.10.5
- pydantic_core==2.27.2
- python-dotenv==1.0.1
- PyYAML==6.0.2
- regex==2024.11.6
- requests==2.32.3
- safetensors==0.5.2
- setuptools==75.8.0
- sniffio==1.3.1
- starlette==0.41.3
- sympy==1.13.1
- tokenizers==0.21.0
- torch==2.5.1
- tqdm==4.67.1
- transformers==4.47.1
- triton==3.1.0
- typing_extensions==4.12.2
- urllib3==2.3.0
- uvicorn==0.34.0
+ fastapi==0.109.0
+ uvicorn==0.27.0
+ pydantic==2.5.3
+ torch==2.4.1
+ transformers==4.36.2
+ litgpt==0.5.3
+ python-dotenv==1.0.0