.gitignore CHANGED
@@ -1,7 +1,14 @@
1
  # Virtual environment
2
  myenv/
3
  venv/
4
- env/
 
 
 
5
 
6
  # Python
7
  __pycache__/
@@ -9,24 +16,29 @@ __pycache__/
9
  *$py.class
10
  *.so
11
  .Python
12
- *.egg
13
- *.egg-info/
14
- dist/
15
  build/
 
 
 
16
  eggs/
17
- *.egg-info/
18
  .eggs/
19
-
20
- # Models
21
- models/
22
- main/models/
 
 
 
 
 
23
 
24
  # IDE
25
- .vscode/
26
  .idea/
 
27
  *.swp
28
- *~
29
-
30
- # OS
31
  .DS_Store
32
- Thumbs.db
 
 
 
 
1
+ # Environment files
2
+ .env
3
+ .env.*
4
+
5
  # Virtual environment
6
  myenv/
7
  venv/
8
+ ENV/
9
+
10
+ # Model checkpoints
11
+ checkpoints/
12
 
13
  # Python
14
  __pycache__/
 
16
  *$py.class
17
  *.so
18
  .Python
 
 
 
19
  build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
  eggs/
 
24
  .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
 
35
  # IDE
 
36
  .idea/
37
+ .vscode/
38
  *.swp
39
+ *.swo
 
 
40
  .DS_Store
41
+
42
+ # Logs
43
+ *.log
44
+ logs/
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/Inference-Server.iml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="JAVA_MODULE" version="4">
3
+ <component name="NewModuleRootManager" inherit-compiler-output="true">
4
+ <exclude-output />
5
+ <content url="file://$MODULE_DIR$" />
6
+ <orderEntry type="inheritedJdk" />
7
+ <orderEntry type="sourceFolder" forTests="false" />
8
+ </component>
9
+ </module>
.idea/misc.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" languageLevel="JDK_21" default="true" project-jdk-name="21" project-jdk-type="JavaSDK">
4
+ <output url="file://$PROJECT_DIR$/out" />
5
+ </component>
6
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Inference-Server.iml" filepath="$PROJECT_DIR$/.idea/Inference-Server.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
=0.45.0 DELETED
@@ -1,30 +0,0 @@
1
- Collecting bitsandbytes
2
- Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
3
- Requirement already satisfied: torch in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (2.5.1)
4
- Requirement already satisfied: numpy in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (2.2.1)
5
- Requirement already satisfied: typing_extensions>=4.8.0 in ./myenv/lib/python3.12/site-packages (from bitsandbytes) (4.12.2)
6
- Requirement already satisfied: filelock in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.16.1)
7
- Requirement already satisfied: networkx in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.4.2)
8
- Requirement already satisfied: jinja2 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.1.5)
9
- Requirement already satisfied: fsspec in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (2024.12.0)
10
- Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
11
- Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
12
- Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
13
- Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (9.1.0.70)
14
- Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.5.8)
15
- Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (11.2.1.3)
16
- Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (10.3.5.147)
17
- Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (11.6.1.9)
18
- Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.3.1.170)
19
- Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (2.21.5)
20
- Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
21
- Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (12.4.127)
22
- Requirement already satisfied: triton==3.1.0 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (3.1.0)
23
- Requirement already satisfied: setuptools in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (75.8.0)
24
- Requirement already satisfied: sympy==1.13.1 in ./myenv/lib/python3.12/site-packages (from torch->bitsandbytes) (1.13.1)
25
- Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./myenv/lib/python3.12/site-packages (from sympy==1.13.1->torch->bitsandbytes) (1.3.0)
26
- Requirement already satisfied: MarkupSafe>=2.0 in ./myenv/lib/python3.12/site-packages (from jinja2->torch->bitsandbytes) (3.0.2)
27
- Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
28
- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 69.1/69.1 MB 64.0 MB/s eta 0:00:00
29
- Installing collected packages: bitsandbytes
30
- Successfully installed bitsandbytes-0.45.0
Dockerfile CHANGED
@@ -1,19 +1,59 @@
1
- # Use Python 3.12 slim image as base
2
- FROM python:3.12-slim
3
-
4
- RUN useradd -m -u 1000 user
5
- USER user
6
- ENV PATH="/home/user/.local/bin:$PATH"
7
 
 
8
  WORKDIR /app
9
 
10
- COPY --chown=user ./requirements.txt requirements.txt
11
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
12
 
13
- COPY --chown=user . /app
14
- COPY --chown=user main/ /app/main
15
 
16
- EXPOSE 7860
 
 
 
17
 
18
- # We run the app object in the app.py file in the main folder.
19
- CMD ["uvicorn", "main.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # Use Python 3.10 as base image for better compatibility with ML libraries
2
+ FROM python:3.10-slim
 
 
 
 
3
 
4
+ # Set working directory
5
  WORKDIR /app
6
 
7
+ # Install git and required system dependencies
8
+ RUN apt-get update && \
9
+ apt-get install -y git && \
10
+ apt-get clean && \
11
+ rm -rf /var/lib/apt/lists/*
12
+
13
+ # Create cache directory and set permissions
14
+ RUN mkdir -p /app/.cache/huggingface && \
15
+ chmod 777 /app/.cache/huggingface
16
+
17
+ # Set environment variables for cache
18
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface/hub
19
+ ENV HF_HOME=/app/.cache/huggingface
20
+
21
+ # Copy requirements first to leverage Docker cache
22
+ COPY requirements.txt .
23
+
24
+ # Install Python dependencies
25
+ RUN pip install --no-cache-dir -r requirements.txt
26
+
27
+ # Copy the rest of the application
28
+ COPY . .
29
+
30
+ # Create checkpoints directory with proper permissions
31
+ RUN mkdir -p /app/checkpoints && \
32
+ chmod 777 /app/checkpoints
33
+
34
+ # The token will be passed during build time
35
+ ARG HF_TOKEN
36
+ ENV HF_TOKEN=${HF_TOKEN}
37
+
38
+ # Download the Llama 2 model using litgpt
39
+ # Only proceed if HF_TOKEN is provided
40
+ RUN if [ -n "$HF_TOKEN" ]; then \
41
+ python -c "from huggingface_hub import login; from litgpt.cli import download; login('${HF_TOKEN}'); download('meta-llama/Llama-2-3b-chat-hf', '/app/checkpoints')"; \
42
+ else \
43
+ echo "No Hugging Face token provided. Model will need to be downloaded separately."; \
44
+ fi
45
+
46
+ # Set environment variables
47
+ ENV LLM_ENGINE_HOST=0.0.0.0
48
+ ENV LLM_ENGINE_PORT=8001
49
 
50
+ # Update MODEL_PATH for the new model
51
+ ENV MODEL_PATH=/app/checkpoints/meta-llama/Llama-2-3b-chat-hf
52
 
53
+ # Expose both ports:
54
+ # 8001 for FastAPI
55
+ # 7860 for Hugging Face Spaces
56
+ EXPOSE 8001 7860
57
 
58
+ # Command to run the application
59
+ CMD ["python", "main/main.py"]
README.md CHANGED
@@ -1,162 +1,28 @@
1
  ---
2
- title: LLMServer
3
- emoji: 👹
4
  colorFrom: indigo
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- # LLM Server
11
 
12
- This repository contains a FastAPI-based server that serves open-source Large Language Models from Hugging Face.
13
 
14
- ## Getting Started
15
 
16
- These instructions will help you set up and run the project on your local machine.
17
 
18
- ### Prerequisites
19
 
20
- - Python 3.10 or higher
21
- - Git
22
 
23
- ### Cloning the Repository
24
-
25
- Choose one of the following methods to clone the repository:
26
-
27
- #### HTTPS
28
- ```bash
29
- git clone https://huggingface.co/spaces/TeamGenKI/LLMServer
30
- cd project-name
31
- ```
32
-
33
- #### SSH
34
- ```bash
35
- git clone [email protected]:spaces/TeamGenKI/LLMServer
36
- cd project-name
37
- ```
38
-
39
- ### Setting Up the Virtual Environment
40
-
41
- #### Windows
42
- ```bash
43
- # Create virtual environment
44
- python -m venv myenv
45
-
46
- # Activate virtual environment
47
- myenv\Scripts\activate
48
-
49
- # Install dependencies
50
- pip install -r requirements.txt
51
- ```
52
-
53
- #### Linux
54
- ```bash
55
- # Create virtual environment
56
- python -m venv myenv
57
-
58
- # Activate virtual environment
59
- source myenv/bin/activate
60
-
61
- # Install dependencies
62
- pip install -r requirements.txt
63
- ```
64
-
65
- #### macOS
66
- ```bash
67
- # Create virtual environment
68
- python3 -m venv myenv
69
-
70
- # Activate virtual environment
71
- source myenv/bin/activate
72
-
73
- # Install dependencies
74
- pip3 install -r requirements.txt
75
  ```
76
-
77
- ### Running the Application
78
-
79
- Once you have set up your environment and installed the dependencies, you can start the FastAPI application:
80
-
81
- ```bash
82
- uvicorn main.app:app --reload
83
- ```
84
-
85
- The API will be available at `http://localhost:8001`
86
-
87
- ### API Documentation
88
-
89
- Once the application is running, you can access:
90
- - Interactive API documentation (Swagger UI) at `http://localhost:8000/docs`
91
- - Alternative API documentation (ReDoc) at `http://localhost:8000/redoc`
92
-
93
- ### Deactivating the Virtual Environment
94
-
95
- When you're done working on the project, you can deactivate the virtual environment:
96
-
97
- ```bash
98
- deactivate
99
- ```
100
-
101
- ## Contributing
102
-
103
- [Add contributing guidelines here]
104
-
105
- ## License
106
-
107
- [Add license information here]
108
-
109
- ## Project Structure
110
-
111
- ```
112
- .
113
- ├── Dockerfile
114
- ├── main
115
- │ ├── api.py
116
- │ ├── app.py
117
- │ ├── config.yaml
118
- │ ├── env_template
119
- │ ├── __init__.py
120
- │ ├── logs
121
- │ │ └── llm_api.log
122
- │ ├── models
123
- │ ├── __pycache__
124
- │ │ ├── api.cpython-39.pyc
125
- │ │ ├── app.cpython-39.pyc
126
- │ │ ├── __init__.cpython-39.pyc
127
- │ │ └── routes.cpython-39.pyc
128
- │ ├── routes.py
129
- │ ├── test_locally.py
130
- │ └── utils
131
- │ ├── errors.py
132
- │ ├── helpers.py
133
- │ ├── __init__.py
134
- │ ├── logging.py
135
- │ ├── __pycache__
136
- │ │ ├── helpers.cpython-39.pyc
137
- │ │ ├── __init__.cpython-39.pyc
138
- │ │ ├── logging.cpython-39.pyc
139
- │ │ └── validation.cpython-39.pyc
140
- │ └── validation.py
141
- ├── README.md
142
- └── requirements.txt
143
- ```
144
-
145
- ERROR:
146
-
147
- INFO: 127.0.0.1:60874 - "POST /api/v1/model/download?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 200 OK
148
- 2025-01-13 16:18:45,409 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
149
- 2025-01-13 16:18:45,409 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
150
- 2025-01-13 16:18:45,412 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
151
- The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
152
- Could not find the bitsandbytes CUDA binary at PosixPath('/home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.13/site-packages/bitsandbytes/libbitsandbytes_cuda124.so')
153
- g++ (GCC) 14.2.1 20240910
154
- Copyright (C) 2024 Free Software Foundation, Inc.
155
- This is free software; see the source for copying conditions. There is NO
156
- warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
157
-
158
- 2025-01-13 16:18:45,982 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
159
- Dynamo is not supported on Python 3.13+
160
- 2025-01-13 16:18:45,982 - api_routes - ERROR - Error initializing model: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
161
- Dynamo is not supported on Python 3.13+
162
- INFO: 127.0.0.1:38330 - "POST /api/v1/model/initialize?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 500 Internal Server Error
 
1
  ---
2
+ title: LLM Engine
3
+ emoji: 🐨
4
  colorFrom: indigo
5
+ colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ short_description: LLM Engine for Team Gen KI (GPU goes here)
9
  ---
10
 
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
 
 
13
 
14
+ HTTP Clone:
15
 
16
+ git clone https://huggingface.co/spaces/TeamGenKI/LLM-Engine
17
 
18
+ SSH Clone:
19
 
20
+ git clone git@hf.co:spaces/TeamGenKI/LLM-Engine
 
21
 
22
+ ```mermaid
23
+ folders
24
+ LLM-Engine
25
+ Main
26
+ main.py
27
+ utils.py
 
28
  ```
main/.cache/hub/version.txt DELETED
@@ -1 +0,0 @@
1
- 1
 
 
main/api.py CHANGED
@@ -1,352 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from threading import Thread
4
- import torch
5
- from typing import Optional, List, AsyncIterator
6
- import asyncio
7
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
8
- from .utils.logging import setup_logger
9
-
10
- class LLMApi:
11
- def __init__(self, config: dict):
12
- """Initialize the LLM API with configuration."""
13
- self.logger = setup_logger(config, "llm_api")
14
- self.logger.info("Initializing LLM API")
15
-
16
- # Set up paths
17
- self.base_path = Path(config["model"]["base_path"])
18
- self.models_path = self.base_path / config["folders"]["models"]
19
- self.cache_path = self.base_path / config["folders"]["cache"]
20
-
21
- # Initialize model variables for both generation and embedding
22
- self.generation_model = None
23
- self.generation_model_name = None
24
- self.embedding_model = None
25
- self.embedding_model_name = None
26
- self.tokenizer = None
27
- self.embedding_tokenizer = None
28
-
29
- # Generation parameters from config
30
- gen_config = config["model"]["generation"]
31
- self.max_new_tokens = gen_config["max_new_tokens"]
32
- self.do_sample = gen_config["do_sample"]
33
- self.temperature = gen_config["temperature"]
34
- self.repetition_penalty = gen_config["repetition_penalty"]
35
-
36
- self.generation_config = {
37
- "max_new_tokens": self.max_new_tokens,
38
- "do_sample": self.do_sample,
39
- "temperature": self.temperature,
40
- "repetition_penalty": self.repetition_penalty,
41
- "eos_token_id": None,
42
- "pad_token_id": None
43
- }
44
-
45
- # Create necessary directories
46
- self.models_path.mkdir(parents=True, exist_ok=True)
47
- self.cache_path.mkdir(parents=True, exist_ok=True)
48
-
49
- # Set cache directory for transformers
50
- os.environ['HF_HOME'] = str(self.cache_path)
51
-
52
- self.logger.info("LLM API initialized successfully")
53
-
54
- def download_model(self, model_name: str) -> None:
55
- """
56
- Download a model and its tokenizer to the models directory.
57
-
58
- Args:
59
- model_name: The name of the model to download (e.g., "norallm/normistral-11b-warm")
60
- """
61
- self.logger.info(f"Starting download of model: {model_name}")
62
- try:
63
- model_path = self.models_path / model_name.split('/')[-1]
64
-
65
- # Download and save model
66
- self.logger.info(f"Enabling stdout logging for download")
67
- self.logger.enable_stream_to_logger()
68
- model = AutoModelForCausalLM.from_pretrained(model_name)
69
-
70
- # Download and save tokenizer
71
- tokenizer = AutoTokenizer.from_pretrained(model_name)
72
- self.logger.info(f"Disabling stdout logging")
73
- self.logger.disable_stream_to_logger()
74
-
75
- self.logger.info(f"Saving model to {model_path}")
76
- model.save_pretrained(model_path)
77
- tokenizer.save_pretrained(model_path)
78
-
79
- self.logger.info(f"Successfully downloaded model: {model_name}")
80
- except Exception as e:
81
- self.logger.error(f"Failed to download model {model_name}: {str(e)}")
82
- raise
83
-
84
- def initialize_model(self, model_name: str) -> None:
85
- """
86
- Initialize a model and tokenizer for text generation.
87
- Handles different platforms (CUDA, MPS, CPU) appropriately.
88
- """
89
- self.logger.info(f"Initializing generation model: {model_name}")
90
- try:
91
- self.generation_model_name = model_name
92
- local_model_path = self.models_path / model_name.split('/')[-1]
93
-
94
- # Check if model exists locally
95
- if local_model_path.exists():
96
- self.logger.info(f"Loading model from local path: {local_model_path}")
97
- model_path = local_model_path
98
- else:
99
- self.logger.info(f"Loading model from source: {model_name}")
100
- model_path = model_name
101
-
102
- # Check platform and set appropriate configuration
103
- if torch.cuda.is_available():
104
- self.logger.info("CUDA detected, using GPU with quantization")
105
- quantization_config = BitsAndBytesConfig(
106
- load_in_8bit=True,
107
- llm_int8_threshold=3.0
108
- )
109
- self.generation_model = AutoModelForCausalLM.from_pretrained(
110
- model_path,
111
- device_map="auto",
112
- quantization_config=quantization_config,
113
- torch_dtype=torch.float16
114
- )
115
- elif torch.backends.mps.is_available():
116
- self.logger.info("Apple Silicon detected, using MPS device")
117
- self.generation_model = AutoModelForCausalLM.from_pretrained(
118
- model_path,
119
- device_map="mps",
120
- torch_dtype=torch.float16
121
- )
122
- else:
123
- self.logger.info("No GPU detected, falling back to CPU")
124
- self.generation_model = AutoModelForCausalLM.from_pretrained(
125
- model_path,
126
- device_map="cpu",
127
- torch_dtype=torch.float32 # Use full precision for CPU
128
- )
129
-
130
- self.tokenizer = AutoTokenizer.from_pretrained(model_path)
131
-
132
- # Update generation config with tokenizer-specific values
133
- self.generation_config["eos_token_id"] = self.tokenizer.eos_token_id
134
- self.generation_config["pad_token_id"] = self.tokenizer.eos_token_id
135
-
136
- self.logger.info(f"Successfully initialized generation model: {model_name}")
137
- except Exception as e:
138
- self.logger.error(f"Failed to initialize generation model {model_name}: {str(e)}")
139
- raise
140
-
141
- def initialize_embedding_model(self, model_name: str) -> None:
142
- """
143
- Initialize a model and tokenizer specifically for embeddings.
144
-
145
- Args:
146
- model_name: The name of the model to initialize for embeddings
147
- """
148
- self.logger.info(f"Initializing embedding model: {model_name}")
149
- try:
150
- self.embedding_model_name = model_name
151
- local_model_path = self.models_path / model_name.split('/')[-1]
152
-
153
- # Check if model exists locally
154
- if local_model_path.exists():
155
- self.logger.info(f"Loading embedding model from local path: {local_model_path}")
156
- model_path = local_model_path
157
- else:
158
- self.logger.info(f"Loading embedding model from source: {model_name}")
159
- model_path = model_name
160
-
161
- self.embedding_model = AutoModelForCausalLM.from_pretrained(
162
- model_path,
163
- device_map="auto",
164
- load_in_8bit=True,
165
- torch_dtype=torch.float16
166
- )
167
- self.embedding_tokenizer = AutoTokenizer.from_pretrained(model_path)
168
-
169
- self.logger.info(f"Successfully initialized embedding model: {model_name}")
170
- except Exception as e:
171
- self.logger.error(f"Failed to initialize embedding model {model_name}: {str(e)}")
172
- raise
173
-
174
- def has_chat_template(self) -> bool:
175
- """Check if the current model has a chat template."""
176
- try:
177
- self.tokenizer.apply_chat_template(
178
- [{"role": "user", "content": "test"}],
179
- tokenize=False,
180
- )
181
- return True
182
- except (ValueError, AttributeError):
183
- return False
184
-
185
- def _prepare_prompt(self, prompt: str, system_message: Optional[str] = None) -> str:
186
- """
187
- Prepare the prompt text, either using the model's chat template if available,
188
- or falling back to a simple OpenAI-style format.
189
- """
190
- try:
191
- messages = []
192
- if system_message:
193
- messages.append({"role": "system", "content": system_message})
194
- messages.append({"role": "user", "content": prompt})
195
-
196
- return self.tokenizer.apply_chat_template(
197
- messages,
198
- tokenize=False,
199
- add_generation_prompt=True
200
- )
201
- except (ValueError, AttributeError):
202
- template = ""
203
- if system_message:
204
- template += f"System: {system_message}\n\n"
205
- template += f"User: {prompt}\n\nAssistant: "
206
- return template
207
-
208
- def generate_response(
209
- self,
210
- prompt: str,
211
- system_message: Optional[str] = None,
212
- max_new_tokens: Optional[int] = None
213
- ) -> str:
214
- """
215
- Generate a complete response for the given prompt.
216
- """
217
- self.logger.debug(f"Generating response for prompt: {prompt[:50]}...")
218
-
219
- if self.generation_model is None:
220
- raise RuntimeError("Generation model not initialized. Call initialize_model first.")
221
-
222
- try:
223
- text = self._prepare_prompt(prompt, system_message)
224
- inputs = self.tokenizer([text], return_tensors="pt")
225
-
226
- # Remove token_type_ids if present
227
- model_inputs = {k: v.to(self.generation_model.device) for k, v in inputs.items()
228
- if k != 'token_type_ids'}
229
-
230
- generation_config = self.generation_config.copy()
231
- if max_new_tokens:
232
- generation_config["max_new_tokens"] = max_new_tokens
233
-
234
- generated_ids = self.generation_model.generate(
235
- **model_inputs,
236
- **generation_config
237
- )
238
-
239
- generated_ids = [
240
- output_ids[len(input_ids):]
241
- for input_ids, output_ids in zip(model_inputs['input_ids'], generated_ids)
242
- ]
243
-
244
- response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
245
- self.logger.debug(f"Generated response: {response[:50]}...")
246
- return response
247
-
248
- except Exception as e:
249
- self.logger.error(f"Error generating response: {str(e)}")
250
- raise
251
-
252
- async def generate_stream(
253
- self,
254
- prompt: str,
255
- system_message: Optional[str] = None,
256
- max_new_tokens: Optional[int] = None
257
- ) -> AsyncIterator[str]:
258
- """
259
- Generate a streaming response for the given prompt.
260
- """
261
- self.logger.debug(f"Starting streaming generation for prompt: {prompt[:50]}...")
262
-
263
- if self.generation_model is None:
264
- raise RuntimeError("Generation model not initialized. Call initialize_model first.")
265
-
266
- try:
267
- text = self._prepare_prompt(prompt, system_message)
268
- inputs = self.tokenizer([text], return_tensors="pt")
269
-
270
- # Remove token_type_ids if present
271
- model_inputs = {k: v.to(self.generation_model.device) for k, v in inputs.items()
272
- if k != 'token_type_ids'}
273
-
274
- # Configure generation
275
- generation_config = self.generation_config.copy()
276
- if max_new_tokens:
277
- generation_config["max_new_tokens"] = max_new_tokens
278
-
279
- # Set up streaming
280
- streamer = TextIteratorStreamer(self.tokenizer)
281
- generation_kwargs = dict(
282
- **model_inputs,
283
- **generation_config,
284
- streamer=streamer
285
- )
286
-
287
- # Create a thread to run the generation
288
- thread = Thread(target=self.generation_model.generate, kwargs=generation_kwargs)
289
- thread.start()
290
-
291
- # Use async generator to yield chunks
292
- for new_text in streamer:
293
- self.logger.debug(f"Generated chunk: {new_text[:50]}...")
294
- yield new_text
295
- # Add a small delay to allow other tasks to run
296
- await asyncio.sleep(0)
297
-
298
- except Exception as e:
299
- self.logger.error(f"Error in streaming generation: {str(e)}")
300
- raise
301
-
302
- def generate_embedding(self, text: str) -> List[float]:
303
- """
304
- Generate a single embedding vector for a chunk of text using the dedicated embedding model.
305
- Returns a list of floats representing the text embedding.
306
- """
307
- self.logger.debug(f"Generating embedding for text: {text[:50]}...")
308
-
309
- if self.embedding_model is None or self.embedding_tokenizer is None:
310
- raise RuntimeError("Embedding model not initialized. Call initialize_embedding_model first.")
311
-
312
- try:
313
- # Tokenize the input text and ensure input_ids are Long type
314
- inputs = self.embedding_tokenizer(text, return_tensors='pt')
315
- input_ids = inputs.input_ids.to(dtype=torch.long, device=self.embedding_model.device)
316
-
317
- # Get the model's dtype from its parameters for the attention mask
318
- model_dtype = next(self.embedding_model.parameters()).dtype
319
-
320
- # Create an attention mask with matching dtype
321
- attention_mask = torch.zeros(
322
- input_ids.size(0),
323
- 1,
324
- input_ids.size(1),
325
- input_ids.size(1),
326
- device=input_ids.device,
327
- dtype=model_dtype
328
- )
329
-
330
- # Get model outputs
331
- with torch.no_grad():
332
- outputs = self.embedding_model(
333
- input_ids=input_ids,
334
- attention_mask=attention_mask,
335
- output_hidden_states=True,
336
- return_dict=True
337
- )
338
-
339
- # Get the last hidden state
340
- last_hidden_state = outputs.hidden_states[-1]
341
-
342
- # Average the hidden state over all tokens (excluding padding)
343
- embedding = last_hidden_state[0].mean(dim=0)
344
-
345
- # Convert to regular Python list
346
- embedding_list = embedding.cpu().tolist()
347
- self.logger.debug(f"Generated embedding of length: {len(embedding_list)}")
348
- return embedding_list
349
-
350
- except Exception as e:
351
- self.logger.error(f"Error generating embedding: {str(e)}")
352
- raise
 
 
 
 
main/app.py DELETED
@@ -1,55 +0,0 @@
1
- import yaml
2
- from fastapi import FastAPI
3
- from fastapi.middleware.cors import CORSMiddleware
4
- from .routes import router, init_router
5
- from .utils.logging import setup_logger
6
- from .utils.validation import validate_hf
7
- from .utils.helpers import load_config
8
-
9
-
10
- config = load_config()
11
- logger = setup_logger(config, "main")
12
-
13
- def create_app():
14
- global config
15
- global logger
16
-
17
- validate_hf(setup_logger, config)
18
- logger.info("Starting LLM API server")
19
-
20
- app = FastAPI(
21
- title="LLM API",
22
- description="API for Large Language Model operations",
23
- version=config["api"]["version"]
24
- )
25
-
26
- # Add CORS middleware
27
- app.add_middleware(
28
- CORSMiddleware,
29
- allow_origins=config["api"]["cors"]["origins"],
30
- allow_credentials=config["api"]["cors"]["credentials"],
31
- allow_methods=["*"],
32
- allow_headers=["*"],
33
- )
34
-
35
- # Initialize routes with config
36
- init_router(config)
37
-
38
- app.include_router(router, prefix=f"{config['api']['prefix']}/{config['api']['version']}")
39
-
40
- logger.info("FastAPI application created successfully")
41
- return app
42
-
43
- app = create_app()
44
-
45
- if __name__ == "__main__":
46
- host = config["server"]["host"]
47
- port = config["server"]["port"]
48
- import uvicorn
49
- uvicorn.run(
50
- app,
51
- host=host,
52
- port=port,
53
- log_level=config["logging"]["level"].lower()
54
- )
55
- logger.info(f"LLM API server started on {host}:{port}")
 
main/env_template DELETED
@@ -1,26 +0,0 @@
1
- # Hugging Face Authentication
2
- HF_TOKEN=your_token_here
3
-
4
- # CUDA Device Configuration
5
- CUDA_VISIBLE_DEVICES=0,1 # Specify GPUs to use (e.g., 0 for first GPU, 0,1 for first two)
6
-
7
- # Memory Management
8
- PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
9
- CUDA_LAUNCH_BLOCKING=1 # Set to 1 for debugging
10
- CUDA_AUTO_BOOST=0 # Disable auto boost for consistent performance
11
-
12
- # Cache Paths
13
- CUDA_CACHE_PATH=/path/to/cuda/cache
14
- TRANSFORMERS_CACHE=/path/to/transformers/cache
15
-
16
- # Performance Settings
17
- TF_ENABLE_ONEDNN_OPTS=1
18
- TF_GPU_ALLOCATOR=cuda_malloc_async
19
-
20
- # Model Settings
21
- TRANSFORMERS_OFFLINE=0 # Set to 1 for offline mode
22
-
23
- # Logging
24
- LOG_LEVEL=INFO # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
25
-
26
- # Add any additional environment-specific variables below
main/logs/llm_api.log DELETED
@@ -1,703 +0,0 @@
1
- 2025-01-09 15:54:08,215 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
2
- 2025-01-09 15:54:08,215 - hf_validation - ERROR - No HF_TOKEN found in environment variables
3
- 2025-01-09 15:54:08,215 - main - INFO - Starting LLM API server
4
- 2025-01-09 15:54:08,216 - llm_api - INFO - Initializing LLM API
5
- 2025-01-09 15:54:08,216 - llm_api - INFO - LLM API initialized successfully
6
- 2025-01-09 15:54:08,216 - api_routes - INFO - Router initialized with LLM API instance
7
- 2025-01-09 15:54:08,218 - main - INFO - FastAPI application created successfully
8
- 2025-01-09 16:46:10,118 - api_routes - INFO - Received request to download model: microsoft/phi-4
9
- 2025-01-09 16:46:10,118 - llm_api - INFO - Starting download of model: microsoft/phi-4
10
- 2025-01-09 16:46:10,118 - llm_api - INFO - Enabling stdout logging for download
11
- 2025-01-09 17:00:32,400 - llm_api - INFO - Disabling stdout logging
12
- 2025-01-09 17:00:32,400 - llm_api - INFO - Saving model to main/models/phi-4
13
- 2025-01-09 17:02:39,928 - llm_api - INFO - Successfully downloaded model: microsoft/phi-4
14
- 2025-01-09 17:02:41,075 - api_routes - INFO - Successfully downloaded model: microsoft/phi-4
15
- 2025-01-09 17:02:41,080 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
16
- 2025-01-09 17:02:41,080 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
17
- 2025-01-09 17:02:41,081 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
18
- 2025-01-09 17:02:41,377 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`
19
- 2025-01-09 17:02:41,377 - api_routes - ERROR - Error initializing model: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`
20
- 2025-01-09 17:11:25,843 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
21
- 2025-01-09 17:11:25,843 - hf_validation - ERROR - No HF_TOKEN found in environment variables
22
- 2025-01-09 17:11:25,843 - main - INFO - Starting LLM API server
23
- 2025-01-09 17:11:25,843 - llm_api - INFO - Initializing LLM API
24
- 2025-01-09 17:11:25,844 - llm_api - INFO - LLM API initialized successfully
25
- 2025-01-09 17:11:25,844 - api_routes - INFO - Router initialized with LLM API instance
26
- 2025-01-09 17:11:25,846 - main - INFO - FastAPI application created successfully
27
- 2025-01-09 17:11:38,299 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
28
- 2025-01-09 17:11:38,299 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
29
- 2025-01-09 17:11:38,299 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
30
- 2025-01-09 17:11:38,487 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
31
- 2025-01-09 17:11:38,487 - api_routes - ERROR - Error initializing model: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
32
- 2025-01-09 17:12:48,606 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
33
- 2025-01-09 17:12:48,606 - hf_validation - ERROR - No HF_TOKEN found in environment variables
34
- 2025-01-09 17:12:48,606 - main - INFO - Starting LLM API server
35
- 2025-01-09 17:12:48,606 - llm_api - INFO - Initializing LLM API
36
- 2025-01-09 17:12:48,606 - llm_api - INFO - LLM API initialized successfully
37
- 2025-01-09 17:12:48,606 - api_routes - INFO - Router initialized with LLM API instance
38
- 2025-01-09 17:12:48,608 - main - INFO - FastAPI application created successfully
39
- 2025-01-09 17:12:59,453 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
40
- 2025-01-09 17:12:59,453 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
41
- 2025-01-09 17:12:59,453 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
42
- 2025-01-09 17:12:59,628 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
43
- 2025-01-09 17:12:59,628 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
44
- 2025-01-09 17:14:44,390 - api_routes - INFO - Received request to initialize model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
45
- 2025-01-09 17:14:44,390 - llm_api - INFO - Initializing generation model: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
46
- 2025-01-09 17:14:44,390 - llm_api - INFO - Loading model from source: huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated
47
- 2025-01-09 17:14:53,032 - llm_api - ERROR - Failed to initialize generation model huihui-ai/Qwen2.5-Coder-32B-Instruct-abliterated: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
48
- 2025-01-09 17:14:53,032 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
49
- 2025-01-09 17:15:14,956 - api_routes - INFO - Received request to initialize model: microsoft/phi-4
50
- 2025-01-09 17:15:14,956 - llm_api - INFO - Initializing generation model: microsoft/phi-4
51
- 2025-01-09 17:15:14,956 - llm_api - INFO - Loading model from local path: main/models/phi-4
52
- 2025-01-09 17:15:14,965 - llm_api - ERROR - Failed to initialize generation model microsoft/phi-4: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
53
- 2025-01-09 17:15:14,965 - api_routes - ERROR - Error initializing model: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend
54
- 2025-01-13 16:04:32,247 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
55
- 2025-01-13 16:04:32,247 - hf_validation - ERROR - No HF_TOKEN found in environment variables
56
- 2025-01-13 16:04:32,247 - main - INFO - Starting LLM API server
57
- 2025-01-13 16:04:32,248 - llm_api - INFO - Initializing LLM API
58
- 2025-01-13 16:04:32,248 - llm_api - INFO - LLM API initialized successfully
59
- 2025-01-13 16:04:32,248 - api_routes - INFO - Router initialized with LLM API instance
60
- 2025-01-13 16:04:32,252 - main - INFO - FastAPI application created successfully
61
- 2025-01-13 16:05:27,996 - api_routes - INFO - Received request to download model: microsoft/Phi-3.5-mini-instruct
62
- 2025-01-13 16:05:27,996 - llm_api - INFO - Starting download of model: microsoft/Phi-3.5-mini-instruct
63
- 2025-01-13 16:05:27,996 - llm_api - INFO - Enabling stdout logging for download
64
- 2025-01-13 16:08:46,773 - llm_api - INFO - Disabling stdout logging
65
- 2025-01-13 16:08:46,773 - llm_api - INFO - Saving model to main/models/Phi-3.5-mini-instruct
66
- 2025-01-13 16:10:23,543 - llm_api - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
67
- 2025-01-13 16:10:24,432 - api_routes - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
68
- 2025-01-13 16:18:45,409 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
69
- 2025-01-13 16:18:45,409 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
70
- 2025-01-13 16:18:45,412 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
71
- 2025-01-13 16:18:45,982 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
72
- Dynamo is not supported on Python 3.13+
73
- 2025-01-13 16:18:45,982 - api_routes - ERROR - Error initializing model: Failed to import transformers.integrations.bitsandbytes because of the following error (look up to see its traceback):
74
- Dynamo is not supported on Python 3.13+
75
- 2025-01-14 11:41:25,502 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
76
- 2025-01-14 11:41:25,502 - hf_validation - ERROR - No HF_TOKEN found in environment variables
77
- 2025-01-14 11:41:25,502 - main - INFO - Starting LLM API server
78
- 2025-01-14 11:41:25,503 - llm_api - INFO - Initializing LLM API
79
- 2025-01-14 11:41:25,503 - llm_api - INFO - LLM API initialized successfully
80
- 2025-01-14 11:41:25,503 - api_routes - INFO - Router initialized with LLM API instance
81
- 2025-01-14 11:41:25,509 - main - INFO - FastAPI application created successfully
82
- 2025-01-14 11:48:33,807 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
83
- 2025-01-14 11:48:33,807 - hf_validation - ERROR - No HF_TOKEN found in environment variables
84
- 2025-01-14 11:48:33,807 - main - INFO - Starting LLM API server
85
- 2025-01-14 11:48:33,807 - llm_api - INFO - Initializing LLM API
86
- 2025-01-14 11:48:33,807 - llm_api - INFO - LLM API initialized successfully
87
- 2025-01-14 11:48:33,807 - api_routes - INFO - Router initialized with LLM API instance
88
- 2025-01-14 11:48:33,812 - main - INFO - FastAPI application created successfully
89
- 2025-01-14 11:53:20,777 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
90
- 2025-01-14 11:53:20,777 - hf_validation - ERROR - No HF_TOKEN found in environment variables
91
- 2025-01-14 11:53:20,777 - main - INFO - Starting LLM API server
92
- 2025-01-14 11:53:20,777 - llm_api - INFO - Initializing LLM API
93
- 2025-01-14 11:53:20,778 - llm_api - INFO - LLM API initialized successfully
94
- 2025-01-14 11:53:20,778 - api_routes - INFO - Router initialized with LLM API instance
95
- 2025-01-14 11:53:20,783 - main - INFO - FastAPI application created successfully
96
- 2025-01-14 11:54:28,143 - api_routes - INFO - Received request to download model: microsoft/Phi-3.5-mini-instruct
97
- 2025-01-14 11:54:28,143 - llm_api - INFO - Starting download of model: microsoft/Phi-3.5-mini-instruct
98
- 2025-01-14 11:54:28,143 - llm_api - INFO - Enabling stdout logging for download
99
- 2025-01-14 11:54:47,061 - llm_api - INFO - Disabling stdout logging
100
- 2025-01-14 11:54:47,061 - llm_api - INFO - Saving model to main/models/Phi-3.5-mini-instruct
101
- 2025-01-14 11:56:40,600 - llm_api - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
102
- 2025-01-14 11:56:41,266 - api_routes - INFO - Successfully downloaded model: microsoft/Phi-3.5-mini-instruct
103
- 2025-01-14 11:56:41,364 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
104
- 2025-01-14 11:56:41,365 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
105
- 2025-01-14 11:56:41,367 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
106
- 2025-01-14 11:56:45,322 - llm_api - ERROR - Failed to initialize generation model microsoft/Phi-3.5-mini-instruct: /home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.12/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cget_col_row_stats
107
- 2025-01-14 11:56:45,322 - api_routes - ERROR - Error initializing model: /home/aurelio/Desktop/Projects/LLMServer/myenv/lib/python3.12/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cget_col_row_stats
108
- 2025-01-14 12:29:54,971 - main - INFO - LLM API server started on 0.0.0.0:8001
109
- 2025-01-14 12:30:01,275 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
110
- 2025-01-14 12:30:01,275 - hf_validation - ERROR - No HF_TOKEN found in environment variables
111
- 2025-01-14 12:30:01,275 - main - INFO - Starting LLM API server
112
- 2025-01-14 12:30:01,275 - llm_api - INFO - Initializing LLM API
113
- 2025-01-14 12:30:01,275 - llm_api - INFO - LLM API initialized successfully
114
- 2025-01-14 12:30:01,276 - api_routes - INFO - Router initialized with LLM API instance
115
- 2025-01-14 12:30:01,280 - main - INFO - FastAPI application created successfully
116
- 2025-01-14 12:31:15,345 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
117
- 2025-01-14 12:31:15,345 - hf_validation - ERROR - No HF_TOKEN found in environment variables
118
- 2025-01-14 12:31:15,345 - main - INFO - Starting LLM API server
119
- 2025-01-14 12:31:15,345 - llm_api - INFO - Initializing LLM API
120
- 2025-01-14 12:31:15,346 - llm_api - INFO - LLM API initialized successfully
121
- 2025-01-14 12:31:15,346 - api_routes - INFO - Router initialized with LLM API instance
122
- 2025-01-14 12:31:15,350 - main - INFO - FastAPI application created successfully
123
- 2025-01-14 12:31:43,376 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
124
- 2025-01-14 12:31:43,376 - hf_validation - ERROR - No HF_TOKEN found in environment variables
125
- 2025-01-14 12:31:43,376 - main - INFO - Starting LLM API server
126
- 2025-01-14 12:31:43,377 - llm_api - INFO - Initializing LLM API
127
- 2025-01-14 12:31:43,377 - llm_api - INFO - LLM API initialized successfully
128
- 2025-01-14 12:31:43,377 - api_routes - INFO - Router initialized with LLM API instance
129
- 2025-01-14 12:31:43,381 - main - INFO - FastAPI application created successfully
130
- 2025-01-14 12:31:51,142 - llm_api - INFO - INFO: 127.0.0.1:52554 - "GET /docs HTTP/1.1" 200 OK
131
- 2025-01-14 12:31:51,311 - llm_api - INFO - INFO: 127.0.0.1:52554 - "GET /openapi.json HTTP/1.1" 200 OK
132
- 2025-01-14 12:32:10,756 - api_routes - INFO - Received request to initialize model: microsoft/Phi-3.5-mini-instruct
133
- 2025-01-14 12:32:10,757 - llm_api - INFO - Initializing generation model: microsoft/Phi-3.5-mini-instruct
134
- 2025-01-14 12:32:10,757 - llm_api - INFO - Loading model from local path: main/models/Phi-3.5-mini-instruct
135
- 2025-01-14 12:32:26,447 - llm_api - INFO - Successfully initialized generation model: microsoft/Phi-3.5-mini-instruct
136
- 2025-01-14 12:32:26,448 - api_routes - INFO - Successfully initialized model: microsoft/Phi-3.5-mini-instruct
137
- 2025-01-14 12:32:26,448 - llm_api - INFO - INFO: 127.0.0.1:34282 - "POST /api/v1/model/initialize?model_name=microsoft%2FPhi-3.5-mini-instruct HTTP/1.1" 200 OK
138
- 2025-01-14 12:33:13,272 - api_routes - INFO - Received generation request for prompt: Tell me about yourself, and your capabilities...
139
- 2025-01-14 12:33:13,272 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself, and your capabilities...
140
- 2025-01-14 12:33:46,448 - llm_api - DEBUG - Generated response: I am Phi, an AI language model developed by Micros...
141
- 2025-01-14 12:33:46,448 - api_routes - INFO - Successfully generated response
142
- 2025-01-14 12:33:46,448 - llm_api - INFO - INFO: 127.0.0.1:57442 - "POST /api/v1/generate HTTP/1.1" 200 OK
143
- 2025-01-14 12:34:41,588 - api_routes - INFO - Received streaming generation request for prompt: Tell me about yourself, and your capabilities...
144
- 2025-01-14 12:34:41,588 - llm_api - DEBUG - Starting streaming generation for prompt: Tell me about yourself, and your capabilities...
145
- 2025-01-14 12:34:41,608 - llm_api - DEBUG - Generated chunk: <|system|> You are a helpful assistant<|end|><|use...
146
- 2025-01-14 12:34:41,689 - llm_api - DEBUG - Generated chunk: capabilities<|end|><|assistant|> ...
147
- 2025-01-14 12:34:41,757 - llm_api - DEBUG - Generated chunk: I ...
148
- 2025-01-14 12:34:41,827 - llm_api - DEBUG - Generated chunk: am ...
149
- 2025-01-14 12:34:41,895 - llm_api - DEBUG - Generated chunk: ...
150
- 2025-01-14 12:34:41,965 - llm_api - DEBUG - Generated chunk: ...
151
- 2025-01-14 12:34:42,033 - llm_api - DEBUG - Generated chunk: Phi, ...
152
- 2025-01-14 12:34:42,102 - llm_api - DEBUG - Generated chunk: an ...
153
- 2025-01-14 12:34:42,172 - llm_api - DEBUG - Generated chunk: ...
154
- 2025-01-14 12:34:42,241 - llm_api - DEBUG - Generated chunk: AI ...
155
- 2025-01-14 12:34:42,309 - llm_api - DEBUG - Generated chunk: language ...
156
- 2025-01-14 12:34:42,377 - llm_api - DEBUG - Generated chunk: model ...
157
- 2025-01-14 12:34:42,448 - llm_api - DEBUG - Generated chunk: created ...
158
- 2025-01-14 12:34:42,521 - llm_api - DEBUG - Generated chunk: by ...
159
- 2025-01-14 12:34:42,590 - llm_api - DEBUG - Generated chunk: ...
160
- 2025-01-14 12:34:42,656 - llm_api - DEBUG - Generated chunk: Microsoft. ...
161
- 2025-01-14 12:34:42,721 - llm_api - DEBUG - Generated chunk: While ...
162
- 2025-01-14 12:34:42,788 - llm_api - DEBUG - Generated chunk: I ...
163
- 2025-01-14 12:34:42,854 - llm_api - DEBUG - Generated chunk: ...
164
- 2025-01-14 12:34:42,925 - llm_api - DEBUG - Generated chunk: ...
165
- 2025-01-14 12:34:42,991 - llm_api - DEBUG - Generated chunk: don't ...
166
- 2025-01-14 12:34:43,063 - llm_api - DEBUG - Generated chunk: have ...
167
- 2025-01-14 12:34:43,131 - llm_api - DEBUG - Generated chunk: personal ...
168
- 2025-01-14 12:34:43,201 - llm_api - DEBUG - Generated chunk: experiences ...
169
- 2025-01-14 12:34:43,267 - llm_api - DEBUG - Generated chunk: or ...
170
- 2025-01-14 12:34:43,334 - llm_api - DEBUG - Generated chunk: feelings ...
171
- 2025-01-14 12:34:43,402 - llm_api - DEBUG - Generated chunk: like ...
172
- 2025-01-14 12:34:43,472 - llm_api - DEBUG - Generated chunk: humans ...
173
- 2025-01-14 12:34:43,537 - llm_api - DEBUG - Generated chunk: ...
174
- 2025-01-14 12:34:43,600 - llm_api - DEBUG - Generated chunk: do, ...
175
- 2025-01-14 12:34:43,663 - llm_api - DEBUG - Generated chunk: let ...
176
- 2025-01-14 12:34:43,729 - llm_api - DEBUG - Generated chunk: me ...
177
- 2025-01-14 12:34:43,793 - llm_api - DEBUG - Generated chunk: tell ...
178
- 2025-01-14 12:34:43,859 - llm_api - DEBUG - Generated chunk: you ...
179
- 2025-01-14 12:34:43,924 - llm_api - DEBUG - Generated chunk: more ...
180
- 2025-01-14 12:34:43,989 - llm_api - DEBUG - Generated chunk: about ...
181
- 2025-01-14 12:34:44,053 - llm_api - DEBUG - Generated chunk: my ...
182
- 2025-01-14 12:34:44,119 - llm_api - DEBUG - Generated chunk: ...
183
- 2025-01-14 12:34:44,183 - llm_api - DEBUG - Generated chunk: ...
184
- 2025-01-14 12:34:44,247 - llm_api - DEBUG - Generated chunk: functionalities:
185
- ...
186
- 2025-01-14 12:34:44,312 - llm_api - DEBUG - Generated chunk:
187
- ...
188
- 2025-01-14 12:34:44,375 - llm_api - DEBUG - Generated chunk: ...
189
- 2025-01-14 12:34:44,440 - llm_api - DEBUG - Generated chunk: ...
190
- 2025-01-14 12:34:44,505 - llm_api - DEBUG - Generated chunk: 1. ...
191
- 2025-01-14 12:34:44,567 - llm_api - DEBUG - Generated chunk: ...
192
- 2025-01-14 12:34:44,632 - llm_api - DEBUG - Generated chunk: **Language ...
193
- 2025-01-14 12:34:44,698 - llm_api - DEBUG - Generated chunk: ...
194
- 2025-01-14 12:34:44,763 - llm_api - DEBUG - Generated chunk: ...
195
- 2025-01-14 12:34:44,828 - llm_api - DEBUG - Generated chunk: ...
196
- 2025-01-14 12:34:44,892 - llm_api - DEBUG - Generated chunk: Understanding**: ...
197
- 2025-01-14 12:34:44,956 - llm_api - DEBUG - Generated chunk: My ...
198
- 2025-01-14 12:34:45,020 - llm_api - DEBUG - Generated chunk: primary ...
199
- 2025-01-14 12:34:45,085 - llm_api - DEBUG - Generated chunk: ...
200
- 2025-01-14 12:34:45,148 - llm_api - DEBUG - Generated chunk: capability ...
201
- 2025-01-14 12:34:45,211 - llm_api - DEBUG - Generated chunk: is ...
202
- 2025-01-14 12:34:45,275 - llm_api - DEBUG - Generated chunk: understanding ...
203
- 2025-01-14 12:34:45,338 - llm_api - DEBUG - Generated chunk: natural ...
204
- 2025-01-14 12:34:45,401 - llm_api - DEBUG - Generated chunk: human ...
205
- 2025-01-14 12:34:45,465 - llm_api - DEBUG - Generated chunk: languages ...
206
- 2025-01-14 12:34:45,529 - llm_api - DEBUG - Generated chunk: to ...
207
- 2025-01-14 12:34:45,592 - llm_api - DEBUG - Generated chunk: the ...
208
- 2025-01-14 12:34:45,658 - llm_api - DEBUG - Generated chunk: best ...
209
- 2025-01-14 12:34:45,728 - llm_api - DEBUG - Generated chunk: of ...
210
- 2025-01-14 12:34:45,804 - llm_api - DEBUG - Generated chunk: our ...
211
- 2025-01-14 12:34:45,871 - llm_api - DEBUG - Generated chunk: current ...
212
- 2025-01-14 12:34:45,938 - llm_api - DEBUG - Generated chunk: technology ...
213
- 2025-01-14 12:34:46,004 - llm_api - DEBUG - Generated chunk: allows ...
214
- 2025-01-14 12:34:46,072 - llm_api - DEBUG - Generated chunk: ...
215
- 2025-01-14 12:34:46,139 - llm_api - DEBUG - Generated chunk: it. ...
216
- 2025-01-14 12:34:46,207 - llm_api - DEBUG - Generated chunk: This ...
217
- 2025-01-14 12:34:46,273 - llm_api - DEBUG - Generated chunk: includes ...
218
- 2025-01-14 12:34:46,341 - llm_api - DEBUG - Generated chunk: ...
219
- 2025-01-14 12:34:46,408 - llm_api - DEBUG - Generated chunk: interpreting ...
220
- 2025-01-14 12:34:46,473 - llm_api - DEBUG - Generated chunk: text ...
221
- 2025-01-14 12:34:46,539 - llm_api - DEBUG - Generated chunk: inputs ...
222
- 2025-01-14 12:34:46,605 - llm_api - DEBUG - Generated chunk: from ...
223
- 2025-01-14 12:34:46,670 - llm_api - DEBUG - Generated chunk: various ...
224
- 2025-01-14 12:34:46,735 - llm_api - DEBUG - Generated chunk: sources ...
225
- 2025-01-14 12:34:46,801 - llm_api - DEBUG - Generated chunk: such ...
226
- 2025-01-14 12:34:46,867 - llm_api - DEBUG - Generated chunk: as ...
227
- 2025-01-14 12:34:46,935 - llm_api - DEBUG - Generated chunk: ...
228
- 2025-01-14 12:34:47,005 - llm_api - DEBUG - Generated chunk: websites, ...
229
- 2025-01-14 12:34:47,073 - llm_api - DEBUG - Generated chunk: ...
230
- 2025-01-14 12:34:47,142 - llm_api - DEBUG - Generated chunk: books, ...
231
- 2025-01-14 12:34:47,208 - llm_api - DEBUG - Generated chunk: articles ...
232
- 2025-01-14 12:34:47,275 - llm_api - DEBUG - Generated chunk: ...
233
- 2025-01-14 12:34:47,341 - llm_api - DEBUG - Generated chunk: etc., ...
234
- 2025-01-14 12:34:47,408 - llm_api - DEBUG - Generated chunk: in ...
235
- 2025-01-14 12:34:47,472 - llm_api - DEBUG - Generated chunk: multiple ...
236
- 2025-01-14 12:34:47,536 - llm_api - DEBUG - Generated chunk: formats ...
237
- 2025-01-14 12:34:47,600 - llm_api - DEBUG - Generated chunk: including ...
238
- 2025-01-14 12:34:47,674 - llm_api - DEBUG - Generated chunk: ...
239
- 2025-01-14 12:34:47,744 - llm_api - DEBUG - Generated chunk: English, ...
240
- 2025-01-14 12:34:47,814 - llm_api - DEBUG - Generated chunk: ...
241
- 2025-01-14 12:34:47,901 - llm_api - DEBUG - Generated chunk: Spanish, ...
242
- 2025-01-14 12:34:47,991 - llm_api - DEBUG - Generated chunk: French ...
243
- 2025-01-14 12:34:48,066 - llm_api - DEBUG - Generated chunk: among ...
244
- 2025-01-14 12:34:48,131 - llm_api - DEBUG - Generated chunk: ...
245
- 2025-01-14 12:34:48,194 - llm_api - DEBUG - Generated chunk: others. ...
246
- 2025-01-14 12:34:48,259 - llm_api - DEBUG - Generated chunk:
247
- ...
248
- 2025-01-14 12:34:48,325 - llm_api - DEBUG - Generated chunk:
249
- ...
250
- 2025-01-14 12:34:48,390 - llm_api - DEBUG - Generated chunk: ...
251
- 2025-01-14 12:34:48,454 - llm_api - DEBUG - Generated chunk: ...
252
- 2025-01-14 12:34:48,519 - llm_api - DEBUG - Generated chunk: 2. ...
253
- 2025-01-14 12:34:48,583 - llm_api - DEBUG - Generated chunk: ...
254
- 2025-01-14 12:34:48,647 - llm_api - DEBUG - Generated chunk: **Text ...
255
- 2025-01-14 12:34:48,711 - llm_api - DEBUG - Generated chunk: Generation ...
256
- 2025-01-14 12:34:48,773 - llm_api - DEBUG - Generated chunk: & ...
257
- 2025-01-14 12:34:48,837 - llm_api - DEBUG - Generated chunk: ...
258
- 2025-01-14 12:34:48,902 - llm_api - DEBUG - Generated chunk: ...
259
- 2025-01-14 12:34:48,965 - llm_api - DEBUG - Generated chunk: ...
260
- 2025-01-14 12:34:49,031 - llm_api - DEBUG - Generated chunk: ...
261
- 2025-01-14 12:34:49,096 - llm_api - DEBUG - Generated chunk: ...
262
- 2025-01-14 12:34:49,160 - llm_api - DEBUG - Generated chunk: Comprehension**: ...
263
- 2025-01-14 12:34:49,224 - llm_api - DEBUG - Generated chunk: Based ...
264
- 2025-01-14 12:34:49,289 - llm_api - DEBUG - Generated chunk: on ...
265
- 2025-01-14 12:34:49,353 - llm_api - DEBUG - Generated chunk: patterns ...
266
- 2025-01-14 12:34:49,419 - llm_api - DEBUG - Generated chunk: learned ...
267
- 2025-01-14 12:34:49,488 - llm_api - DEBUG - Generated chunk: during ...
268
- 2025-01-14 12:34:49,555 - llm_api - DEBUG - Generated chunk: training ...
269
- 2025-01-14 12:34:49,621 - llm_api - DEBUG - Generated chunk: with ...
270
- 2025-01-14 12:34:49,686 - llm_api - DEBUG - Generated chunk: diverse ...
271
- 2025-01-14 12:34:49,750 - llm_api - DEBUG - Generated chunk: internet ...
272
- 2025-01-14 12:34:49,814 - llm_api - DEBUG - Generated chunk: texts ...
273
- 2025-01-14 12:34:49,879 - llm_api - DEBUG - Generated chunk: data ...
274
- 2025-01-14 12:34:49,944 - llm_api - DEBUG - Generated chunk: ...
275
- 2025-01-14 12:34:50,009 - llm_api - DEBUG - Generated chunk: sets, ...
276
- 2025-01-14 12:34:50,074 - llm_api - DEBUG - Generated chunk: I ...
277
- 2025-01-14 12:34:50,140 - llm_api - DEBUG - Generated chunk: can ...
278
- 2025-01-14 12:34:50,204 - llm_api - DEBUG - Generated chunk: generate ...
279
- 2025-01-14 12:34:50,269 - llm_api - DEBUG - Generated chunk: ...
280
- 2025-01-14 12:34:50,336 - llm_api - DEBUG - Generated chunk: ...
281
- 2025-01-14 12:34:50,401 - llm_api - DEBUG - Generated chunk: coherent ...
282
- 2025-01-14 12:34:50,466 - llm_api - DEBUG - Generated chunk: responses ...
283
- 2025-01-14 12:34:50,530 - llm_api - DEBUG - Generated chunk: that ...
284
- 2025-01-14 12:34:50,595 - llm_api - DEBUG - Generated chunk: ...
285
- 2025-01-14 12:34:50,658 - llm_api - DEBUG - Generated chunk: ...
286
- 2025-01-14 12:34:50,722 - llm_api - DEBUG - Generated chunk: mimic ...
287
- 2025-01-14 12:34:50,787 - llm_api - DEBUG - Generated chunk: how ...
288
- 2025-01-14 12:34:50,852 - llm_api - DEBUG - Generated chunk: real ...
289
- 2025-01-14 12:34:50,916 - llm_api - DEBUG - Generated chunk: people ...
290
- 2025-01-14 12:34:50,980 - llm_api - DEBUG - Generated chunk: ...
291
- 2025-01-14 12:34:51,043 - llm_api - DEBUG - Generated chunk: write. ...
292
- 2025-01-14 12:34:51,107 - llm_api - DEBUG - Generated chunk: ...
293
- 2025-01-14 12:34:51,171 - llm_api - DEBUG - Generated chunk: However, ...
294
- 2025-01-14 12:34:51,236 - llm_api - DEBUG - Generated chunk: please ...
295
- 2025-01-14 12:34:51,299 - llm_api - DEBUG - Generated chunk: note ...
296
- 2025-01-14 12:34:51,364 - llm_api - DEBUG - Generated chunk: these ...
297
- 2025-01-14 12:34:51,428 - llm_api - DEBUG - Generated chunk: generated ...
298
- 2025-01-14 12:34:51,491 - llm_api - DEBUG - Generated chunk: outputs ...
299
- 2025-01-14 12:34:51,556 - llm_api - DEBUG - Generated chunk: ...
300
- 2025-01-14 12:34:51,620 - llm_api - DEBUG - Generated chunk: ...
301
- 2025-01-14 12:34:51,685 - llm_api - DEBUG - Generated chunk: aren’t ...
302
- 2025-01-14 12:34:51,748 - llm_api - DEBUG - Generated chunk: perfect ...
303
- 2025-01-14 12:34:51,813 - llm_api - DEBUG - Generated chunk: nor ...
304
- 2025-01-14 12:34:51,877 - llm_api - DEBUG - Generated chunk: fully ...
305
- 2025-01-14 12:34:51,940 - llm_api - DEBUG - Generated chunk: accurate ...
306
- 2025-01-14 12:34:52,005 - llm_api - DEBUG - Generated chunk: but ...
307
- 2025-01-14 12:34:52,068 - llm_api - DEBUG - Generated chunk: they ...
308
- 2025-01-14 12:34:52,131 - llm_api - DEBUG - Generated chunk: often ...
309
- 2025-01-14 12:34:52,196 - llm_api - DEBUG - Generated chunk: make ...
310
- 2025-01-14 12:34:52,260 - llm_api - DEBUG - Generated chunk: sense ...
311
- 2025-01-14 12:34:52,324 - llm_api - DEBUG - Generated chunk: ...
312
- 2025-01-14 12:34:52,388 - llm_api - DEBUG - Generated chunk: contextually ...
313
- 2025-01-14 12:34:52,451 - llm_api - DEBUG - Generated chunk: within ...
314
- 2025-01-14 12:34:52,516 - llm_api - DEBUG - Generated chunk: given ...
315
- 2025-01-14 12:34:52,579 - llm_api - DEBUG - Generated chunk: ...
316
- 2025-01-14 12:34:52,643 - llm_api - DEBUG - Generated chunk: ...
317
- 2025-01-14 12:34:52,707 - llm_api - DEBUG - Generated chunk: prompts.
318
- ...
319
- 2025-01-14 12:34:52,771 - llm_api - DEBUG - Generated chunk:
320
- ...
321
- 2025-01-14 12:34:52,835 - llm_api - DEBUG - Generated chunk: ...
322
- 2025-01-14 12:34:52,899 - llm_api - DEBUG - Generated chunk: ...
323
- 2025-01-14 12:34:52,963 - llm_api - DEBUG - Generated chunk: 3. ...
324
- 2025-01-14 12:34:53,026 - llm_api - DEBUG - Generated chunk: ...
325
- 2025-01-14 12:34:53,095 - llm_api - DEBUG - Generated chunk: ...
326
- 2025-01-14 12:34:53,168 - llm_api - DEBUG - Generated chunk: ...
327
- 2025-01-14 12:34:53,237 - llm_api - DEBUG - Generated chunk: **Knowledge ...
328
- 2025-01-14 12:34:53,302 - llm_api - DEBUG - Generated chunk: Base ...
329
- 2025-01-14 12:34:53,371 - llm_api - DEBUG - Generated chunk: ...
330
- 2025-01-14 12:34:53,440 - llm_api - DEBUG - Generated chunk: ...
331
- 2025-01-14 12:34:53,508 - llm_api - DEBUG - Generated chunk: ...
332
- 2025-01-14 12:34:53,576 - llm_api - DEBUG - Generated chunk: Accessing**: ...
333
- 2025-01-14 12:34:53,642 - llm_api - DEBUG - Generated chunk: Although ...
334
- 2025-01-14 12:34:53,710 - llm_api - DEBUG - Generated chunk: not ...
335
- 2025-01-14 12:34:53,775 - llm_api - DEBUG - Generated chunk: connected ...
336
- 2025-01-14 12:34:53,838 - llm_api - DEBUG - Generated chunk: live ...
337
- 2025-01-14 12:34:53,903 - llm_api - DEBUG - Generated chunk: for ...
338
- 2025-01-14 12:34:53,968 - llm_api - DEBUG - Generated chunk: ...
339
- 2025-01-14 12:34:54,032 - llm_api - DEBUG - Generated chunk: browsing ...
340
- 2025-01-14 12:34:54,096 - llm_api - DEBUG - Generated chunk: external ...
341
- 2025-01-14 12:34:54,159 - llm_api - DEBUG - Generated chunk: databases ...
342
- 2025-01-14 12:34:54,224 - llm_api - DEBUG - Generated chunk: at ...
343
- 2025-01-14 12:34:54,287 - llm_api - DEBUG - Generated chunk: this ...
344
- 2025-01-14 12:34:54,351 - llm_api - DEBUG - Generated chunk: ...
345
- 2025-01-14 12:34:54,414 - llm_api - DEBUG - Generated chunk: moment, ...
346
- 2025-01-14 12:34:54,478 - llm_api - DEBUG - Generated chunk: information ...
347
- 2025-01-14 12:34:54,542 - llm_api - DEBUG - Generated chunk: up ...
348
- 2025-01-14 12:34:54,606 - llm_api - DEBUG - Generated chunk: until ...
349
- 2025-01-14 12:34:54,670 - llm_api - DEBUG - Generated chunk: September ...
350
- 2025-01-14 12:34:54,735 - llm_api - DEBUG - Generated chunk: ...
351
- 2025-01-14 12:34:54,799 - llm_api - DEBUG - Generated chunk: ...
352
- 2025-01-14 12:34:54,864 - llm_api - DEBUG - Generated chunk: ...
353
- 2025-01-14 12:34:54,928 - llm_api - DEBUG - Generated chunk: ...
354
- 2025-01-14 12:34:54,992 - llm_api - DEBUG - Generated chunk: 2021 ...
355
- 2025-01-14 12:34:55,056 - llm_api - DEBUG - Generated chunk: has ...
356
- 2025-01-14 12:34:55,120 - llm_api - DEBUG - Generated chunk: been ...
357
- 2025-01-14 12:34:55,184 - llm_api - DEBUG - Generated chunk: used ...
358
- 2025-01-14 12:34:55,249 - llm_api - DEBUG - Generated chunk: when ...
359
- 2025-01-14 12:34:55,313 - llm_api - DEBUG - Generated chunk: generating ...
360
- 2025-01-14 12:34:55,376 - llm_api - DEBUG - Generated chunk: answers ...
361
- 2025-01-14 12:34:55,441 - llm_api - DEBUG - Generated chunk: based ...
362
- 2025-01-14 12:34:55,504 - llm_api - DEBUG - Generated chunk: upon ...
363
- 2025-01-14 12:34:55,568 - llm_api - DEBUG - Generated chunk: extensive ...
364
- 2025-01-14 12:34:55,632 - llm_api - DEBUG - Generated chunk: datasets ...
365
- 2025-01-14 12:34:55,696 - llm_api - DEBUG - Generated chunk: which ...
366
- 2025-01-14 12:34:55,760 - llm_api - DEBUG - Generated chunk: include ...
367
- 2025-01-14 12:34:55,825 - llm_api - DEBUG - Generated chunk: facts ...
368
- 2025-01-14 12:34:55,889 - llm_api - DEBUG - Generated chunk: known ...
369
- 2025-01-14 12:34:55,953 - llm_api - DEBUG - Generated chunk: till ...
370
- 2025-01-14 12:34:56,017 - llm_api - DEBUG - Generated chunk: then ...
371
- 2025-01-14 12:34:56,082 - llm_api - DEBUG - Generated chunk: across ...
372
- 2025-01-14 12:34:56,146 - llm_api - DEBUG - Generated chunk: numerous ...
373
- 2025-01-14 12:34:56,210 - llm_api - DEBUG - Generated chunk: topics ...
374
- 2025-01-14 12:34:56,281 - llm_api - DEBUG - Generated chunk: ...
375
- 2025-01-14 12:34:56,346 - llm_api - DEBUG - Generated chunk: ranging ...
376
- 2025-01-14 12:34:56,410 - llm_api - DEBUG - Generated chunk: from ...
377
- 2025-01-14 12:34:56,474 - llm_api - DEBUG - Generated chunk: science ...
378
- 2025-01-14 12:34:56,537 - llm_api - DEBUG - Generated chunk: to ...
379
- 2025-01-14 12:34:56,631 - llm_api - DEBUG - Generated chunk: ...
380
- 2025-01-14 12:34:56,730 - llm_api - DEBUG - Generated chunk: arts, ...
381
- 2025-01-14 12:34:56,818 - llm_api - DEBUG - Generated chunk: ...
382
- 2025-01-14 12:34:56,886 - llm_api - DEBUG - Generated chunk: history, ...
383
- 2025-01-14 12:34:56,951 - llm_api - DEBUG - Generated chunk: ...
384
- 2025-01-14 12:34:57,016 - llm_api - DEBUG - Generated chunk: culture, ...
385
- 2025-01-14 12:34:57,080 - llm_api - DEBUG - Generated chunk: sports ...
386
- 2025-01-14 12:34:57,145 - llm_api - DEBUG - Generated chunk: amongst ...
387
- 2025-01-14 12:34:57,209 - llm_api - DEBUG - Generated chunk: many ...
388
- 2025-01-14 12:34:57,273 - llm_api - DEBUG - Generated chunk: other ...
389
- 2025-01-14 12:34:57,337 - llm_api - DEBUG - Generated chunk: ...
390
- 2025-01-14 12:34:57,401 - llm_api - DEBUG - Generated chunk: fields.
391
- ...
392
- 2025-01-14 12:34:57,467 - llm_api - DEBUG - Generated chunk:
393
- ...
394
- 2025-01-14 12:34:57,531 - llm_api - DEBUG - Generated chunk: ...
395
- 2025-01-14 12:34:57,595 - llm_api - DEBUG - Generated chunk: ...
396
- 2025-01-14 12:34:57,659 - llm_api - DEBUG - Generated chunk: 4. ...
397
- 2025-01-14 12:34:57,723 - llm_api - DEBUG - Generated chunk: ...
398
- 2025-01-14 12:34:57,786 - llm_api - DEBUG - Generated chunk: **Problem ...
399
- 2025-01-14 12:34:57,850 - llm_api - DEBUG - Generated chunk: ...
400
- 2025-01-14 12:34:57,914 - llm_api - DEBUG - Generated chunk: Solving ...
401
- 2025-01-14 12:34:57,978 - llm_api - DEBUG - Generated chunk: ...
402
- 2025-01-14 12:34:58,042 - llm_api - DEBUG - Generated chunk: ...
403
- 2025-01-14 12:34:58,106 - llm_api - DEBUG - Generated chunk: ...
404
- 2025-01-14 12:34:58,168 - llm_api - DEBUG - Generated chunk: Skills**: ...
405
- 2025-01-14 12:34:58,231 - llm_api - DEBUG - Generated chunk: In ...
406
- 2025-01-14 12:34:58,294 - llm_api - DEBUG - Generated chunk: certain ...
407
- 2025-01-14 12:34:58,357 - llm_api - DEBUG - Generated chunk: scenarios ...
408
- 2025-01-14 12:34:58,421 - llm_api - DEBUG - Generated chunk: where ...
409
- 2025-01-14 12:34:58,484 - llm_api - DEBUG - Generated chunk: logical ...
410
- 2025-01-14 12:34:58,549 - llm_api - DEBUG - Generated chunk: reasoning ...
411
- 2025-01-14 12:34:58,612 - llm_api - DEBUG - Generated chunk: might ...
412
- 2025-01-14 12:34:58,676 - llm_api - DEBUG - Generated chunk: be ...
413
- 2025-01-14 12:34:58,739 - llm_api - DEBUG - Generated chunk: required ...
414
- 2025-01-14 12:34:58,802 - llm_api - DEBUG - Generated chunk: ...
415
- 2025-01-14 12:34:58,867 - llm_api - DEBUG - Generated chunk: (like ...
416
- 2025-01-14 12:34:58,930 - llm_api - DEBUG - Generated chunk: math ...
417
- 2025-01-14 12:34:58,994 - llm_api - DEBUG - Generated chunk: ...
418
- 2025-01-14 12:34:59,057 - llm_api - DEBUG - Generated chunk: problems), ...
419
- 2025-01-14 12:34:59,120 - llm_api - DEBUG - Generated chunk: algorithms ...
420
- 2025-01-14 12:34:59,184 - llm_api - DEBUG - Generated chunk: enable ...
421
- 2025-01-14 12:34:59,248 - llm_api - DEBUG - Generated chunk: ...
422
- 2025-01-14 12:34:59,312 - llm_api - DEBUG - Generated chunk: ...
423
- 2025-01-14 12:34:59,376 - llm_api - DEBUG - Generated chunk: ...
424
- 2025-01-14 12:34:59,439 - llm_api - DEBUG - Generated chunk: problem-solving ...
425
- 2025-01-14 12:34:59,503 - llm_api - DEBUG - Generated chunk: ...
426
- 2025-01-14 12:34:59,567 - llm_api - DEBUG - Generated chunk: abilities ...
427
- 2025-01-14 12:34:59,631 - llm_api - DEBUG - Generated chunk: similar ...
428
- 2025-01-14 12:34:59,695 - llm_api - DEBUG - Generated chunk: those ...
429
- 2025-01-14 12:34:59,758 - llm_api - DEBUG - Generated chunk: found ...
430
- 2025-01-14 12:34:59,822 - llm_api - DEBUG - Generated chunk: commonly ...
431
- 2025-01-14 12:34:59,884 - llm_api - DEBUG - Generated chunk: seen ...
432
- 2025-01-14 12:34:59,948 - llm_api - DEBUG - Generated chunk: in ...
433
- 2025-01-14 12:35:00,011 - llm_api - DEBUG - Generated chunk: ...
434
- 2025-01-14 12:35:00,075 - llm_api - DEBUG - Generated chunk: calculators ...
435
- 2025-01-14 12:35:00,139 - llm_api - DEBUG - Generated chunk: yet ...
436
- 2025-01-14 12:35:00,202 - llm_api - DEBUG - Generated chunk: without ...
437
- 2025-01-14 12:35:00,266 - llm_api - DEBUG - Generated chunk: any ...
438
- 2025-01-14 12:35:00,329 - llm_api - DEBUG - Generated chunk: direct ...
439
- 2025-01-14 12:35:00,393 - llm_api - DEBUG - Generated chunk: interaction ...
440
- 2025-01-14 12:35:00,458 - llm_api - DEBUG - Generated chunk: beyond ...
441
- 2025-01-14 12:35:00,523 - llm_api - DEBUG - Generated chunk: what ...
442
- 2025-01-14 12:35:00,590 - llm_api - DEBUG - Generated chunk: was ...
443
- 2025-01-14 12:35:00,656 - llm_api - DEBUG - Generated chunk: provided ...
444
- 2025-01-14 12:35:00,721 - llm_api - DEBUG - Generated chunk: initially ...
445
- 2025-01-14 12:35:00,786 - llm_api - DEBUG - Generated chunk: into ...
446
- 2025-01-14 12:35:00,850 - llm_api - DEBUG - Generated chunk: them ...
447
- 2025-01-14 12:35:00,916 - llm_api - DEBUG - Generated chunk: - ...
448
- 2025-01-14 12:35:00,980 - llm_api - DEBUG - Generated chunk: no ...
449
- 2025-01-14 12:35:01,045 - llm_api - DEBUG - Generated chunk: memory ...
450
- 2025-01-14 12:35:01,109 - llm_api - DEBUG - Generated chunk: ...
451
- 2025-01-14 12:35:01,173 - llm_api - DEBUG - Generated chunk: retention ...
452
- 2025-01-14 12:35:01,239 - llm_api - DEBUG - Generated chunk: after ...
453
- 2025-01-14 12:35:01,305 - llm_api - DEBUG - Generated chunk: each ...
454
- 2025-01-14 12:35:01,372 - llm_api - DEBUG - Generated chunk: session ...
455
- 2025-01-14 12:35:01,438 - llm_api - DEBUG - Generated chunk: ends ...
456
- 2025-01-14 12:35:01,501 - llm_api - DEBUG - Generated chunk: due ...
457
- 2025-01-14 12:35:01,567 - llm_api - DEBUG - Generated chunk: to ...
458
- 2025-01-14 12:35:01,630 - llm_api - DEBUG - Generated chunk: design ...
459
- 2025-01-14 12:35:01,694 - llm_api - DEBUG - Generated chunk: ...
460
- 2025-01-14 12:35:01,759 - llm_api - DEBUG - Generated chunk: considerations ...
461
- 2025-01-14 12:35:01,823 - llm_api - DEBUG - Generated chunk: around ...
462
- 2025-01-14 12:35:01,887 - llm_api - DEBUG - Generated chunk: ...
463
- 2025-01-14 12:35:01,951 - llm_api - DEBUG - Generated chunk: privacy ...
464
- 2025-01-14 12:35:02,015 - llm_api - DEBUG - Generated chunk: protection ...
465
- 2025-01-14 12:35:02,079 - llm_api - DEBUG - Generated chunk: policies ...
466
- 2025-01-14 12:35:02,142 - llm_api - DEBUG - Generated chunk: followed ...
467
- 2025-01-14 12:35:02,207 - llm_api - DEBUG - Generated chunk: strictly ...
468
- 2025-01-14 12:35:02,271 - llm_api - DEBUG - Generated chunk: ...
469
- 2025-01-14 12:35:02,334 - llm_api - DEBUG - Generated chunk: ...
470
- 2025-01-14 12:35:02,399 - llm_api - DEBUG - Generated chunk: adhered ...
471
- 2025-01-14 12:35:02,462 - llm_api - DEBUG - Generated chunk: ...
472
- 2025-01-14 12:35:02,527 - llm_api - DEBUG - Generated chunk: too.
473
- ...
474
- 2025-01-14 12:35:02,592 - llm_api - DEBUG - Generated chunk:
475
- ...
476
- 2025-01-14 12:35:02,654 - llm_api - DEBUG - Generated chunk: ...
477
- 2025-01-14 12:35:02,716 - llm_api - DEBUG - Generated chunk: ...
478
- 2025-01-14 12:35:02,781 - llm_api - DEBUG - Generated chunk: 5. ...
479
- 2025-01-14 12:35:02,844 - llm_api - DEBUG - Generated chunk: ...
480
- 2025-01-14 12:35:02,908 - llm_api - DEBUG - Generated chunk: ...
481
- 2025-01-14 12:35:02,974 - llm_api - DEBUG - Generated chunk: ...
482
- 2025-01-14 12:35:03,043 - llm_api - DEBUG - Generated chunk: **Learning ...
483
- 2025-01-14 12:35:03,108 - llm_api - DEBUG - Generated chunk: ...
484
- 2025-01-14 12:35:03,173 - llm_api - DEBUG - Generated chunk: ...
485
- 2025-01-14 12:35:03,239 - llm_api - DEBUG - Generated chunk: ...
486
- 2025-01-14 12:35:03,303 - llm_api - DEBUG - Generated chunk: Capabilities**: ...
487
- 2025-01-14 12:35:03,369 - llm_api - DEBUG - Generated chunk: It ...
488
- 2025-01-14 12:35:03,433 - llm_api - DEBUG - Generated chunk: should ...
489
- 2025-01-14 12:35:03,498 - llm_api - DEBUG - Generated chunk: also ...
490
- 2025-01-14 12:35:03,562 - llm_api - DEBUG - Generated chunk: be ...
491
- 2025-01-14 12:35:03,626 - llm_api - DEBUG - Generated chunk: noted ...
492
- 2025-01-14 12:35:03,690 - llm_api - DEBUG - Generated chunk: though ...
493
- 2025-01-14 12:35:03,755 - llm_api - DEBUG - Generated chunk: there ...
494
- 2025-01-14 12:35:03,819 - llm_api - DEBUG - Generated chunk: ...
495
- 2025-01-14 12:35:03,884 - llm_api - DEBUG - Generated chunk: ...
496
- 2025-01-14 12:35:03,949 - llm_api - DEBUG - Generated chunk: isn't ...
497
- 2025-01-14 12:35:04,013 - llm_api - DEBUG - Generated chunk: learning ...
498
- 2025-01-14 12:35:04,078 - llm_api - DEBUG - Generated chunk: happening ...
499
- 2025-01-14 12:35:04,142 - llm_api - DEBUG - Generated chunk: per ...
500
- 2025-01-14 12:35:04,206 - llm_api - DEBUG - Generated chunk: ...
501
- 2025-01-14 12:35:04,270 - llm_api - DEBUG - Generated chunk: se; ...
502
- 2025-01-14 12:35:04,334 - llm_api - DEBUG - Generated chunk: continuous ...
503
- 2025-01-14 12:35:04,399 - llm_api - DEBUG - Generated chunk: improvements ...
504
- 2025-01-14 12:35:04,463 - llm_api - DEBUG - Generated chunk: occur ...
505
- 2025-01-14 12:35:04,528 - llm_api - DEBUG - Generated chunk: through ...
506
- 2025-01-14 12:35:04,593 - llm_api - DEBUG - Generated chunk: updates ...
507
- 2025-01-14 12:35:04,659 - llm_api - DEBUG - Generated chunk: made ...
508
- 2025-01-14 12:35:04,724 - llm_api - DEBUG - Generated chunk: ...
509
- 2025-01-14 12:35:04,790 - llm_api - DEBUG - Generated chunk: periodically ...
510
- 2025-01-14 12:35:04,855 - llm_api - DEBUG - Generated chunk: ...
511
- 2025-01-14 12:35:04,923 - llm_api - DEBUG - Generated chunk: reflective ...
512
- 2025-01-14 12:35:04,993 - llm_api - DEBUG - Generated chunk: ...
513
- 2025-01-14 12:35:05,061 - llm_api - DEBUG - Generated chunk: ...
514
- 2025-01-14 12:35:05,129 - llm_api - DEBUG - Generated chunk: advancements ...
515
- 2025-01-14 12:35:05,197 - llm_api - DEBUG - Generated chunk: achieved ...
516
- 2025-01-14 12:35:05,265 - llm_api - DEBUG - Generated chunk: over ...
517
- 2025-01-14 12:35:05,333 - llm_api - DEBUG - Generated chunk: time ...
518
- 2025-01-14 12:35:05,400 - llm_api - DEBUG - Generated chunk: via ...
519
- 2025-01-14 12:35:05,467 - llm_api - DEBUG - Generated chunk: machine ...
520
- 2025-01-14 12:35:05,535 - llm_api - DEBUG - Generated chunk: learning ...
521
- 2025-01-14 12:35:05,603 - llm_api - DEBUG - Generated chunk: techniques ...
522
- 2025-01-14 12:35:05,672 - llm_api - DEBUG - Generated chunk: applied ...
523
- 2025-01-14 12:35:05,742 - llm_api - DEBUG - Generated chunk: ...
524
- 2025-01-14 12:35:05,812 - llm_api - DEBUG - Generated chunk: systematically ...
525
- 2025-01-14 12:35:05,881 - llm_api - DEBUG - Generated chunk: throughout ...
526
- 2025-01-14 12:35:05,951 - llm_api - DEBUG - Generated chunk: development ...
527
- 2025-01-14 12:35:06,022 - llm_api - DEBUG - Generated chunk: phases ...
528
- 2025-01-14 12:35:06,091 - llm_api - DEBUG - Generated chunk: ...
529
- 2025-01-14 12:35:06,162 - llm_api - DEBUG - Generated chunk: aimed ...
530
- 2025-01-14 12:35:06,231 - llm_api - DEBUG - Generated chunk: towards ...
531
- 2025-01-14 12:35:06,301 - llm_api - DEBUG - Generated chunk: ...
532
- 2025-01-14 12:35:06,370 - llm_api - DEBUG - Generated chunk: ...
533
- 2025-01-14 12:35:06,439 - llm_api - DEBUG - Generated chunk: enhancing ...
534
- 2025-01-14 12:35:06,505 - llm_api - DEBUG - Generated chunk: performance ...
535
- 2025-01-14 12:35:06,569 - llm_api - DEBUG - Generated chunk: ...
536
- 2025-01-14 12:35:06,634 - llm_api - DEBUG - Generated chunk: consistently ...
537
- 2025-01-14 12:35:06,697 - llm_api - DEBUG - Generated chunk: while ...
538
- 2025-01-14 12:35:06,761 - llm_api - DEBUG - Generated chunk: ...
539
- 2025-01-14 12:35:06,825 - llm_api - DEBUG - Generated chunk: maintaining ...
540
- 2025-01-14 12:35:06,888 - llm_api - DEBUG - Generated chunk: user ...
541
- 2025-01-14 12:35:06,952 - llm_api - DEBUG - Generated chunk: trust ...
542
- 2025-01-14 12:35:07,017 - llm_api - DEBUG - Generated chunk: simultaneously ...
543
- 2025-01-14 12:35:07,081 - llm_api - DEBUG - Generated chunk: ...
544
- 2025-01-14 12:35:07,145 - llm_api - DEBUG - Generated chunk: ensuring ...
545
- 2025-01-14 12:35:07,210 - llm_api - DEBUG - Generated chunk: ...
546
- 2025-01-14 12:35:07,273 - llm_api - DEBUG - Generated chunk: ethical ...
547
- 2025-01-14 12:35:07,339 - llm_api - DEBUG - Generated chunk: ...
548
- 2025-01-14 12:35:07,406 - llm_api - DEBUG - Generated chunk: ...
549
- 2025-01-14 12:35:07,472 - llm_api - DEBUG - Generated chunk: guidelines ...
550
- 2025-01-14 12:35:07,539 - llm_api - DEBUG - Generated chunk: remain ...
551
- 2025-01-14 12:35:07,606 - llm_api - DEBUG - Generated chunk: ...
552
- 2025-01-14 12:35:07,673 - llm_api - DEBUG - Generated chunk: ...
553
- 2025-01-14 12:35:07,740 - llm_api - DEBUG - Generated chunk: ...
554
- 2025-01-14 12:35:07,806 - llm_api - DEBUG - Generated chunk: uncompromised ...
555
- 2025-01-14 12:35:07,871 - llm_api - DEBUG - Generated chunk: always ...
556
- 2025-01-14 12:35:07,938 - llm_api - DEBUG - Generated chunk: ...
557
- 2025-01-14 12:35:08,004 - llm_api - DEBUG - Generated chunk: ...
558
- 2025-01-14 12:35:08,070 - llm_api - DEBUG - Generated chunk: prioritized ...
559
- 2025-01-14 12:35:08,136 - llm_api - DEBUG - Generated chunk: above ...
560
- 2025-01-14 12:35:08,203 - llm_api - DEBUG - Generated chunk: all ...
561
- 2025-01-14 12:35:08,270 - llm_api - DEBUG - Generated chunk: ...
562
- 2025-01-14 12:35:08,337 - llm_api - DEBUG - Generated chunk: else!
563
- ...
564
- 2025-01-14 12:35:08,404 - llm_api - DEBUG - Generated chunk:
565
- ...
566
- 2025-01-14 12:35:08,470 - llm_api - DEBUG - Generated chunk: ...
567
- 2025-01-14 12:35:08,535 - llm_api - DEBUG - Generated chunk: ...
568
- 2025-01-14 12:35:08,602 - llm_api - DEBUG - Generated chunk: 6. ...
569
- 2025-01-14 12:35:08,667 - llm_api - DEBUG - Generated chunk: ...
570
- 2025-01-14 12:35:08,731 - llm_api - DEBUG - Generated chunk: ...
571
- 2025-01-14 12:35:08,796 - llm_api - DEBUG - Generated chunk: ...
572
- 2025-01-14 12:35:08,860 - llm_api - DEBUG - Generated chunk: **Multilingual ...
573
- 2025-01-14 12:35:08,927 - llm_api - DEBUG - Generated chunk: ...
574
- 2025-01-14 12:35:08,990 - llm_api - DEBUG - Generated chunk: ...
575
- 2025-01-14 12:35:09,054 - llm_api - DEBUG - Generated chunk: Support**: ...
576
- 2025-01-14 12:35:09,119 - llm_api - DEBUG - Generated chunk: As ...
577
- 2025-01-14 12:35:09,184 - llm_api - DEBUG - Generated chunk: mentioned ...
578
- 2025-01-14 12:35:09,248 - llm_api - DEBUG - Generated chunk: earlier ...
579
- 2025-01-14 12:35:09,311 - llm_api - DEBUG - Generated chunk: regarding ...
580
- 2025-01-14 12:35:09,375 - llm_api - DEBUG - Generated chunk: Language ...
581
- 2025-01-14 12:35:09,439 - llm_api - DEBUG - Generated chunk: ...
582
- 2025-01-14 12:35:09,503 - llm_api - DEBUG - Generated chunk: comprehension ...
583
- 2025-01-14 12:35:09,569 - llm_api - DEBUG - Generated chunk: skills ...
584
- 2025-01-14 12:35:09,633 - llm_api - DEBUG - Generated chunk: – ...
585
- 2025-01-14 12:35:09,698 - llm_api - DEBUG - Generated chunk: one ...
586
- 2025-01-14 12:35:09,762 - llm_api - DEBUG - Generated chunk: significant ...
587
- 2025-01-14 12:35:09,825 - llm_api - DEBUG - Generated chunk: advantage ...
588
- 2025-01-14 12:35:09,890 - llm_api - DEBUG - Generated chunk: here ...
589
- 2025-01-14 12:35:09,953 - llm_api - DEBUG - Generated chunk: lies ...
590
- 2025-01-14 12:35:10,019 - llm_api - DEBUG - Generated chunk: ...
591
- 2025-01-14 12:35:10,084 - llm_api - DEBUG - Generated chunk: ...
592
- 2025-01-14 12:35:10,149 - llm_api - DEBUG - Generated chunk: multilanguage ...
593
- 2025-01-14 12:35:10,213 - llm_api - DEBUG - Generated chunk: support ...
594
- 2025-01-14 12:35:10,278 - llm_api - DEBUG - Generated chunk: allowing ...
595
- 2025-01-14 12:35:10,342 - llm_api - DEBUG - Generated chunk: users ...
596
- 2025-01-14 12:35:10,408 - llm_api - DEBUG - Generated chunk: ...
597
- 2025-01-14 12:35:10,472 - llm_api - DEBUG - Generated chunk: worldwide ...
598
- 2025-01-14 12:35:10,537 - llm_api - DEBUG - Generated chunk: ...
599
- 2025-01-14 12:35:10,603 - llm_api - DEBUG - Generated chunk: ...
600
- 2025-01-14 12:35:10,669 - llm_api - DEBUG - Generated chunk: irrespective ...
601
- 2025-01-14 12:35:10,735 - llm_api - DEBUG - Generated chunk: ...
602
- 2025-01-14 12:35:10,801 - llm_api - DEBUG - Generated chunk: geographical ...
603
- 2025-01-14 12:35:10,866 - llm_api - DEBUG - Generated chunk: location ...
604
- 2025-01-14 12:35:10,931 - llm_api - DEBUG - Generated chunk: ...
605
- 2025-01-14 12:35:10,996 - llm_api - DEBUG - Generated chunk: accessibility ...
606
- 2025-01-14 12:35:11,061 - llm_api - DEBUG - Generated chunk: ...
607
- 2025-01-14 12:35:11,125 - llm_api - DEBUG - Generated chunk: facilitated ...
608
- 2025-01-14 12:35:11,190 - llm_api - DEBUG - Generated chunk: smooth ...
609
- 2025-01-14 12:35:11,255 - llm_api - DEBUG - Generated chunk: communication ...
610
- 2025-01-14 12:35:11,327 - llm_api - DEBUG - Generated chunk: experience ...
611
- 2025-01-14 12:35:11,393 - llm_api - DEBUG - Generated chunk: ...
612
- 2025-01-14 12:35:11,460 - llm_api - DEBUG - Generated chunk: fostering ...
613
- 2025-01-14 12:35:11,527 - llm_api - DEBUG - Generated chunk: ...
614
- 2025-01-14 12:35:11,593 - llm_api - DEBUG - Generated chunk: inclusivity ...
615
- 2025-01-14 12:35:11,699 - llm_api - DEBUG - Generated chunk: ...
616
- 2025-01-14 12:35:11,794 - llm_api - DEBUG - Generated chunk: globally ...
617
- 2025-01-14 12:35:11,887 - llm_api - DEBUG - Generated chunk: ...
618
- 2025-01-14 12:35:11,960 - llm_api - DEBUG - Generated chunk: promoting ...
619
- 2025-01-14 12:35:12,034 - llm_api - DEBUG - Generated chunk: cross ...
620
- 2025-01-14 12:35:12,113 - llm_api - DEBUG - Generated chunk: cultural ...
621
- 2025-01-14 12:35:12,184 - llm_api - DEBUG - Generated chunk: exchange ...
622
- 2025-01-14 12:35:12,252 - llm_api - DEBUG - Generated chunk: effectively ...
623
- 2025-01-14 12:35:12,331 - llm_api - DEBUG - Generated chunk: ...
624
- 2025-01-14 12:35:12,401 - llm_api - DEBUG - Generated chunk: bridging ...
625
- 2025-01-14 12:35:12,467 - llm_api - DEBUG - Generated chunk: ...
626
- 2025-01-14 12:35:12,532 - llm_api - DEBUG - Generated chunk: gaps ...
627
- 2025-01-14 12:35:12,598 - llm_api - DEBUG - Generated chunk: between ...
628
- 2025-01-14 12:35:12,662 - llm_api - DEBUG - Generated chunk: different ...
629
- 2025-01-14 12:35:12,726 - llm_api - DEBUG - Generated chunk: communities ...
630
- 2025-01-14 12:35:12,792 - llm_api - DEBUG - Generated chunk: everywhere ...
631
- 2025-01-14 12:35:12,857 - llm_api - DEBUG - Generated chunk: creating ...
632
- 2025-01-14 12:35:12,922 - llm_api - DEBUG - Generated chunk: ...
633
- 2025-01-14 12:35:12,986 - llm_api - DEBUG - Generated chunk: opportunities ...
634
- 2025-01-14 12:35:13,053 - llm_api - DEBUG - Generated chunk: connecting ...
635
- 2025-01-14 12:35:13,120 - llm_api - DEBUG - Generated chunk: hearts ...
636
- 2025-01-14 12:35:13,186 - llm_api - DEBUG - Generated chunk: minds ...
637
- 2025-01-14 12:35:13,252 - llm_api - DEBUG - Generated chunk: together ...
638
- 2025-01-14 12:35:13,319 - llm_api - DEBUG - Generated chunk: ...
639
- 2025-01-14 12:35:13,385 - llm_api - DEBUG - Generated chunk: ...
640
- 2025-01-14 12:35:13,450 - llm_api - DEBUG - Generated chunk: harmoniously ...
641
- 2025-01-14 12:35:13,516 - llm_api - DEBUG - Generated chunk: ...
642
- 2025-01-14 12:35:13,583 - llm_api - DEBUG - Generated chunk: ...
643
- 2025-01-14 12:35:13,648 - llm_api - DEBUG - Generated chunk: transcending ...
644
- 2025-01-14 12:35:13,713 - llm_api - DEBUG - Generated chunk: boundaries ...
645
- 2025-01-14 12:35:13,779 - llm_api - DEBUG - Generated chunk: ...
646
- 2025-01-14 12:35:13,844 - llm_api - DEBUG - Generated chunk: effortlessly ...
647
- 2025-01-14 12:35:13,909 - llm_api - DEBUG - Generated chunk: breaking ...
648
- 2025-01-14 12:35:13,975 - llm_api - DEBUG - Generated chunk: down ...
649
- 2025-01-14 12:35:14,042 - llm_api - DEBUG - Generated chunk: walls ...
650
- 2025-01-14 12:35:14,111 - llm_api - DEBUG - Generated chunk: ...
651
- 2025-01-14 12:35:14,176 - llm_api - DEBUG - Generated chunk: silencing ...
652
- 2025-01-14 12:35:14,242 - llm_api - DEBUG - Generated chunk: voices ...
653
- 2025-01-14 12:35:14,307 - llm_api - DEBUG - Generated chunk: ...
654
- 2025-01-14 12:35:14,372 - llm_api - DEBUG - Generated chunk: suppressions ...
655
- 2025-01-14 12:35:14,438 - llm_api - DEBUG - Generated chunk: fear ...
656
- 2025-01-14 12:35:14,503 - llm_api - DEBUG - Generated chunk: ...
657
- 2025-01-14 12:35:14,569 - llm_api - DEBUG - Generated chunk: dividing ...
658
- 2025-01-14 12:35:14,570 - llm_api - DEBUG - Generated chunk: us...
659
- 2025-01-14 12:35:14,570 - llm_api - INFO - INFO: 127.0.0.1:37118 - "POST /api/v1/generate/stream HTTP/1.1" 200 OK
660
- 2025-01-14 12:47:53,791 - api_routes - INFO - Received generation request for prompt: Tell me about yourself and your capabilities...
661
- 2025-01-14 12:47:53,791 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself and your capabilities...
662
- 2025-01-14 12:48:27,656 - llm_api - DEBUG - Generated response: I'm Phi, an AI developed by Microsoft. While I don...
663
- 2025-01-14 12:48:27,656 - api_routes - INFO - Successfully generated response
664
- 2025-01-14 12:48:27,656 - llm_api - INFO - INFO: 127.0.0.1:43528 - "POST /api/v1/generate HTTP/1.1" 200 OK
665
- 2025-01-14 12:50:17,735 - api_routes - INFO - Received generation request for prompt: Please analyze this query and create a JSON respon...
666
- 2025-01-14 12:50:17,735 - llm_api - DEBUG - Generating response for prompt: Please analyze this query and create a JSON respon...
667
- 2025-01-14 12:50:31,906 - llm_api - DEBUG - Generated response: ```json
668
- {
669
- "original_query": "Who is Djengis Khan...
670
- 2025-01-14 12:50:31,906 - api_routes - INFO - Successfully generated response
671
- 2025-01-14 12:50:31,906 - llm_api - INFO - INFO: 127.0.0.1:50042 - "POST /api/v1/generate HTTP/1.1" 200 OK
672
- 2025-01-14 13:08:38,951 - api_routes - INFO - Received request to download model: PowerInfer/SmallThinker-3B-Preview
673
- 2025-01-14 13:08:38,951 - llm_api - INFO - Starting download of model: PowerInfer/SmallThinker-3B-Preview
674
- 2025-01-14 13:08:38,951 - llm_api - INFO - Enabling stdout logging for download
675
- 2025-01-14 13:11:52,350 - llm_api - INFO - Disabling stdout logging
676
- 2025-01-14 13:11:52,350 - llm_api - INFO - Saving model to main/models/SmallThinker-3B-Preview
677
- 2025-01-14 13:13:04,420 - llm_api - INFO - Successfully downloaded model: PowerInfer/SmallThinker-3B-Preview
678
- 2025-01-14 13:13:05,175 - api_routes - INFO - Successfully downloaded model: PowerInfer/SmallThinker-3B-Preview
679
- 2025-01-14 13:13:31,469 - api_routes - INFO - Received request to initialize model: PowerInfer/SmallThinker-3B-Preview
680
- 2025-01-14 13:13:31,469 - llm_api - INFO - Initializing generation model: PowerInfer/SmallThinker-3B-Preview
681
- 2025-01-14 13:13:31,472 - llm_api - INFO - Loading model from local path: main/models/SmallThinker-3B-Preview
682
- 2025-01-14 13:13:31,909 - llm_api - ERROR - Failed to initialize generation model PowerInfer/SmallThinker-3B-Preview: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
683
- 2025-01-14 13:13:31,909 - api_routes - ERROR - Error initializing model: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details.
684
- 2025-01-14 13:14:36,924 - main - INFO - LLM API server started on 0.0.0.0:8001
685
- 2025-01-14 13:14:49,486 - hf_validation - WARNING - No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.
686
- 2025-01-14 13:14:49,486 - hf_validation - ERROR - No HF_TOKEN found in environment variables
687
- 2025-01-14 13:14:49,486 - main - INFO - Starting LLM API server
688
- 2025-01-14 13:14:49,486 - llm_api - INFO - Initializing LLM API
689
- 2025-01-14 13:14:49,486 - llm_api - INFO - LLM API initialized successfully
690
- 2025-01-14 13:14:49,486 - api_routes - INFO - Router initialized with LLM API instance
691
- 2025-01-14 13:14:49,490 - main - INFO - FastAPI application created successfully
692
- 2025-01-14 13:14:56,382 - api_routes - INFO - Received request to initialize model: PowerInfer/SmallThinker-3B-Preview
693
- 2025-01-14 13:14:56,383 - llm_api - INFO - Initializing generation model: PowerInfer/SmallThinker-3B-Preview
694
- 2025-01-14 13:14:56,383 - llm_api - INFO - Loading model from local path: main/models/SmallThinker-3B-Preview
695
- 2025-01-14 13:15:07,065 - llm_api - INFO - Successfully initialized generation model: PowerInfer/SmallThinker-3B-Preview
696
- 2025-01-14 13:15:07,065 - api_routes - INFO - Successfully initialized model: PowerInfer/SmallThinker-3B-Preview
697
- 2025-01-14 13:15:07,065 - llm_api - INFO - INFO: 127.0.0.1:40472 - "POST /api/v1/model/initialize?model_name=PowerInfer%2FSmallThinker-3B-Preview HTTP/1.1" 200 OK
698
- 2025-01-14 13:16:09,874 - api_routes - INFO - Received generation request for prompt: Tell me about yourself and your capabilities...
699
- 2025-01-14 13:16:09,874 - llm_api - DEBUG - Generating response for prompt: Tell me about yourself and your capabilities...
700
- 2025-01-14 13:17:03,595 - llm_api - DEBUG - Generated response: I'm an AI developed by Alibaba, designed to assist...
701
- 2025-01-14 13:17:03,595 - api_routes - INFO - Successfully generated response
702
- 2025-01-14 13:17:03,595 - llm_api - INFO - INFO: 127.0.0.1:44786 - "POST /api/v1/generate HTTP/1.1" 200 OK
703
- 2025-01-14 13:18:04,891 - main - INFO - LLM API server started on 0.0.0.0:8001
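
Note on the initialization failure logged at 13:13:31 above: bitsandbytes raises that error when an 8-bit quantized model does not fully fit in GPU memory, and the message itself names the fix, namely setting `llm_int8_enable_fp32_cpu_offload=True` and passing a `device_map` to `from_pretrained`. The retry after the 13:14:36 restart happened to succeed, but a minimal sketch of the suggested workaround follows. This is an assumption about how the load call could be written with transformers, not code from this repository; the model path is taken from the log and the rest is illustrative.

# Sketch only: illustrative workaround for the bitsandbytes offload error logged above.
# Assumes transformers + bitsandbytes are installed; not part of this repo's code.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # keep modules that spill to CPU in fp32
)

model = AutoModelForCausalLM.from_pretrained(
    "main/models/SmallThinker-3B-Preview",  # local path seen in the log above
    quantization_config=quant_config,
    device_map="auto",  # or a custom dict pinning specific layers to "cpu"
)
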
main/main.py ADDED
@@ -0,0 +1,179 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from typing import Optional, Dict, Any, Union
4
+ import torch
5
+ import logging
6
+ from pathlib import Path
7
+ from litgpt.api import LLM
8
+ import os
9
+ import uvicorn
10
+
11
+ # Set up logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ app = FastAPI(title="LLM Engine Service")
16
+
17
+ # Global variable to store the LLM instance
18
+ llm_instance = None
19
+
20
+ class InitializeRequest(BaseModel):
21
+ """
22
+ Configuration for model initialization including model path
23
+ """
24
+ mode: str = "cpu"
25
+ precision: Optional[str] = None
26
+ quantize: Optional[str] = None
27
+ gpu_count: Union[str, int] = "auto"
28
+ model_path: str
29
+
30
+ class GenerateRequest(BaseModel):
31
+ prompt: str
32
+ max_new_tokens: int = 50
33
+ temperature: float = 1.0
34
+ top_k: Optional[int] = None
35
+ top_p: float = 1.0
36
+ return_as_token_ids: bool = False
37
+ stream: bool = False
38
+
39
+ @app.post("/initialize")
40
+ async def initialize_model(request: InitializeRequest):
41
+ """
42
+ Initialize the LLM model with specified configuration.
43
+ """
44
+ global llm_instance
45
+
46
+ try:
47
+ if request.precision is None and request.quantize is None:
48
+ # Use auto distribution from load when no specific precision or quantization is set
49
+ llm_instance = LLM.load(
50
+ model=request.model_path,
51
+ distribute="auto" # Let the load function handle distribution automatically
52
+ )
53
+
54
+ logger.info(
55
+ f"Model initialized with auto settings:\n"
56
+ f"Model Path: {request.model_path}\n"
57
+ f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
58
+ f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
59
+ )
60
+ else:
61
+ # Original initialization path for when specific settings are requested
62
+ llm_instance = LLM.load(
63
+ model=request.model_path,
64
+ distribute=None # We'll distribute manually
65
+ )
66
+
67
+ # Distribute the model according to the configuration
68
+ llm_instance.distribute(
69
+ accelerator="cuda" if request.mode == "gpu" else "cpu",
70
+ devices=request.gpu_count,
71
+ precision=request.precision,
72
+ quantize=request.quantize
73
+ )
74
+
75
+ logger.info(
76
+ f"Model initialized successfully with config:\n"
77
+ f"Mode: {request.mode}\n"
78
+ f"Precision: {request.precision}\n"
79
+ f"Quantize: {request.quantize}\n"
80
+ f"GPU Count: {request.gpu_count}\n"
81
+ f"Model Path: {request.model_path}\n"
82
+ f"Current GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f}GB allocated, "
83
+ f"{torch.cuda.memory_reserved()/1024**3:.2f}GB reserved"
84
+ )
85
+
86
+ return {"success": True, "message": "Model initialized successfully"}
87
+
88
+ except Exception as e:
89
+ logger.error(f"Error initializing model: {str(e)}")
90
+ # Print detailed memory statistics on failure
91
+ logger.error(f"GPU Memory Stats:\n"
92
+ f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f}GB\n"
93
+ f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f}GB\n"
94
+ f"Max Allocated: {torch.cuda.max_memory_allocated()/1024**3:.2f}GB")
95
+ raise HTTPException(status_code=500, detail=f"Error initializing model: {str(e)}")
96
+
97
+ @app.post("/generate")
98
+ async def generate(request: GenerateRequest):
99
+ """
100
+ Generate text using the initialized model.
101
+ """
102
+ global llm_instance
103
+
104
+ if llm_instance is None:
105
+ raise HTTPException(status_code=400, detail="Model not initialized. Call /initialize first.")
106
+
107
+ try:
108
+ if request.stream:
109
+ # For streaming responses, we need to handle differently
110
+ # This is a placeholder as the actual streaming implementation
111
+ # would need to use StreamingResponse from FastAPI
112
+ raise HTTPException(
113
+ status_code=400,
114
+ detail="Streaming is not currently supported through the API"
115
+ )
116
+
117
+ generated_text = llm_instance.generate(
118
+ prompt=request.prompt,
119
+ max_new_tokens=request.max_new_tokens,
120
+ temperature=request.temperature,
121
+ top_k=request.top_k,
122
+ top_p=request.top_p,
123
+ return_as_token_ids=request.return_as_token_ids,
124
+ stream=False # Force stream to False for now
125
+ )
126
+
127
+ response = {
128
+ "generated_text": generated_text if not request.return_as_token_ids else generated_text.tolist(),
129
+ "metadata": {
130
+ "prompt": request.prompt,
131
+ "max_new_tokens": request.max_new_tokens,
132
+ "temperature": request.temperature,
133
+ "top_k": request.top_k,
134
+ "top_p": request.top_p
135
+ }
136
+ }
137
+
138
+ return response
139
+
140
+ except Exception as e:
141
+ logger.error(f"Error generating text: {str(e)}")
142
+ raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")
143
+
144
+ @app.get("/health")
145
+ async def health_check():
146
+ """
147
+ Check if the service is running and model is loaded.
148
+ """
149
+ global llm_instance
150
+
151
+ status = {
152
+ "status": "healthy",
153
+ "model_loaded": llm_instance is not None,
154
+ }
155
+
156
+ if llm_instance is not None:
157
+ status["model_info"] = {
158
+ "model_path": llm_instance.config.name,
159
+ "device": str(next(llm_instance.model.parameters()).device)
160
+ }
161
+
162
+ return status
163
+
164
+ def main():
165
+ # Load environment variables or configuration here
166
+ host = os.getenv("LLM_ENGINE_HOST", "0.0.0.0")
167
+ port = int(os.getenv("LLM_ENGINE_PORT", "8001"))
168
+
169
+ # Start the server
170
+ uvicorn.run(
171
+ app,
172
+ host=host,
173
+ port=port,
174
+ log_level="info",
175
+ reload=False
176
+ )
177
+
178
+ if __name__ == "__main__":
179
+ main()
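
The added service above exposes /initialize, /generate, and /health. A minimal client sketch follows, assuming the server is listening on localhost:8001 as in main(); the checkpoint path is hypothetical and only illustrates the expected request shapes.

# Sketch only: exercising the endpoints defined in main/main.py above.
# Assumes the service is running on localhost:8001; the checkpoint path is hypothetical.
import requests

BASE = "http://localhost:8001"

# With precision and quantize left unset, the server takes the LLM.load(distribute="auto") path.
print(requests.post(f"{BASE}/initialize", json={
    "model_path": "checkpoints/my-model",  # hypothetical litgpt checkpoint directory
}).json())

# Plain (non-streaming) generation; the endpoint rejects stream=True for now.
resp = requests.post(f"{BASE}/generate", json={
    "prompt": "Tell me about yourself and your capabilities",
    "max_new_tokens": 100,
    "temperature": 0.7,
})
print(resp.json()["generated_text"])

# Health check reports whether a model is currently loaded.
print(requests.get(f"{BASE}/health").json())
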
main/resources/config.yaml DELETED
@@ -1,31 +0,0 @@
1
- server:
2
- host: "0.0.0.0"
3
- port: 8001
4
-
5
- model:
6
- base_path: "."
7
- generation:
8
- max_new_tokens: 500
9
- do_sample: true
10
- temperature: 0.2
11
- repetition_penalty: 1.1
12
- defaults:
13
- #model_name: "huihui-ai/Llama-3.2-3B-Instruct-abliterated"
14
- model_name: "microsoft/Phi-3.5-mini-instruct"
15
-
16
- folders:
17
- models: "main/models"
18
- cache: "main/.cache"
19
- logs: "main/logs"
20
-
21
- logging:
22
- level: "DEBUG" # DEBUG, INFO, WARNING, ERROR, CRITICAL
23
- format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
24
- file: "llm_api.log"
25
-
26
- api:
27
- version: "v1"
28
- prefix: ""
29
- cors:
30
- origins: ["*"]
31
- credentials: true
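
The removed config above drove logging, folder layout, and the default model for the old routes. A small sketch of how such a file could be read and applied follows; it assumes PyYAML, and the helper name is an illustration of the `load_config` referenced elsewhere in the removed code, not its actual implementation.

# Sketch only: loading a YAML config shaped like the one removed above.
# Assumes PyYAML is installed; the helper name and path handling are hypothetical.
import logging
import yaml

def load_config(path: str = "main/resources/config.yaml") -> dict:
    with open(path, "r") as f:
        return yaml.safe_load(f)

config = load_config()
logging.basicConfig(
    level=getattr(logging, config["logging"]["level"]),
    format=config["logging"]["format"],
)
model_name = config["model"]["defaults"]["model_name"]
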
main/routes.py DELETED
@@ -1,391 +0,0 @@
1
- # routes.py for the LLM Engine.
2
- # This file contains the FastAPI routes for the LLM Engine API.
3
- # It includes routes for generating text, generating embeddings, checking system status, and validating system configuration.
4
-
5
- from fastapi import APIRouter, HTTPException
6
- from fastapi.responses import StreamingResponse
7
- from pydantic import BaseModel
8
- from typing import Optional, List, Dict, Union
9
- from .api import LLMApi
10
- from .utils.logging import setup_logger
11
- from .utils.helpers import get_system_info, format_memory_size
12
- from .utils.validation import validate_model_path
13
- import psutil
14
- from pathlib import Path
15
-
16
- router = APIRouter()
17
- logger = None
18
- api = None
19
- config = None
20
-
21
- def init_router(config_dict: dict):
22
- """Initialize router with config and LLM API instance"""
23
- global logger, api, config
24
- config = config_dict
25
- logger = setup_logger(config, "api_routes")
26
- api = LLMApi(config)
27
- logger.info("Router initialized with LLM API instance")
28
-
29
- class GenerateRequest(BaseModel):
30
- prompt: str
31
- system_message: Optional[str] = None
32
- max_new_tokens: Optional[int] = None
33
-
34
- class EmbeddingRequest(BaseModel):
35
- text: str
36
-
37
- class EmbeddingResponse(BaseModel):
38
- embedding: List[float]
39
- dimension: int
40
-
41
- class SystemStatusResponse(BaseModel):
42
- """Pydantic model for system status response"""
43
- cpu: Optional[Dict[str, Union[float, str]]] = None
44
- memory: Optional[Dict[str, Union[float, str]]] = None
45
- gpu: Optional[Dict[str, Union[bool, str, float]]] = None
46
- storage: Optional[Dict[str, str]] = None
47
- model: Optional[Dict[str, Union[bool, str]]] = None
48
-
49
- class ValidationResponse(BaseModel):
50
- config_validation: Dict[str, bool]
51
- model_validation: Dict[str, bool]
52
- folder_validation: Dict[str, bool]
53
- overall_status: str
54
- issues: List[str]
55
-
56
- @router.get("/system/validate",
57
- response_model=ValidationResponse,
58
- summary="Validate System Configuration",
59
- description="Validates system configuration, folders, and model setup for both generation and embedding models")
60
- async def validate_system():
61
- """
62
- Validates:
63
- - Configuration parameters
64
- - Model setup for both generation and embedding models
65
- - Folder structure
66
- - Required permissions
67
- """
68
- logger.info("Starting system validation")
69
- issues = []
70
-
71
- # Validate configuration
72
- try:
73
- config_status = {
74
- "has_required_fields": True, # Check if all required config fields exist
75
- "valid_paths": True, # Check if paths are valid
76
- "valid_parameters": True # Check if parameters are within acceptable ranges
77
- }
78
-
79
- # Example validation checks
80
- if not api.models_path.exists():
81
- config_status["valid_paths"] = False
82
- issues.append("Models directory does not exist")
83
-
84
- if api.temperature < 0 or api.temperature > 2:
85
- config_status["valid_parameters"] = False
86
- issues.append("Temperature parameter out of valid range (0-2)")
87
-
88
- except Exception as e:
89
- logger.error(f"Configuration validation failed: {str(e)}")
90
- config_status = {"error": str(e)}
91
- issues.append(f"Config validation error: {str(e)}")
92
-
93
- # Validate model setup
94
- try:
95
- model_status = {
96
- "generation_model_files_exist": False,
97
- "generation_model_loadable": False,
98
- "embedding_model_files_exist": False,
99
- "embedding_model_loadable": False,
100
- "tokenizer_valid": False
101
- }
102
-
103
- if api.generation_model_name:
104
- gen_model_path = api.models_path / api.generation_model_name.split('/')[-1]
105
- model_status["generation_model_files_exist"] = validate_model_path(gen_model_path)
106
- model_status["generation_model_loadable"] = api.generation_model is not None
107
-
108
- if api.embedding_model_name:
109
- emb_model_path = api.models_path / api.embedding_model_name.split('/')[-1]
110
- model_status["embedding_model_files_exist"] = validate_model_path(emb_model_path)
111
- model_status["embedding_model_loadable"] = api.embedding_model is not None
112
-
113
- model_status["tokenizer_valid"] = (
114
- api.tokenizer is not None and api.embedding_tokenizer is not None
115
- )
116
-
117
- if not model_status["generation_model_files_exist"]:
118
- issues.append("Generation model files are missing or incomplete")
119
- if not model_status["embedding_model_files_exist"]:
120
- issues.append("Embedding model files are missing or incomplete")
121
-
122
- except Exception as e:
123
- logger.error(f"Model validation failed: {str(e)}")
124
- model_status = {"error": str(e)}
125
- issues.append(f"Model validation error: {str(e)}")
126
-
127
- # Validate folder structure and permissions
128
- try:
129
- folder_status = {
130
- "models_folder": api.models_path.exists(),
131
- "cache_folder": api.cache_path.exists(),
132
- "logs_folder": Path(api.base_path / "logs").exists(),
133
- "write_permissions": False
134
- }
135
-
136
- # Test write permissions by attempting to create a test file
137
- test_file = api.models_path / ".test_write"
138
- try:
139
- test_file.touch()
140
- test_file.unlink()
141
- folder_status["write_permissions"] = True
142
- except:
143
- folder_status["write_permissions"] = False
144
- issues.append("Insufficient write permissions in models directory")
145
-
146
- except Exception as e:
147
- logger.error(f"Folder validation failed: {str(e)}")
148
- folder_status = {"error": str(e)}
149
- issues.append(f"Folder validation error: {str(e)}")
150
-
151
- # Determine overall status
152
- if not issues:
153
- overall_status = "valid"
154
- elif len(issues) < 3:
155
- overall_status = "warning"
156
- else:
157
- overall_status = "invalid"
158
-
159
- validation_response = ValidationResponse(
160
- config_validation=config_status,
161
- model_validation=model_status,
162
- folder_validation=folder_status,
163
- overall_status=overall_status,
164
- issues=issues
165
- )
166
-
167
- logger.info(f"System validation completed with status: {overall_status}")
168
- return validation_response
169
-
170
- @router.get("/system/status",
171
- response_model=SystemStatusResponse,
172
- summary="Check System Status",
173
- description="Returns comprehensive system status including CPU, Memory, GPU, Storage, and Model information")
174
- async def check_system():
175
- """
176
- Get system status including:
177
- - CPU usage
178
- - Memory usage
179
- - GPU availability and usage
180
- - Storage status for model and cache directories
181
- - Current model status
182
- """
183
- logger.info("Checking system status")
184
- status = SystemStatusResponse()
185
- system_info = None
186
-
187
- # Check CPU and Memory
188
- try:
189
- system_info = get_system_info()
190
- status.cpu = {
191
- "usage_percent": system_info["cpu_percent"],
192
- "status": "healthy" if system_info["cpu_percent"] < 90 else "high"
193
- }
194
- logger.debug(f"CPU status retrieved: {status.cpu}")
195
- except Exception as e:
196
- logger.error(f"Failed to get CPU info: {str(e)}")
197
- status.cpu = {"status": "error", "message": str(e)}
198
-
199
- # Check Memory
200
- try:
201
- if not system_info:
202
- system_info = get_system_info()
203
- status.memory = {
204
- "usage_percent": system_info["memory_percent"],
205
- "status": "healthy" if system_info["memory_percent"] < 90 else "critical",
206
- "available": format_memory_size(psutil.virtual_memory().available)
207
- }
208
- logger.debug(f"Memory status retrieved: {status.memory}")
209
- except Exception as e:
210
- logger.error(f"Failed to get memory info: {str(e)}")
211
- status.memory = {"status": "error", "message": str(e)}
212
-
213
- # Check GPU
214
- try:
215
- if not system_info:
216
- system_info = get_system_info()
217
- status.gpu = {
218
- "available": system_info["gpu_available"],
219
- "memory_used": format_memory_size(system_info["gpu_memory_used"]),
220
- "memory_total": format_memory_size(system_info["gpu_memory_total"]),
221
- "utilization_percent": system_info["gpu_memory_used"] / system_info["gpu_memory_total"] * 100 if system_info["gpu_available"] else 0
222
- }
223
- logger.debug(f"GPU status retrieved: {status.gpu}")
224
- except Exception as e:
225
- logger.error(f"Failed to get GPU info: {str(e)}")
226
- status.gpu = {"status": "error", "message": str(e)}
227
-
228
- # Check Storage
229
- try:
230
- models_path = Path(api.models_path)
231
- cache_path = Path(api.cache_path)
232
- status.storage = {
233
- "models_directory": str(models_path),
234
- "models_size": format_memory_size(sum(f.stat().st_size for f in models_path.glob('**/*') if f.is_file())),
235
- "cache_directory": str(cache_path),
236
- "cache_size": format_memory_size(sum(f.stat().st_size for f in cache_path.glob('**/*') if f.is_file()))
237
- }
238
- logger.debug(f"Storage status retrieved: {status.storage}")
239
- except Exception as e:
240
- logger.error(f"Failed to get storage info: {str(e)}")
241
- status.storage = {"status": "error", "message": str(e)}
242
-
243
- # Check Model Status
244
- try:
245
- status.model = {
246
- "generation_model": {
247
- "is_loaded": api.generation_model is not None,
248
- "current_model": api.generation_model_name,
249
- "has_chat_template": api.has_chat_template() if api.generation_model else False
250
- },
251
- "embedding_model": {
252
- "is_loaded": api.embedding_model is not None,
253
- "current_model": api.embedding_model_name
254
- }
255
- }
256
- logger.debug(f"Model status retrieved: {status.model}")
257
- except Exception as e:
258
- logger.error(f"Failed to get model status: {str(e)}")
259
- status.model = {"status": "error", "message": str(e)}
260
-
261
- logger.info("System status check completed")
262
- return status
263
-
264
- @router.post("/generate")
265
- async def generate_text(request: GenerateRequest):
266
- """Generate text response from prompt"""
267
- logger.info(f"Received generation request for prompt: {request.prompt[:50]}...")
268
- try:
269
- response = api.generate_response(
270
- prompt=request.prompt,
271
- system_message=request.system_message,
272
- max_new_tokens=request.max_new_tokens or api.max_new_tokens
273
- )
274
- logger.info("Successfully generated response")
275
- return {"generated_text": response}
276
- except Exception as e:
277
- logger.error(f"Error in generate_text endpoint: {str(e)}")
278
- raise HTTPException(status_code=500, detail=str(e))
279
-
280
- @router.post("/generate/stream")
281
- async def generate_stream(request: GenerateRequest):
282
- """Generate streaming text response from prompt"""
283
- logger.info(f"Received streaming generation request for prompt: {request.prompt[:50]}...")
284
- try:
285
- async def event_generator():
286
- async for chunk in api.generate_stream(
287
- prompt=request.prompt,
288
- system_message=request.system_message,
289
- max_new_tokens=request.max_new_tokens or api.max_new_tokens
290
- ):
291
- yield f"data: {chunk}\n\n"
292
- yield "data: [DONE]\n\n"
293
-
294
- return StreamingResponse(
295
- event_generator(),
296
- media_type="text/event-stream",
297
- headers={
298
- "Cache-Control": "no-cache",
299
- "Connection": "keep-alive",
300
- }
301
- )
302
- except Exception as e:
303
- logger.error(f"Error in generate_stream endpoint: {str(e)}")
304
- raise HTTPException(status_code=500, detail=str(e))
305
-
306
- @router.post("/embedding", response_model=EmbeddingResponse)
307
- async def generate_embedding(request: EmbeddingRequest):
308
- """Generate embedding vector from text"""
309
- logger.info(f"Received embedding request for text: {request.text[:50]}...")
310
- try:
311
- embedding = api.generate_embedding(request.text)
312
- logger.info(f"Successfully generated embedding of dimension {len(embedding)}")
313
- return EmbeddingResponse(
314
- embedding=embedding,
315
- dimension=len(embedding)
316
- )
317
- except Exception as e:
318
- logger.error(f"Error in generate_embedding endpoint: {str(e)}")
319
- raise HTTPException(status_code=500, detail=str(e))
320
-
321
- @router.post("/model/download",
322
- summary="Download default or specified model",
323
- description="Downloads model files. Uses default model from config if none specified.")
324
- async def download_model(model_name: Optional[str] = None):
325
- """Download model files to local storage"""
326
- try:
327
- # Use model name from config if none provided
328
- model_to_download = model_name or config["model"]["defaults"]["model_name"]
329
- logger.info(f"Received request to download model: {model_to_download}")
330
-
331
- api.download_model(model_to_download)
332
- logger.info(f"Successfully downloaded model: {model_to_download}")
333
-
334
- return {
335
- "status": "success",
336
- "message": f"Model {model_to_download} downloaded",
337
- "model_name": model_to_download
338
- }
339
- except Exception as e:
340
- logger.error(f"Error downloading model: {str(e)}")
341
- raise HTTPException(status_code=500, detail=str(e))
342
-
343
- @router.post("/model/initialize",
344
- summary="Initialize default or specified model",
345
- description="Initialize model for use. Uses default model from config if none specified.")
346
- async def initialize_model(model_name: Optional[str] = None):
347
- """Initialize a model for use"""
348
- try:
349
- # Use model name from config if none provided
350
- model_to_init = model_name or config["model"]["defaults"]["model_name"]
351
- logger.info(f"Received request to initialize model: {model_to_init}")
352
-
353
- api.initialize_model(model_to_init)
354
- logger.info(f"Successfully initialized model: {model_to_init}")
355
-
356
- return {
357
- "status": "success",
358
- "message": f"Model {model_to_init} initialized",
359
- "model_name": model_to_init
360
- }
361
- except Exception as e:
362
- logger.error(f"Error initializing model: {str(e)}")
363
- raise HTTPException(status_code=500, detail=str(e))
364
-
365
- @router.post("/model/initialize/embedding",
366
- summary="Initialize embedding model",
367
- description="Initialize a separate model specifically for generating embeddings")
368
- async def initialize_embedding_model(model_name: Optional[str] = None):
369
- """Initialize a model specifically for embeddings"""
370
- try:
371
- # Use model name from config if none provided
372
- embedding_model = model_name or config["model"]["defaults"].get("embedding_model_name")
373
- if not embedding_model:
374
- raise HTTPException(
375
- status_code=400,
376
- detail="No embedding model specified and no default found in config"
377
- )
378
-
379
- logger.info(f"Received request to initialize embedding model: {embedding_model}")
380
-
381
- api.initialize_embedding_model(embedding_model)
382
- logger.info(f"Successfully initialized embedding model: {embedding_model}")
383
-
384
- return {
385
- "status": "success",
386
- "message": f"Embedding model {embedding_model} initialized",
387
- "model_name": embedding_model
388
- }
389
- except Exception as e:
390
- logger.error(f"Error initializing embedding model: {str(e)}")
391
- raise HTTPException(status_code=500, detail=str(e))
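
Note on the removed /generate/stream endpoint above: it emits Server-Sent Events, one "data: ..." line per chunk, followed by a final "data: [DONE]" marker. Below is a minimal client sketch for consuming that format. The base URL and the assumption that the router was mounted at the application root are illustrative only and are not taken from this diff.

import requests

BASE_URL = "http://localhost:8000"  # placeholder host/port, not specified anywhere in this change

def stream_generation(prompt: str, system_message: str = "You are a helpful assistant.") -> None:
    """Print chunks from the (removed) /generate/stream SSE endpoint as they arrive."""
    payload = {"prompt": prompt, "system_message": system_message}
    with requests.post(f"{BASE_URL}/generate/stream", json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip the blank separator lines between SSE events
            data = line[len("data: "):]
            if data == "[DONE]":
                break  # the endpoint signals completion with this sentinel
            print(data, end="", flush=True)

if __name__ == "__main__":
    stream_generation("Tell me what happens in a nuclear reactor.")

Using requests with stream=True keeps the connection open so tokens are printed as they are produced rather than after the full generation finishes.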
main/test_locally.py DELETED
@@ -1,56 +0,0 @@
-
-
- def test_locally(load_config, setup_logger, LLMApi):
-     """Run local tests for development and debugging"""
-     config = load_config()
-     logger = setup_logger(config, "test")
-     logger.info("Starting local tests")
-
-     api = LLMApi(config)
-     model_name = config["model"]["defaults"]["model_name"]
-
-     logger.info(f"Testing with model: {model_name}")
-
-     # Test download
-     logger.info("Testing model download...")
-     api.download_model(model_name)
-     logger.info("Download complete")
-
-     # Test initialization
-     logger.info("Initializing model...")
-     api.initialize_model(model_name)
-     logger.info("Model initialized")
-
-     # Test embedding
-     test_text = "Dette er en test av embeddings generering fra en teknisk tekst om HMS rutiner på arbeidsplassen."
-     logger.info("Testing embedding generation...")
-     embedding = api.generate_embedding(test_text)
-     logger.info(f"Generated embedding of length: {len(embedding)}")
-     logger.info(f"First few values: {embedding[:5]}")
-
-     # Test generation
-     test_prompts = [
-         "Tell me what happens in a nuclear reactor.",
-     ]
-
-     # Test regular generation
-     logger.info("Testing regular generation:")
-     for prompt in test_prompts:
-         logger.info(f"Prompt: {prompt}")
-         response = api.generate_response(
-             prompt=prompt,
-             system_message="You are a helpful assistant."
-         )
-         logger.info(f"Response: {response}")
-
-     # Test streaming generation
-     logger.info("Testing streaming generation:")
-     logger.info(f"Prompt: {test_prompts[0]}")
-     for chunk in api.generate_stream(
-         prompt=test_prompts[0],
-         system_message="You are a helpful assistant."
-     ):
-         print(chunk, end="", flush=True)
-     print("\n")
-
-     logger.info("Local tests completed")
main/utils/__init__.py DELETED
File without changes
main/utils/errors.py DELETED
@@ -1,94 +0,0 @@
- class ModelNotFoundError(Exception):
-     """Error raised when a model cannot be found or accessed"""
-     def __init__(self, model_name: str, original_error: Exception = None):
-         self.model_name = model_name
-         self.original_error = original_error
-
-         message = (
-             f"Could not find or access model: '{model_name}'\n\n"
-             f"This could be because:\n"
-             f"1. The model name is misspelled - double check the name\n"
-             f"2. The model requires authentication - you need to:\n"
-             f"   - Log in to Hugging Face (huggingface.co)\n"
-             f"   - Accept the model's terms of use on its page\n"
-             f"   - Create an access token in your HF account settings\n"
-             f"   - Set the token as an environment variable: export HUGGING_FACE_HUB_TOKEN=your_token\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class ModelLoadError(Exception):
-     """Error raised when a model fails to load"""
-     def __init__(self, model_name: str, load_type: str, original_error: Exception = None):
-         self.model_name = model_name
-         self.load_type = load_type
-         self.original_error = original_error
-
-         message = (
-             f"Failed to load model: '{model_name}' using {load_type} precision\n\n"
-             f"Common reasons:\n"
-             f"1. Not enough GPU memory - This model requires more VRAM than available\n"
-             f"   - Try using 8-bit quantization (load_in_8bit=True)\n"
-             f"   - Try using 4-bit quantization (load_in_4bit=True)\n"
-             f"   - Or use a smaller model\n"
-             f"2. Incorrect model parameters - Check the model card for correct loading parameters\n"
-             f"3. Corrupted model files - Try removing the model folder and downloading again\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class InvalidConfigurationError(Exception):
-     """Error raised when configuration is invalid"""
-     def __init__(self, param_name: str, current_value: any, expected_value: str, original_error: Exception = None):
-         self.param_name = param_name
-         self.current_value = current_value
-         self.expected_value = expected_value
-         self.original_error = original_error
-
-         message = (
-             f"Invalid configuration parameter: '{param_name}'\n\n"
-             f"Current value: {current_value}\n"
-             f"Expected value: {expected_value}\n\n"
-             f"Please update your config.yaml file with the correct value\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- class GenerationError(Exception):
-     """Error raised when text generation fails"""
-     def __init__(self, stage: str, original_error: Exception = None):
-         self.stage = stage
-         self.original_error = original_error
-
-         message = (
-             f"Text generation failed during {stage}\n\n"
-             f"This could be because:\n"
-             f"1. The model ran out of memory during generation\n"
-             f"   - Try reducing max_new_tokens\n"
-             f"   - Try reducing the input text length\n"
-             f"2. The input prompt might be too complex or long\n"
-             f"3. The model might be in an inconsistent state\n"
-             f"   - Try reinitializing the model\n\n"
-             f"Original error: {str(original_error)}"
-         )
-         super().__init__(message)
-
- # Usage examples:
- """
- # When model not found:
- raise ModelNotFoundError("mistralai/Mistral-7B-v0.1", original_error=e)
-
- # When model fails to load:
- raise ModelLoadError("mistralai/Mistral-7B-v0.1", "8-bit quantization", original_error=e)
-
- # When config is invalid:
- raise InvalidConfigurationError(
-     "temperature",
-     2.5,
-     "a value between 0.0 and 2.0",
-     original_error=e
- )
-
- # When generation fails:
- raise GenerationError("token generation", original_error=e)
- """
main/utils/helpers.py DELETED
@@ -1,44 +0,0 @@
- import psutil
- import torch
- from pathlib import Path
- from typing import Dict, Any
-
- import yaml
-
-
- def get_system_info() -> Dict[str, Any]:
-     """Get system resource information"""
-     return {
-         "cpu_percent": psutil.cpu_percent(),
-         "memory_percent": psutil.virtual_memory().percent,
-         "gpu_available": torch.cuda.is_available(),
-         "gpu_memory_used": torch.cuda.memory_allocated() if torch.cuda.is_available() else 0,
-         "gpu_memory_total": torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
-     }
-
- def calculate_optimal_batch_size(model_size: int, available_memory: int) -> int:
-     """Calculate optimal batch size based on model size and available memory"""
-     memory_per_sample = model_size * 1.5  # Rough estimate including overhead
-     return max(1, available_memory // memory_per_sample)
-
- def ensure_folder_structure(config: Dict) -> None:
-     """Ensure all necessary folders exist"""
-     folders = [
-         Path(config["folders"]["models"]),
-         Path(config["folders"]["cache"]),
-         Path(config["folders"]["logs"])
-     ]
-     for folder in folders:
-         folder.mkdir(parents=True, exist_ok=True)
-
- def format_memory_size(size_bytes: int) -> str:
-     """Format memory size to human readable format"""
-     for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
-         if size_bytes < 1024:
-             return f"{size_bytes:.2f}{unit}"
-         size_bytes /= 1024
-
- def load_config():
-     """Load configuration from yaml file"""
-     with open("main/resources/config.yaml", "r") as f:
-         return yaml.safe_load(f)
main/utils/logging.py DELETED
@@ -1,65 +0,0 @@
- import logging
- import sys
- from pathlib import Path
-
- class StreamToLogger:
-     """
-     Fake file-like stream object that redirects writes to a logger instance.
-     """
-     def __init__(self, logger, log_level=logging.INFO):
-         self.logger = logger
-         self.log_level = log_level
-         self.linebuf = ''
-         self.enabled = True
-
-     def write(self, buf):
-         if self.enabled:
-             for line in buf.rstrip().splitlines():
-                 self.logger.log(self.log_level, line.rstrip())
-
-     def flush(self):
-         pass
-
-     def enable(self):
-         self.enabled = True
-
-     def isatty(self):
-         return False
-
-     def disable(self):
-         self.enabled = False
-
- def setup_logger(config: dict, name: str = None) -> logging.Logger:
-     """Set up logger with configuration from config file."""
-     logger = logging.getLogger(name or __name__)
-
-     # Set level from config
-     level = getattr(logging, config["logging"]["level"].upper())
-     logger.setLevel(level)
-
-     # Create logs directory if it doesn't exist
-     log_path = Path(config["folders"]["logs"])
-     log_path.mkdir(exist_ok=True)
-
-     # Create handlers
-     file_handler = logging.FileHandler(log_path / config["logging"]["file"])
-     console_handler = logging.StreamHandler()
-
-     # Create formatter
-     formatter = logging.Formatter(config["logging"]["format"])
-     file_handler.setFormatter(formatter)
-     console_handler.setFormatter(formatter)
-
-     # Add handlers
-     logger.addHandler(file_handler)
-     logger.addHandler(console_handler)
-
-     # Redirect stdout to logger
-     stream_to_logger = StreamToLogger(logger, logging.INFO)
-     sys.stdout = stream_to_logger
-
-     # Add methods to enable/disable StreamToLogger
-     logger.enable_stream_to_logger = stream_to_logger.enable
-     logger.disable_stream_to_logger = stream_to_logger.disable
-
-     return logger
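
The removed setup_logger above takes all of its settings from the shared config dictionary: logging.level, logging.file and logging.format for the handlers, and folders.logs for the log directory. A minimal sketch of the shape it expects follows; the concrete values are placeholders, since the repository's config.yaml is not shown in this diff.

example_config = {
    "logging": {
        "level": "INFO",  # resolved via getattr(logging, level.upper())
        "file": "server.log",  # created inside the folders.logs directory
        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    },
    "folders": {
        "logs": "logs",  # created with mkdir(exist_ok=True) if missing
    },
}

# logger = setup_logger(example_config, "test")  # usage as seen in the removed main/test_locally.py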
main/utils/validation.py DELETED
@@ -1,56 +0,0 @@
- from typing import Dict, Any
- from pathlib import Path
- from dotenv import load_dotenv
- from huggingface_hub import login
- import os
-
- def validate_model_path(model_path: Path) -> bool:
-     """Validate that a model path exists and contains necessary files"""
-     if not model_path.exists():
-         return False
-     required_files = ['config.json', 'pytorch_model.bin']
-     return all((model_path / file).exists() for file in required_files)
-
- def validate_generation_params(params: Dict[str, Any]) -> Dict[str, Any]:
-     """Validate and normalize generation parameters"""
-     validated = params.copy()
-
-     # Ensure temperature is within bounds
-     if 'temperature' in validated:
-         validated['temperature'] = max(0.0, min(2.0, validated['temperature']))
-
-     # Ensure max_new_tokens is reasonable
-     if 'max_new_tokens' in validated:
-         validated['max_new_tokens'] = max(1, min(4096, validated['max_new_tokens']))
-
-     return validated
-
- def validate_hf(setup_logger, config):
-     """
-     Validate Hugging Face authentication.
-     Checks for .env file, loads environment variables, and attempts HF login if token exists.
-     """
-     logger = setup_logger(config, "hf_validation")
-
-     # Check for .env file
-     env_path = Path('.env')
-     if env_path.exists():
-         logger.info("Found .env file, loading environment variables")
-         load_dotenv()
-     else:
-         logger.warning("No .env file found. Fine if you're on Huggingface, but you need one to run locally on your PC.")
-
-     # Check for HF token
-     hf_token = os.getenv('HF_TOKEN')
-     if not hf_token:
-         logger.error("No HF_TOKEN found in environment variables")
-         return False
-
-     try:
-         # Attempt login
-         login(token=hf_token)
-         logger.info("Successfully authenticated with Hugging Face")
-         return True
-     except Exception as e:
-         logger.error(f"Failed to authenticate with Hugging Face: {str(e)}")
-         return False
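
For reference, the removed validate_generation_params clamps sampling parameters into fixed ranges rather than rejecting out-of-range requests: temperature to [0.0, 2.0] and max_new_tokens to [1, 4096]. A small worked example with made-up inputs:

raw_params = {"temperature": 2.5, "max_new_tokens": 10000}

# After clamping, the dictionary returned by the removed function would be:
# {"temperature": 2.0, "max_new_tokens": 4096}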
requirements.txt CHANGED
@@ -1,51 +1,7 @@
- accelerate==1.2.1
- annotated-types==0.7.0
- anyio==4.8.0
- bitsandbytes==0.45.0
- certifi==2024.12.14
- charset-normalizer==3.4.1
- click==8.1.8
- fastapi==0.115.6
- filelock==3.16.1
- fsspec==2024.12.0
- h11==0.14.0
- huggingface-hub==0.27.1
- idna==3.10
- Jinja2==3.1.5
- MarkupSafe==3.0.2
- mpmath==1.3.0
- networkx==3.4.2
- numpy==2.2.1
- nvidia-cublas-cu12==12.4.5.8
- nvidia-cuda-cupti-cu12==12.4.127
- nvidia-cuda-nvrtc-cu12==12.4.127
- nvidia-cuda-runtime-cu12==12.4.127
- nvidia-cudnn-cu12==9.1.0.70
- nvidia-cufft-cu12==11.2.1.3
- nvidia-curand-cu12==10.3.5.147
- nvidia-cusolver-cu12==11.6.1.9
- nvidia-cusparse-cu12==12.3.1.170
- nvidia-nccl-cu12==2.21.5
- nvidia-nvjitlink-cu12==12.4.127
- nvidia-nvtx-cu12==12.4.127
- packaging==24.2
- psutil==6.1.1
- pydantic==2.10.5
- pydantic_core==2.27.2
- python-dotenv==1.0.1
- PyYAML==6.0.2
- regex==2024.11.6
- requests==2.32.3
- safetensors==0.5.2
- setuptools==75.8.0
- sniffio==1.3.1
- starlette==0.41.3
- sympy==1.13.1
- tokenizers==0.21.0
- torch==2.5.1
- tqdm==4.67.1
- transformers==4.47.1
- triton==3.1.0
- typing_extensions==4.12.2
- urllib3==2.3.0
- uvicorn==0.34.0
+ fastapi==0.109.0
+ uvicorn==0.27.0
+ pydantic==2.5.3
+ torch==2.4.1
+ transformers==4.36.2
+ litgpt==0.5.3
+ python-dotenv==1.0.0