import os
import asyncio
from threading import Thread

import cachetools
import gradio as gr
import nltk
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')

load_dotenv()
# Token for gated/private Hugging Face downloads, if required.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Short-lived response cache: up to 100 entries, each evicted after 60 seconds.
cache = cachetools.TTLCache(maxsize=100, ttl=60)

global_data = {
    'models': {},
    # Literal special-token strings (Llama 3 style).
    'tokensxx': {
        'eos': '<|end_of_text|>',
        'pad': '<pad>',
        'padding': '<pad>',
        'unk': '<unk>',
        'bos': '<|begin_of_text|>',
        'sep': '<|sep|>',
        'cls': '<|cls|>',
        'mask': '<mask>',
        'eot': '<|eot_id|>',
        'eom': '<|eom_id|>',
        'lf': '<|0x0A|>'
    },
    # Attribute names used by tokenizer objects.
    'tokens': {
        'eos': 'eos_token', 'pad': 'pad_token', 'padding': 'padding_token',
        'unk': 'unk_token', 'bos': 'bos_token', 'sep': 'sep_token',
        'cls': 'cls_token', 'mask': 'mask_token'
    },
    'model_metadata': {},
    # Per-model special-token registries.
    'eos': {}, 'pad': {}, 'padding': {}, 'unk': {}, 'bos': {}, 'sep': {},
    'cls': {}, 'mask': {}, 'eot': {}, 'eom': {}, 'lf': {},
    # Per-model architecture and hyperparameter slots.
    'max_tokens': {}, 'tokenizers': {}, 'model_params': {}, 'model_size': {},
    'model_ftype': {}, 'n_ctx_train': {}, 'n_embd': {}, 'n_layer': {},
    'n_head': {}, 'n_head_kv': {}, 'n_rot': {}, 'n_swa': {},
    'n_embd_head_k': {}, 'n_embd_head_v': {}, 'n_gqa': {},
    'n_embd_k_gqa': {}, 'n_embd_v_gqa': {}, 'f_norm_eps': {},
    'f_norm_rms_eps': {}, 'f_clamp_kqv': {}, 'f_max_alibi_bias': {},
    'f_logit_scale': {}, 'n_ff': {}, 'n_expert': {}, 'n_expert_used': {},
    'causal_attn': {}, 'pooling_type': {}, 'rope_type': {},
    'rope_scaling': {}, 'freq_base_train': {}, 'freq_scale_train': {},
    'n_ctx_orig_yarn': {}, 'rope_finetuned': {}, 'ssm_d_conv': {},
    'ssm_d_inner': {}, 'ssm_d_state': {}, 'ssm_dt_rank': {},
    'ssm_dt_b_c_rms': {}, 'vocab_type': {}, 'model_type': {},
    # GGUF metadata keys: general.*
    "general.architecture": {}, "general.type": {}, "general.name": {},
    "general.finetune": {}, "general.basename": {},
    "general.size_label": {}, "general.license": {},
    "general.license.link": {}, "general.tags": {},
    "general.languages": {}, "general.organization": {},
    "general.base_model.count": {}, "general.file_type": {},
    # GGUF metadata keys: phi3.*
    "phi3.context_length": {},
    "phi3.rope.scaling.original_context_length": {},
    "phi3.embedding_length": {}, "phi3.feed_forward_length": {},
    "phi3.block_count": {}, "phi3.attention.head_count": {},
    "phi3.attention.head_count_kv": {},
    "phi3.attention.layer_norm_rms_epsilon": {},
    "phi3.rope.dimension_count": {}, "phi3.rope.freq_base": {},
    "phi3.attention.sliding_window": {},
    "phi3.rope.scaling.attn_factor": {},
    # GGUF metadata keys: llama.*
    "llama.block_count": {}, "llama.context_length": {},
    "llama.embedding_length": {}, "llama.feed_forward_length": {},
    "llama.attention.head_count": {}, "llama.attention.head_count_kv": {},
    "llama.rope.freq_base": {},
    "llama.attention.layer_norm_rms_epsilon": {},
    "llama.attention.key_length": {}, "llama.attention.value_length": {},
    "llama.vocab_size": {}, "llama.rope.dimension_count": {},
    # GGUF metadata keys: deepseek2.*
    "deepseek2.block_count": {}, "deepseek2.context_length": {},
    "deepseek2.embedding_length": {}, "deepseek2.feed_forward_length": {},
    "deepseek2.attention.head_count": {},
    "deepseek2.attention.head_count_kv": {},
    "deepseek2.rope.freq_base": {},
    "deepseek2.attention.layer_norm_rms_epsilon": {},
    "deepseek2.expert_used_count": {},
    "deepseek2.leading_dense_block_count": {},
    "deepseek2.vocab_size": {}, "deepseek2.attention.kv_lora_rank": {},
    "deepseek2.attention.key_length": {},
    "deepseek2.attention.value_length": {},
    "deepseek2.expert_feed_forward_length": {},
    "deepseek2.expert_count": {}, "deepseek2.expert_shared_count": {},
    "deepseek2.expert_weights_scale": {},
    "deepseek2.rope.dimension_count": {},
    "deepseek2.rope.scaling.type": {}, "deepseek2.rope.scaling.factor": {},
    "deepseek2.rope.scaling.yarn_log_multiplier": {},
    # GGUF metadata keys: qwen2.*
    "qwen2.block_count": {}, "qwen2.context_length": {},
    "qwen2.embedding_length": {}, "qwen2.feed_forward_length": {},
    "qwen2.attention.head_count": {}, "qwen2.attention.head_count_kv": {},
    "qwen2.rope.freq_base": {},
    "qwen2.attention.layer_norm_rms_epsilon": {},
    # GGUF metadata keys: tokenizer and quantization.
    "general.version": {}, "general.datasets": {},
    "tokenizer.ggml.model": {}, "tokenizer.ggml.pre": {},
    "tokenizer.ggml.tokens": {}, "tokenizer.ggml.token_type": {},
    "tokenizer.ggml.merges": {}, "tokenizer.ggml.bos_token_id": {},
    "tokenizer.ggml.eos_token_id": {},
    "tokenizer.ggml.unknown_token_id": {},
    "tokenizer.ggml.padding_token_id": {},
    "tokenizer.ggml.add_bos_token": {}, "tokenizer.ggml.add_eos_token": {},
    "tokenizer.ggml.add_space_prefix": {}, "tokenizer.chat_template": {},
    "quantize.imatrix.file": {}, "quantize.imatrix.dataset": {},
    "quantize.imatrix.entries_count": {},
    "quantize.imatrix.chunks_count": {},
    "general.quantization_version": {},
    # DeepSeek-style LoRA / MoE slots.
    'n_lora_q': {}, 'n_lora_kv': {}, 'n_expert_shared': {}, 'n_ff_exp': {},
    "n_layer_dense_lead": {}, "expert_weights_scale": {},
    "rope_yarn_log_mul": {},
    # Runtime and logging scratch slots.
    'eval': {}, 'time': {}, 'token': {}, 'tokens': {}, 'pads': {},
    'model': {}, 'base': {}, 'model_base': {}, 'perhaps': {}, 'word': {},
    'words': {}, 'start': {}, 'stop': {}, 'run': {}, 'runs': {}, 'ms': {},
    'vocabulary': {}, 'timeout': {}, 'load': {}, 'load_time': {},
    'bas': {}, 'tok': {}, 'second': {}, 'seconds': {}, 'graph': {},
    'load_model': {}, 'end': {}, 'llama_perf_context_print': {},
    'llm_load_print_meta': {}
}

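# A hedged sketch (not from the original source): fill the metadata slots
# above from a loaded model. It assumes llama-cpp-python exposes the parsed
# GGUF header as a plain dict on Llama.metadata, with keys such as
# "general.architecture"; getattr() guards against builds that do not.
def record_metadata(name, model):
    metadata = getattr(model, "metadata", None) or {}
    global_data['model_metadata'][name] = metadata
    for key, value in metadata.items():
        if key in global_data:
            global_data[key][name] = value
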
model_configs = [
    {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"},
    {"repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF", "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf", "name": "Llama-3.2-3B-Instruct"},
    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta-Llama-3.1-70B"},
    {"repo_id": "Hhhbvvkgh/Heidi-Llama-v4-Q2_K-GGUF", "filename": "heidi-llama-v4-q2_k.gguf", "name": "Heidi-Llama-V4"}
]

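# A minimal loading sketch: populate global_data['models'] from model_configs.
# Assumes llama-cpp-python's Llama.from_pretrained (which requires
# huggingface_hub and reads any access token from the environment);
# n_ctx=2048 is an illustrative value, not taken from the source.
def load_models():
    for config in model_configs:
        try:
            model = Llama.from_pretrained(
                repo_id=config["repo_id"],
                filename=config["filename"],
                n_ctx=2048,
                verbose=False
            )
            global_data['models'][config["name"]] = model
            record_metadata(config["name"], model)
        except Exception as e:
            print(f"Failed to load {config['name']}: {e}")

load_models()
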
def normalize_input(input_text):
    # Drop English stopwords to shorten the prompt before inference.
    stop_words = set(stopwords.words('english'))
    words = input_text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

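# Example: normalize_input("tell me a joke about the sea") returns
# "tell joke sea", since "me", "a", "about", and "the" are NLTK English
# stopwords.
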
async def generate_model_response(model, inputs):
    try:
        # llama_cpp.Llama is synchronous, so run the blocking call in a
        # worker thread; max_tokens=256 is an illustrative cap.
        result = await asyncio.to_thread(model, inputs, max_tokens=256)
        return result['choices'][0]['text'].strip()
    except Exception:
        # A failed model simply contributes an empty candidate.
        return ""

def get_best_response(responses, query):
    # Rank candidate completions by TF-IDF cosine similarity to the query.
    candidates = [r for r in responses if r]
    if not candidates:
        return ""
    matrix = TfidfVectorizer().fit_transform([query] + candidates)
    scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()
    return candidates[scores.argmax()]

async def process_message(message):
    inputs = normalize_input(message)
    # Serve repeated prompts from the TTL cache.
    if inputs in cache:
        return cache[inputs]
    tasks = [generate_model_response(model, inputs) for model in global_data['models'].values()]
    responses = await asyncio.gather(*tasks)
    best_response = get_best_response(responses, inputs)
    cache[inputs] = best_response
    return best_response

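# Example (from a synchronous context, once models are loaded):
#   print(asyncio.run(process_message("tell me a joke about the sea")))
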
app = FastAPI()

@app.post("/generate")
async def generate(request: Request):
    try:
        body = await request.json()
        response = await process_message(body['message'])
        return JSONResponse(content={"response": response})
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)

def run_uvicorn():
    uvicorn.run(app, host="0.0.0.0", port=7860)

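# Example request once the API is up (it listens on port 7860):
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"message": "tell me a joke about the sea"}'
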
iface = gr.Interface(
    fn=process_message,
    inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
    outputs=gr.Markdown(),
    title="Multi-Model LLM API (CPU Optimized)",
    description="Queries several quantized GGUF models in parallel and returns the highest-ranked response."
)

def run_gradio():
    iface.launch(server_port=7862, prevent_thread_lock=True)

if __name__ == "__main__":
    threads = [Thread(target=run_uvicorn), Thread(target=run_gradio)]
    for t in threads:
        t.start()
    # Join the server threads to keep the process alive.
    for t in threads:
        t.join()