Update app.py
app.py
CHANGED
@@ -3,11 +3,10 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
-import gradio as gr
 import os
 from dotenv import load_dotenv
 from fastapi import FastAPI
-from fastapi.responses import JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
 from threading import Thread
 import psutil
@@ -18,6 +17,8 @@ from PIL import Image
 import stable_diffusion_cpp as sdcpp
 import base64
 import io
+import time
+from typing import AsyncGenerator
 
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -254,6 +255,7 @@ class ModelManager:
     def load_model(self, model_config):
         if model_config['name'] not in self.models and model_config['name'] != "flux1-schnell":
             try:
+                print(f"Loading model: {model_config['name']}")
                 self.models[model_config['name']] = Llama.from_pretrained(
                     repo_id=model_config['repo_id'],
                     filename=model_config['filename'],
@@ -261,11 +263,20 @@ class ModelManager:
                     n_threads=20,
                     use_gpu=False
                 )
+                print(f"Model loaded: {model_config['name']}")
+                # Load tokenizer after model load
+                if model_config['name'] not in global_data['tokenizers']:
+                    global_data['tokenizers'][model_config['name']] = self.models[model_config['name']].tokenizer()
+                    print(f"tokenizer loaded for: {model_config['name']}")
+                # load the eos token
+                global_data['eos'][model_config['name']] = self.models[model_config['name']].token_eos()
+                print(f"eos loaded for: {model_config['name']}")
             except Exception as e:
-
+                print(f"Error loading model {model_config['name']}: {e}")
 
     def load_image_model(self, model_config):
         try:
+            print(f"Attempting to load image model with config: {model_config}")
             self.image_model = sdcpp.StableDiffusionCpp(
                 repo_id=model_config['repo_id'],
                 filename=model_config['filename'],
@@ -273,6 +284,7 @@ class ModelManager:
                 n_threads=20,
                 use_gpu=False
             )
+            print(f"Image model loaded successfully: {self.image_model}")
         except Exception as e:
             print(f"Error loading image model: {e}")
 
@@ -320,9 +332,9 @@ def cache_response(func):
 
 
 @cache_response
-def generate_model_response(model, inputs):
+def generate_model_response(model, inputs, max_tokens=9999999):
     try:
-        response = model(inputs, max_tokens=
+        response = model(inputs, max_tokens=max_tokens)
         return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
         return ""
@@ -334,24 +346,66 @@ def remove_repetitive_responses(responses):
         unique_responses[response['model']] = response['response']
     return unique_responses
 
-
+
+async def process_message(message: str):
     inputs = normalize_input(message)
-
-
-
-
-
-
-
-
-
-
-
-
+
+    async def stream_response(inputs: str) -> AsyncGenerator[str, None]:
+        max_token_limit = 150
+        full_response = ""
+        current_inputs = inputs
+        eos_found = False
+
+        start_time = time.time()
+        while current_inputs and not eos_found:
+            with ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(generate_model_response, model, current_inputs, max_tokens=max_token_limit)
+                    for model in global_data['models'].values()
+                ]
+                responses = [
+                    {'model': model_name, 'response': future.result()}
+                    for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
+                ]
+            unique_responses = remove_repetitive_responses(responses)
+            formatted_response = next(iter(unique_responses.values()))
+
+            print(f"Generated chunk: {formatted_response}")
+
+
+            #tokenize the response
+            tokenizer = next(iter(global_data['tokenizers'].values()))
+            tokens = tokenizer.encode(formatted_response)
+
+
+            token_count = len(tokens)
+            chunk_size = 30 # Set token chunk size
+            for i in range(0, token_count, chunk_size):
+                chunk_tokens = tokens[i : i + chunk_size]
+                decoded_chunk = tokenizer.decode(chunk_tokens)
+                yield decoded_chunk
+
+            # Check for EOS token in decoded chunk
+
+            eos_token = next(iter(global_data['eos'].values()))
+            if eos_token in tokens:
+                eos_found = True
+                print(f"End of sequence token found")
+                break
+
+            full_response += formatted_response
+            current_inputs = formatted_response if len(formatted_response.split()) > 0 else ""
+
+        end_time = time.time()
+        print(f"Total time taken to process response {end_time-start_time}")
+
+    return StreamingResponse(stream_response(inputs), media_type="text/plain")
+
 
 async def generate_image(prompt: str):
     if global_data['image_model']:
         try:
+            print("Generating image with prompt:", prompt)
             image_bytes = global_data['image_model'].generate(
                 prompt=prompt,
                 negative_prompt="ugly, deformed, disfigured",
@@ -364,6 +418,7 @@ async def generate_image(prompt: str):
             )
 
             image = Image.open(io.BytesIO(image_bytes))
+            print("Image generated successfully.")
             return image
         except Exception as e:
             print(f"Error generating image: {e}")
@@ -377,11 +432,11 @@ app = FastAPI()
 
 @app.post("/generate")
 async def generate(request: ChatRequest):
-
-
-
-
-
+    try:
+        return await process_message(request.message)
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})
+
 
 @app.post("/generate_image")
 async def generate_image_endpoint(request: ImageRequest):
@@ -405,32 +460,7 @@ def run_uvicorn():
     except Exception as e:
         print(f"Error al ejecutar uvicorn: {e}")
 
-iface = gr.Interface(
-    fn=process_message,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
-    outputs=gr.Markdown(),
-    title="Multi-Model LLM & Image API (CPU Optimized)",
-    description="Optimized version using GPU and memory management techniques."
-)
-iface_image = gr.Interface(
-    fn=generate_image,
-    inputs=gr.Textbox(lines=2, placeholder="Enter image prompt here..."),
-    outputs=gr.Image(),
-    title="Stable Diffusion Image Generator",
-    description="Generate images using the specified stable diffusion model."
-)
-
-
-def run_gradio():
-    with gr.Blocks(title="Multi-Model LLM & Image API (CPU Optimized)") as demo:
-        with gr.Tab("LLM"):
-            iface.render()
-        with gr.Tab("Image Generator"):
-            iface_image.render()
-    demo.launch(server_port=7862, prevent_thread_lock=True)
-
 
 if __name__ == "__main__":
     Thread(target=run_uvicorn).start()
-    Thread(target=run_gradio).start()
     asyncio.get_event_loop().run_forever()
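With the Gradio interface removed in this commit, the new streaming behaviour is reached only through the FastAPI /generate endpoint, which now returns a plain-text StreamingResponse of decoded token chunks. Below is a minimal client sketch, not part of the commit: the host and port are assumptions (run_uvicorn's configuration is outside this diff); the only detail taken from the code above is that ChatRequest carries a message field.

# Hypothetical client for the new streaming /generate endpoint.
# Host/port are assumptions; adjust to wherever run_uvicorn serves the app.
import requests

with requests.post(
    "http://localhost:8000/generate",   # assumed address, not shown in this diff
    json={"message": "Hello"},          # ChatRequest exposes a `message` field
    stream=True,
) as resp:
    resp.raise_for_status()
    # The endpoint streams decoded token chunks as text/plain.
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk, end="", flush=True)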