xfcxcxcdfdfd committed
Commit 295803f · verified · 1 Parent(s): 05109ee

Update app.py

Files changed (1): app.py (+78 −48)
app.py CHANGED
@@ -3,11 +3,10 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
-import gradio as gr
 import os
 from dotenv import load_dotenv
 from fastapi import FastAPI
-from fastapi.responses import JSONResponse
+from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
 from threading import Thread
 import psutil
@@ -18,6 +17,8 @@ from PIL import Image
 import stable_diffusion_cpp as sdcpp
 import base64
 import io
+import time
+from typing import AsyncGenerator
 
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -254,6 +255,7 @@ class ModelManager:
     def load_model(self, model_config):
         if model_config['name'] not in self.models and model_config['name'] != "flux1-schnell":
             try:
+                print(f"Loading model: {model_config['name']}")
                 self.models[model_config['name']] = Llama.from_pretrained(
                     repo_id=model_config['repo_id'],
                     filename=model_config['filename'],
@@ -261,11 +263,20 @@
                     n_threads=20,
                     use_gpu=False
                 )
+                print(f"Model loaded: {model_config['name']}")
+                # Load tokenizer after model load
+                if model_config['name'] not in global_data['tokenizers']:
+                    global_data['tokenizers'][model_config['name']] = self.models[model_config['name']].tokenizer()
+                    print(f"tokenizer loaded for: {model_config['name']}")
+                # load the eos token
+                global_data['eos'][model_config['name']] = self.models[model_config['name']].token_eos()
+                print(f"eos loaded for: {model_config['name']}")
             except Exception as e:
-                pass
+                print(f"Error loading model {model_config['name']}: {e}")
 
     def load_image_model(self, model_config):
         try:
+            print(f"Attempting to load image model with config: {model_config}")
             self.image_model = sdcpp.StableDiffusionCpp(
                 repo_id=model_config['repo_id'],
                 filename=model_config['filename'],
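This hunk caches each model's tokenizer and end-of-sequence token id in `global_data` at load time, so the streaming loop added later in the commit can reuse them per chunk instead of re-deriving them. A minimal sketch of that load-time caching pattern; `DummyModel` and the `registry` dict are illustrative stand-ins, not names from this repository:

```python
# Illustrative only: DummyModel mimics the two llama_cpp.Llama accessors the commit
# relies on (tokenizer() and token_eos()); registry mirrors global_data's layout.
class DummyModel:
    def tokenizer(self):
        return lambda text: text.split()   # placeholder tokenizer

    def token_eos(self):
        return 2                           # placeholder EOS token id


registry = {"models": {}, "tokenizers": {}, "eos": {}}


def load_model(name: str) -> None:
    if name in registry["models"]:
        return                             # already loaded, nothing to do
    model = DummyModel()                   # real code: Llama.from_pretrained(...)
    registry["models"][name] = model
    registry["tokenizers"][name] = model.tokenizer()
    registry["eos"][name] = model.token_eos()


load_model("example-model")
print(registry["eos"]["example-model"])    # -> 2
```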
@@ -273,6 +284,7 @@
                 n_threads=20,
                 use_gpu=False
             )
+            print(f"Image model loaded successfully: {self.image_model}")
         except Exception as e:
             print(f"Error loading image model: {e}")
 
@@ -320,9 +332,9 @@
 
 
 @cache_response
-def generate_model_response(model, inputs):
+def generate_model_response(model, inputs, max_tokens=9999999):
     try:
-        response = model(inputs, max_tokens=9999999)
+        response = model(inputs, max_tokens=max_tokens)
         return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
         return ""
@@ -334,24 +346,66 @@
         unique_responses[response['model']] = response['response']
     return unique_responses
 
-async def process_message(message):
+
+async def process_message(message: str):
     inputs = normalize_input(message)
-    with ThreadPoolExecutor() as executor:
-        futures = [
-            executor.submit(generate_model_response, model, inputs)
-            for model in global_data['models'].values()
-        ]
-        responses = [
-            {'model': model_name, 'response': future.result()}
-            for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
-        ]
-        unique_responses = remove_repetitive_responses(responses)
-        formatted_response = next(iter(unique_responses.values()))
-        return formatted_response
+
+    async def stream_response(inputs: str) -> AsyncGenerator[str, None]:
+        max_token_limit = 150
+        full_response = ""
+        current_inputs = inputs
+        eos_found = False
+
+        start_time = time.time()
+        while current_inputs and not eos_found:
+            with ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(generate_model_response, model, current_inputs, max_tokens=max_token_limit)
+                    for model in global_data['models'].values()
+                ]
+                responses = [
+                    {'model': model_name, 'response': future.result()}
+                    for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
+                ]
+            unique_responses = remove_repetitive_responses(responses)
+            formatted_response = next(iter(unique_responses.values()))
+
+            print(f"Generated chunk: {formatted_response}")
+
+
+            #tokenize the response
+            tokenizer = next(iter(global_data['tokenizers'].values()))
+            tokens = tokenizer.encode(formatted_response)
+
+
+            token_count = len(tokens)
+            chunk_size = 30 # Set token chunk size
+            for i in range(0, token_count, chunk_size):
+                chunk_tokens = tokens[i : i + chunk_size]
+                decoded_chunk = tokenizer.decode(chunk_tokens)
+                yield decoded_chunk
+
+            # Check for EOS token in decoded chunk
+
+            eos_token = next(iter(global_data['eos'].values()))
+            if eos_token in tokens:
+                eos_found = True
+                print(f"End of sequence token found")
+                break
+
+            full_response += formatted_response
+            current_inputs = formatted_response if len(formatted_response.split()) > 0 else ""
+
+        end_time = time.time()
+        print(f"Total time taken to process response {end_time-start_time}")
+
+    return StreamingResponse(stream_response(inputs), media_type="text/plain")
+
 
 async def generate_image(prompt: str):
     if global_data['image_model']:
         try:
+            print("Generating image with prompt:", prompt)
             image_bytes = global_data['image_model'].generate(
                 prompt=prompt,
                 negative_prompt="ugly, deformed, disfigured",
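The rewritten `process_message` wraps a nested async generator in a `StreamingResponse`: each pass asks every loaded model for up to 150 tokens, streams the winning response in 30-token chunks, and stops once the cached EOS token appears. A self-contained sketch of that chunked-streaming shape; `fake_generate` stands in for the llama_cpp call, and `asyncio.to_thread` is just one way to keep the blocking call off the event loop (the commit instead blocks inside the generator via `ThreadPoolExecutor`):

```python
import asyncio
from typing import AsyncGenerator

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()


def fake_generate(prompt: str) -> str:
    # Stand-in for generate_model_response(model, prompt, max_tokens=...)
    return ("echo " + prompt + " ") * 20


async def stream_chunks(prompt: str, chunk_size: int = 30) -> AsyncGenerator[str, None]:
    text = await asyncio.to_thread(fake_generate, prompt)   # run the blocking call off-loop
    words = text.split()                                     # stand-in for tokenizer.encode()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i : i + chunk_size]) + "\n"     # stand-in for tokenizer.decode()


@app.post("/demo_stream")
async def demo_stream(message: str):
    # Same pattern as the /generate endpoint: hand FastAPI an async generator.
    return StreamingResponse(stream_chunks(message), media_type="text/plain")
```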
@@ -364,6 +418,7 @@
             )
 
             image = Image.open(io.BytesIO(image_bytes))
+            print("Image generated successfully.")
             return image
         except Exception as e:
             print(f"Error generating image: {e}")
@@ -377,11 +432,11 @@
 
 @app.post("/generate")
 async def generate(request: ChatRequest):
-    try:
-        response = await process_message(request.message)
-        return JSONResponse(content={"response": response})
-    except Exception as e:
-        return JSONResponse(content={"error": str(e)})
+    try:
+        return await process_message(request.message)
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})
+
 
 @app.post("/generate_image")
 async def generate_image_endpoint(request: ImageRequest):
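Because `/generate` now returns the `StreamingResponse` built by `process_message` rather than a JSON body, callers have to read it incrementally. A hypothetical client sketch; `httpx`, the host, and the port are assumptions for illustration, not part of the commit:

```python
import httpx


def consume_generate_stream(message: str, base_url: str = "http://localhost:7860") -> str:
    """POST to /generate and print chunks as they arrive (assumed host/port)."""
    collected = []
    with httpx.stream("POST", f"{base_url}/generate",
                      json={"message": message}, timeout=None) as response:
        for chunk in response.iter_text():
            print(chunk, end="", flush=True)
            collected.append(chunk)
    return "".join(collected)


if __name__ == "__main__":
    consume_generate_stream("Tell me something short.")
```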
@@ -405,32 +460,7 @@
     except Exception as e:
         print(f"Error al ejecutar uvicorn: {e}")
 
-iface = gr.Interface(
-    fn=process_message,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
-    outputs=gr.Markdown(),
-    title="Multi-Model LLM & Image API (CPU Optimized)",
-    description="Optimized version using GPU and memory management techniques."
-)
-iface_image = gr.Interface(
-    fn=generate_image,
-    inputs=gr.Textbox(lines=2, placeholder="Enter image prompt here..."),
-    outputs=gr.Image(),
-    title="Stable Diffusion Image Generator",
-    description="Generate images using the specified stable diffusion model."
-)
-
-
-def run_gradio():
-    with gr.Blocks(title="Multi-Model LLM & Image API (CPU Optimized)") as demo:
-        with gr.Tab("LLM"):
-            iface.render()
-        with gr.Tab("Image Generator"):
-            iface_image.render()
-    demo.launch(server_port=7862, prevent_thread_lock=True)
-
 
 if __name__ == "__main__":
     Thread(target=run_uvicorn).start()
-    Thread(target=run_gradio).start()
     asyncio.get_event_loop().run_forever()
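With the Gradio UI removed, only the uvicorn thread plus `asyncio.get_event_loop().run_forever()` keep the process alive. `run_uvicorn`'s body is not part of this diff; a minimal sketch of what it likely wraps, with the host and port as assumptions:

```python
from threading import Thread

import uvicorn
from fastapi import FastAPI

app = FastAPI()                       # stand-in for the app defined in app.py


def run_uvicorn() -> None:
    try:
        # Assumed host/port; the real values live outside this diff.
        uvicorn.run(app, host="0.0.0.0", port=7860)
    except Exception as e:
        print(f"Error running uvicorn: {e}")


# Started on a background thread, as in the commit's __main__ block.
Thread(target=run_uvicorn, daemon=True).start()
```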
 