OjciecTadeusz committed
Commit d6b0a9b · verified · 1 Parent(s): 1fb73a8

Update app.py

Files changed (1): app.py (+61 -55)
app.py CHANGED
@@ -1,34 +1,28 @@
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- import json
  from fastapi import FastAPI, Request
  from fastapi.responses import JSONResponse
  import datetime
+ import requests
+ import os
+ import json
  import asyncio

  # Initialize FastAPI
  app = FastAPI()

- # Load model and tokenizer
- model_name = "Qwen/Qwen2.5-Coder-32B"
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
- # Configure model loading with specific parameters
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     device_map="auto",
-     trust_remote_code=True,
-     torch_dtype=torch.float16,
-     low_cpu_mem_usage=True
- )
+ # Configuration
+ API_URL = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-Coder-32B"
+ headers = {
+     "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
+     "Content-Type": "application/json"
+ }

- def format_chat_response(response_text, prompt_tokens, completion_tokens):
+ def format_chat_response(response_text, prompt_tokens=0, completion_tokens=0):
      return {
          "id": f"chatcmpl-{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}",
          "object": "chat.completion",
          "created": int(datetime.datetime.now().timestamp()),
-         "model": model_name,
+         "model": "Qwen/Qwen2.5-Coder-32B",
          "choices": [{
              "index": 0,
              "message": {
@@ -44,37 +38,48 @@ def format_chat_response(response_text, prompt_tokens, completion_tokens):
          }
      }

+ async def query_model(payload):
+     response = requests.post(API_URL, headers=headers, json=payload)
+     return response.json()
+
  @app.post("/v1/chat/completions")
  async def chat_completion(request: Request):
      try:
          data = await request.json()
          messages = data.get("messages", [])

-         # Convert messages to model input format
-         prompt = tokenizer.apply_chat_template(
-             messages,
-             tokenize=False,
-             add_generation_prompt=True
-         )
-
-         # Count prompt tokens
-         prompt_tokens = len(tokenizer.encode(prompt))
-
-         # Generate response
-         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=data.get("max_tokens", 2048),
-             temperature=data.get("temperature", 0.7),
-             top_p=data.get("top_p", 0.95),
-             do_sample=True
-         )
-
-         response_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
-         completion_tokens = len(tokenizer.encode(response_text))
+         # Prepare the payload for the Inference API
+         payload = {
+             "inputs": {
+                 "messages": messages
+             },
+             "parameters": {
+                 "max_new_tokens": data.get("max_tokens", 2048),
+                 "temperature": data.get("temperature", 0.7),
+                 "top_p": data.get("top_p", 0.95),
+                 "do_sample": True
+             }
+         }
+
+         # Get response from model
+         response = await query_model(payload)
+
+         if isinstance(response, dict) and "error" in response:
+             return JSONResponse(
+                 status_code=500,
+                 content={"error": response["error"]}
+             )
+
+         response_text = response[0]["generated_text"]

          return JSONResponse(
-             content=format_chat_response(response_text, prompt_tokens, completion_tokens)
+             content=format_chat_response(
+                 response_text,
+                 # Note: Actual token counts would need to be calculated differently
+                 # or obtained from the API response if available
+                 prompt_tokens=0,
+                 completion_tokens=0
+             )
          )
      except Exception as e:
          return JSONResponse(
@@ -82,26 +87,27 @@ async def chat_completion(request: Request):
              content={"error": str(e)}
          )

- # Synchronous function to generate response
+ # Synchronous function to generate response for Gradio
  def generate_response(messages):
-     # Convert messages to model input format
-     prompt = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
-
-     # Generate response
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=2048,
-         temperature=0.7,
-         top_p=0.95,
-         do_sample=True
-     )
-
-     return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+     payload = {
+         "inputs": {
+             "messages": messages
+         },
+         "parameters": {
+             "max_new_tokens": 2048,
+             "temperature": 0.7,
+             "top_p": 0.95,
+             "do_sample": True
+         }
+     }
+
+     response = requests.post(API_URL, headers=headers, json=payload)
+     result = response.json()
+
+     if isinstance(result, dict) and "error" in result:
+         return f"Error: {result['error']}"
+
+     return result[0]["generated_text"]

  # Gradio interface for testing
  def chat_interface(message, history):
@@ -126,7 +132,7 @@ def chat_interface(message, history):
  interface = gr.ChatInterface(
      chat_interface,
      title="Qwen2.5-Coder-32B Chat",
-     description="Chat with Qwen2.5-Coder-32B model. This Space also provides a /v1/chat/completions endpoint."
+     description="Chat with Qwen2.5-Coder-32B model via Hugging Face Inference API. This Space also provides a /v1/chat/completions endpoint."
  )

  # Mount both FastAPI and Gradio
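
A note on the new query_model helper: it is declared async but calls the blocking requests.post directly, so the event loop stalls for the duration of each Inference API round trip. A minimal non-blocking sketch using the standard library's asyncio.to_thread (Python 3.9+), reusing the API_URL and headers defined in the diff above; this is a suggestion, not part of the commit:

import asyncio
import requests

async def query_model(payload):
    # Run the blocking HTTP call in a worker thread so the event loop
    # stays free to serve other requests while waiting on the API.
    response = await asyncio.to_thread(
        requests.post, API_URL, headers=headers, json=payload
    )
    return response.json()

An async HTTP client such as httpx.AsyncClient would be another common way to achieve the same thing.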
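The payload shape may also deserve a second look: the serverless Inference API's text-generation task typically expects "inputs" to be a plain string, so the nested {"messages": ...} object sent here may be rejected. If it is, one workaround is to flatten the chat history into a prompt first. The template below is a deliberately naive illustration, not Qwen's actual chat template:

def messages_to_prompt(messages):
    # Naive flattening, for illustration only; a real deployment would
    # apply the model's own chat template instead.
    lines = [f"{m['role']}: {m['content']}" for m in messages]
    lines.append("assistant:")
    return "\n".join(lines)

payload = {
    "inputs": messages_to_prompt(messages),
    "parameters": {"max_new_tokens": 2048, "temperature": 0.7, "top_p": 0.95},
}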
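For testing the endpoint this commit exposes, a minimal client sketch. SPACE_URL is a placeholder rather than a value from the commit, and reading choices[0]["message"]["content"] assumes the elided part of format_chat_response fills the message object with the usual OpenAI-style role and content fields:

import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder, substitute the real Space URL

resp = requests.post(
    f"{SPACE_URL}/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Write hello world in Python."}],
        "max_tokens": 256,
        "temperature": 0.7,
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])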