ZeroGPU-LLM-Inference

Sleeping

App Files Files Community

Luigi committed on Apr 12

Commit

ac8e9cc

1 Parent(s): f248fec

use chat pipeline instead of model and tokenizer individually

Browse files

Files changed (1) hide show

app.py +72 -92

app.py CHANGED Viewed

@@ -6,12 +6,11 @@ from itertools import islice
 from datetime import datetime
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 from duckduckgo_search import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
-# Disable GPU visibility if you wish to force CPU usage outside of GPU functions
-# (Not strictly needed for ZeroGPU as the decorator handles allocation)
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # ------------------------------
@@ -22,9 +21,6 @@ cancel_event = threading.Event()
 # ------------------------------
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
-# ------------------------------
-# Torch-Compatible Model Definitions (Cleaned)
-# ------------------------------
 MODELS = {
     "Taiwan-tinyllama-v1.0-chat": {
         "repo_id": "DavidLanz/Taiwan-tinyllama-v1.0-chat",
@@ -72,34 +68,35 @@ MODELS = {
     },
 }
-LOADED_MODELS = {}
-CURRENT_MODEL_NAME = None
-# ------------------------------
-# Model Loading Helper Function (PyTorch/Transformers)
-# ------------------------------
-def load_model(model_name):
-    global LOADED_MODELS, CURRENT_MODEL_NAME
-    if model_name in LOADED_MODELS:
-        return LOADED_MODELS[model_name]
     selected_model = MODELS[model_name]
-    # Load the model and tokenizer using Transformers.
-    model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
-    # If the pad token is missing or the same as the eos token, add a new pad token.
-    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
-        tokenizer.add_special_tokens({'pad_token': '<pad>'})
-        model.resize_token_embeddings(len(tokenizer))
-    LOADED_MODELS[model_name] = (model, tokenizer)
-    CURRENT_MODEL_NAME = model_name
-    return model, tokenizer
-# ------------------------------
-# Web Search Context Retrieval Function
-# ------------------------------
 def retrieve_context(query, max_results=6, max_chars_per_result=600):
     try:
         with DDGS() as ddgs:
             results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
@@ -113,23 +110,31 @@ def retrieve_context(query, max_results=6, max_chars_per_result=600):
         return ""
 # ------------------------------
-# Chat Response Generation with ZeroGPU
 # ------------------------------
-@spaces.GPU(duration=60)  # This decorator triggers GPU allocation for up to 60 seconds.
 def chat_response(user_message, chat_history, system_prompt, enable_search,
                   max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
-    # Reset the cancellation event.
     cancel_event.clear()
-    # Prepare internal chat history.
-    internal_history = list(chat_history) if chat_history else []
-    internal_history.append({"role": "user", "content": user_message})
-    # Retrieve web search context (with debug feedback).
     debug_message = ""
     if enable_search:
         debug_message = "Initiating web search..."
-        yield internal_history, debug_message
         search_result = [""]
         def do_search():
             search_result[0] = retrieve_context(user_message, max_results, max_chars)
@@ -139,71 +144,46 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
         retrieved_context = search_result[0]
         if retrieved_context:
             debug_message = f"Web search results:\n\n{retrieved_context}"
         else:
             debug_message = "Web search returned no results or timed out."
     else:
-        retrieved_context = ""
         debug_message = "Web search disabled."
-    # Augment the prompt with search context if available.
-    if enable_search and retrieved_context:
-        augmented_user_input = (
-            f"{system_prompt.strip()}\n\n"
-            "Use the following recent web search context to help answer the query:\n\n"
-            f"{retrieved_context}\n\n"
-            f"User Query: {user_message}"
-        )
-    else:
-        augmented_user_input = f"{system_prompt.strip()}\n\nUser Query: {user_message}"
     # Append a placeholder for the assistant's response.
-    internal_history.append({"role": "assistant", "content": ""})
     try:
-        # Load the model and tokenizer.
-        model, tokenizer = load_model(model_name)
-        # Move the model to GPU (using .to('cuda')) inside the GPU-decorated function.
-        model = model.to('cuda')
-        # Tokenize the augmented prompt with padding and retrieve the attention mask.
-        encoding = tokenizer(augmented_user_input, return_tensors="pt", padding=True)
-        input_ids = encoding["input_ids"].to('cuda')
-        attention_mask = encoding["attention_mask"].to('cuda')
-        with torch.no_grad():
-            output_ids = model.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                repetition_penalty=repeat_penalty,
-                do_sample=True,
-                pad_token_id=tokenizer.pad_token_id,
-            )
-        # Decode the generated tokens.
-        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-        # Remove the original prompt to isolate the assistant's reply.
-        assistant_text = generated_text[len(augmented_user_input):].strip()
-        # Simulate streaming output by yielding word-by-word.
-        words = assistant_text.split()
-        assistant_message = ""
-        for word in words:
-            if cancel_event.is_set():
-                assistant_message += "\n\n[Response generation cancelled by user]"
-                internal_history[-1]["content"] = assistant_message
-                yield internal_history, debug_message
-                return
-            assistant_message += word + " "
-            internal_history[-1]["content"] = assistant_message
-            yield internal_history, debug_message
-            time.sleep(0.05)  # Short delay to simulate streaming
     except Exception as e:
-        internal_history[-1]["content"] = f"Error: {e}"
-        yield internal_history, debug_message
-    gc.collect()
 # ------------------------------
 # Cancel Function
@@ -265,7 +245,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
     cancel_button.click(fn=cancel_generation, outputs=search_debug)
-    # Submission: the chat_response function is now decorated with @spaces.GPU.
     msg_input.submit(
         fn=chat_response,
         inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,

 from datetime import datetime
 import gradio as gr
 import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 from duckduckgo_search import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support
+# Optional: Disable GPU visibility if you wish to force CPU usage
 # os.environ["CUDA_VISIBLE_DEVICES"] = ""
 # ------------------------------
 # ------------------------------
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
     "Taiwan-tinyllama-v1.0-chat": {
         "repo_id": "DavidLanz/Taiwan-tinyllama-v1.0-chat",
     },
 }
+# Global cache for pipelines to avoid re-loading.
+PIPELINES = {}
+def load_pipeline(model_name):
+    """
+    Load and cache a transformers text-generation pipeline for a model.
+
+    Args:
+        model_name: Key into the module-level MODELS dict; the entry's
+            "repo_id" is passed as both the model and the tokenizer.
+
+    Returns:
+        The pipeline object for model_name. Repeat calls with the same
+        name return the instance already stored in PIPELINES.
+
+    Raises:
+        KeyError: If model_name is not a key of MODELS.
+    """
+    global PIPELINES
+    # Cache hit: reuse the pipeline built by an earlier call.
+    if model_name in PIPELINES:
+        return PIPELINES[model_name]
     selected_model = MODELS[model_name]
+    # Create a chat-style text-generation pipeline.
+    pipe = pipeline(
+        task="text-generation",
+        model=selected_model["repo_id"],
+        tokenizer=selected_model["repo_id"],
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+        device_map="auto"
+    )
+    # Store the pipeline so later calls for this name skip the load above.
+    PIPELINES[model_name] = pipe
+    return pipe
 def retrieve_context(query, max_results=6, max_chars_per_result=600):
+    """
+    Retrieve recent web search context for the given query using DuckDuckGo.
+    Returns a formatted string with search results.
+    """
     try:
         with DDGS() as ddgs:
             results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
         return ""
 # ------------------------------
+# Chat Response Generation with ZeroGPU using Pipeline
 # ------------------------------
+@spaces.GPU(duration=60)
 def chat_response(user_message, chat_history, system_prompt, enable_search,
                   max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
+    """
+    Generate a chat response by utilizing a transformers pipeline.
+    - Appends the user's message to the conversation history.
+    - Optionally retrieves web search context and inserts it as an additional system message.
+    - Uses a cached pipeline (loaded via load_pipeline) to generate a response.
+    - Yields the updated conversation history and a debug message (an interim update when web search starts, then the final result).
+    """
     cancel_event.clear()
+    # Build conversation list from chat history.
+    conversation = list(chat_history) if chat_history else []
+    conversation.append({"role": "user", "content": user_message})
+    # Retrieve web search context if enabled.
     debug_message = ""
+    retrieved_context = ""
     if enable_search:
         debug_message = "Initiating web search..."
+        yield conversation, debug_message
         search_result = [""]
         def do_search():
             search_result[0] = retrieve_context(user_message, max_results, max_chars)
         retrieved_context = search_result[0]
         if retrieved_context:
             debug_message = f"Web search results:\n\n{retrieved_context}"
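+            # Insert the search context as a system-role message at index 1 of the conversation.
+            # NOTE(review): system_prompt is not added to the conversation in the code shown here, so index 1
+            # is simply after the first history entry — confirm the intended placement.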
+            conversation.insert(1, {"role": "system", "content": f"Web search context:\n{retrieved_context}"})
         else:
             debug_message = "Web search returned no results or timed out."
     else:
         debug_message = "Web search disabled."
     # Append a placeholder for the assistant's response.
+    conversation.append({"role": "assistant", "content": ""})
     try:
+        # Load the pipeline (cached) for the selected model.
+        pipe = load_pipeline(model_name)
+        # Use the pipeline directly with conversation history.
+        # Note: Many chat pipelines use internal chat templating to properly format the conversation.
+        response = pipe(
+            conversation,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repeat_penalty,
+        )
+        # Extract the assistant's reply.
+        try:
+            assistant_text = response[0]["generated_text"][-1]["content"]
+        except (KeyError, IndexError, TypeError):
+            assistant_text = response[0]["generated_text"]
+        # Update the conversation history.
+        conversation[-1]["content"] = assistant_text
+        # Yield the complete conversation history and the debug message.
+        yield conversation, debug_message
     except Exception as e:
+        conversation[-1]["content"] = f"Error: {e}"
+        yield conversation, debug_message
+    finally:
+        gc.collect()
 # ------------------------------
 # Cancel Function
     clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
     cancel_button.click(fn=cancel_generation, outputs=search_debug)
+    # Submission: the chat_response function is now used with the Transformers pipeline.
     msg_input.submit(
         fn=chat_response,
         inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,