Commit: Apply ZeroGPU

Files changed:
  app.py            +71 -130
  requirements.txt  +3 -2

app.py CHANGED
@@ -1,13 +1,12 @@
 import os
 import time
-import re
 import gc
 import threading
 from itertools import islice
 from datetime import datetime
 import gradio as gr
-from llama_cpp import Llama
-from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import hf_hub_download
 from duckduckgo_search import DDGS
 
@@ -17,126 +16,77 @@ from duckduckgo_search import DDGS
 cancel_event = threading.Event()
 
 # ------------------------------
-# Model Definitions and Global Variables
+# Model Definitions and Global Variables (PyTorch/Transformers)
+# ------------------------------
+# Here, the repo_id should point to a model checkpoint that is compatible with Hugging Face Transformers.
+# ------------------------------
+# Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
-REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
-
 MODELS = {
     "Taiwan-tinyllama-v1.0-chat (Q8_0)": {
-        "repo_id": "
-        "filename": "
-        "description": "Taiwan-tinyllama-v1.0-chat (Q8_0)"
+        "repo_id": "DavidLanz/Taiwan-tinyllama-v1.0-chat",
+        "description": "Taiwan-tinyllama-v1.0-chat (Q8_0) – Torch-compatible version converted from GGUF."
     },
     "Llama-3.2-Taiwan-3B-Instruct (Q4_K_M)": {
-        "repo_id": "
-        "filename": "
-        "description": "Llama-3.2-Taiwan-3B-Instruct (Q4_K_M)"
+        "repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct",
+        "description": "Llama-3.2-Taiwan-3B-Instruct (Q4_K_M) – Torch-compatible version converted from GGUF."
     },
     "MiniCPM3-4B (Q4_K_M)": {
-        "repo_id": "openbmb/MiniCPM3-4B-GGUF",
-        "filename": "
-        "description": "MiniCPM3-4B (Q4_K_M)"
+        "repo_id": "openbmb/MiniCPM3-4B",
+        "description": "MiniCPM3-4B (Q4_K_M) – Torch-compatible version converted from GGUF."
     },
     "Qwen2.5-3B-Instruct (Q4_K_M)": {
-        "repo_id": "Qwen/Qwen2.5-3B-Instruct-GGUF",
-        "filename": "
-        "description": "Qwen2.5-3B-Instruct (Q4_K_M)"
+        "repo_id": "Qwen/Qwen2.5-3B-Instruct",
+        "description": "Qwen2.5-3B-Instruct (Q4_K_M) – Torch-compatible version converted from GGUF."
     },
     "Qwen2.5-7B-Instruct (Q2_K)": {
-        "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
-        "filename": "
-        "description": "Qwen2.5-7B Instruct (Q2_K)"
+        "repo_id": "Qwen/Qwen2.5-7B-Instruct",
+        "description": "Qwen2.5-7B-Instruct (Q2_K) – Torch-compatible version converted from GGUF."
     },
     "Gemma-3-4B-IT (Q4_K_M)": {
-        "repo_id": "unsloth/gemma-3-4b-it-GGUF",
-        "filename": "
-        "description": "Gemma 3 4B IT (Q4_K_M)"
+        "repo_id": "unsloth/gemma-3-4b-it",
+        "description": "Gemma-3-4B-IT (Q4_K_M) – Torch-compatible version converted from GGUF."
     },
     "Phi-4-mini-Instruct (Q4_K_M)": {
-        "repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
-        "filename": "
-        "description": "Phi-4 Mini Instruct (Q4_K_M)"
+        "repo_id": "unsloth/Phi-4-mini-instruct",
+        "description": "Phi-4-mini-Instruct (Q4_K_M) – Torch-compatible version converted from GGUF."
     },
     "Meta-Llama-3.1-8B-Instruct (Q2_K)": {
-        "repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct-GGUF",
-        "filename": "
-        "description": "Meta-Llama-3.1-8B-Instruct (Q2_K)"
+        "repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct",
+        "description": "Meta-Llama-3.1-8B-Instruct (Q2_K) – Torch-compatible version converted from GGUF."
     },
     "DeepSeek-R1-Distill-Llama-8B (Q2_K)": {
-        "repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF",
-        "filename": "
-        "description": "DeepSeek-R1-Distill-Llama-8B (Q2_K)"
+        "repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B",
+        "description": "DeepSeek-R1-Distill-Llama-8B (Q2_K) – Torch-compatible version converted from GGUF."
    },
     "Mistral-7B-Instruct-v0.3 (IQ3_XS)": {
-        "repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3-GGUF",
-        "filename": "
-        "description": "Mistral-7B-Instruct-v0.3 (IQ3_XS)"
+        "repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3",
+        "description": "Mistral-7B-Instruct-v0.3 (IQ3_XS) – Torch-compatible version converted from GGUF."
     },
     "Qwen2.5-Coder-7B-Instruct (Q2_K)": {
-        "repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
-        "filename": "
-        "description": "Qwen2.5-Coder-7B-Instruct (Q2_K)"
+        "repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct",
+        "description": "Qwen2.5-Coder-7B-Instruct (Q2_K) – Torch-compatible version converted from GGUF."
     },
 }
 
+
 LOADED_MODELS = {}
 CURRENT_MODEL_NAME = None
 
 # ------------------------------
-# Model Loading Helper
+# Model Loading Helper Function (PyTorch/Transformers)
 # ------------------------------
-def try_load_model(model_path):
-    try:
-        return Llama(
-            model_path=model_path,
-            n_ctx=4096,
-            n_threads=2,
-            n_threads_batch=1,
-            n_batch=256,
-            n_gpu_layers=0,
-            use_mlock=True,
-            use_mmap=True,
-            verbose=False,
-            logits_all=True,
-            draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),
-        )
-    except Exception as e:
-        return str(e)
-
-def download_model(selected_model):
-    hf_hub_download(
-        repo_id=selected_model["repo_id"],
-        filename=selected_model["filename"],
-        local_dir="./models",
-        local_dir_use_symlinks=False,
-    )
-
-def validate_or_download_model(selected_model):
-    model_path = os.path.join("models", selected_model["filename"])
-    os.makedirs("models", exist_ok=True)
-    if not os.path.exists(model_path):
-        download_model(selected_model)
-    result = try_load_model(model_path)
-    if isinstance(result, str):
-        try:
-            os.remove(model_path)
-        except Exception:
-            pass
-        download_model(selected_model)
-        result = try_load_model(model_path)
-        if isinstance(result, str):
-            raise Exception(f"Model load failed: {result}")
-    return result
-
 def load_model(model_name):
     global LOADED_MODELS, CURRENT_MODEL_NAME
     if model_name in LOADED_MODELS:
         return LOADED_MODELS[model_name]
     selected_model = MODELS[model_name]
-    model = validate_or_download_model(selected_model)
-    LOADED_MODELS[model_name] = model
+    # Load both the model and tokenizer using the Transformers library.
+    model = AutoModelForCausalLM.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(selected_model["repo_id"], trust_remote_code=True)
+    LOADED_MODELS[model_name] = (model, tokenizer)
     CURRENT_MODEL_NAME = model_name
-    return model
+    return model, tokenizer
 
 # ------------------------------
 # Web Search Context Retrieval Function
@@ -155,18 +105,10 @@ def retrieve_context(query, max_results=6, max_chars_per_result=600):
     return ""
 
 # ------------------------------
-# Chat Response Generation (Streaming) with Cancellation
+# Chat Response Generation (Simulated Streaming) with Cancellation
 # ------------------------------
 def chat_response(user_message, chat_history, system_prompt, enable_search,
                   max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
-    """
-    Generator function that:
-      - Uses the chat history (list of dicts) from the Chatbot.
-      - Appends the new user message.
-      - Optionally retrieves web search context.
-      - Streams the assistant response token-by-token.
-      - Checks for cancellation.
-    """
     # Reset the cancellation event.
     cancel_event.clear()
 
@@ -194,7 +136,7 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     retrieved_context = ""
     debug_message = "Web search disabled."
 
-    # Augment prompt.
+    # Augment prompt with search context if available.
     if enable_search and retrieved_context:
         augmented_user_input = (
             f"{system_prompt.strip()}\n\n"
@@ -205,41 +147,44 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
     else:
         augmented_user_input = f"{system_prompt.strip()}\n\nUser Query: {user_message}"
 
-    #
-    messages = internal_history[:-1] + [{"role": "user", "content": augmented_user_input}]
-
-    # Load the model.
-    model = load_model(model_name)
-
-    # Add an empty assistant message.
+    # Append a placeholder for the assistant's response.
     internal_history.append({"role": "assistant", "content": ""})
-    assistant_message = ""
 
     try:
-
-
-
-
-
-
-
-
-
-
-
+        # Load the PyTorch model and tokenizer.
+        model, tokenizer = load_model(model_name)
+
+        # Tokenize the input prompt.
+        input_ids = tokenizer(augmented_user_input, return_tensors="pt").input_ids
+        with torch.no_grad():
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                repetition_penalty=repeat_penalty,
+                do_sample=True
+            )
+
+        # Decode the generated tokens.
+        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Strip the original prompt to isolate the assistant’s reply.
+        assistant_text = generated_text[len(augmented_user_input):].strip()
+
+        # Simulate streaming by yielding the output word by word.
+        words = assistant_text.split()
+        assistant_message = ""
+        for word in words:
             if cancel_event.is_set():
                 assistant_message += "\n\n[Response generation cancelled by user]"
                 internal_history[-1]["content"] = assistant_message
                 yield internal_history, debug_message
-
-
-
-
-
-            internal_history[-1]["content"] = assistant_message
-            yield internal_history, debug_message
-            if chunk["choices"][0].get("finish_reason", ""):
-                break
+                return
+            assistant_message += word + " "
+            internal_history[-1]["content"] = assistant_message
+            yield internal_history, debug_message
+            time.sleep(0.05)  # Short delay to simulate streaming
     except Exception as e:
         internal_history[-1]["content"] = f"Error: {e}"
         yield internal_history, debug_message
@@ -255,8 +200,8 @@ def cancel_generation():
 # ------------------------------
 # Gradio UI Definition
 # ------------------------------
-with gr.Blocks(title="Multi-GGUF LLM Inference") as demo:
-    gr.Markdown("## 🧠 Multi-GGUF LLM Inference with Web Search")
+with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
+    gr.Markdown("## 🧠 ZeroGPU LLM Inference with Web Search")
     gr.Markdown("Interact with the model. Select your model, set your system prompt, and adjust parameters on the left.")
 
     with gr.Row():
@@ -303,18 +248,14 @@ with gr.Blocks(title="Multi-GGUF LLM Inference") as demo:
         return [], "", ""
 
     clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
-
    cancel_button.click(fn=cancel_generation, outputs=search_debug)
 
-    # Submission that returns conversation and debug info.
     msg_input.submit(
        fn=chat_response,
        inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,
                max_results_number, max_chars_number, model_dropdown,
                max_tokens_slider, temperature_slider, top_k_slider, top_p_slider, repeat_penalty_slider],
        outputs=[chatbot, search_debug],
-        # Uncomment streaming=True if supported.
-        # streaming=True,
     )
 
 demo.launch()
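A note on the streaming change: the new implementation runs model.generate() to completion and then replays the decoded text word by word with time.sleep(0.05), which is why the header comment calls it simulated streaming; cancellation can only take effect between words, never inside generate() itself. Transformers supports genuine incremental output via TextIteratorStreamer. A minimal sketch, not part of this commit, assuming the (model, tokenizer) pair returned by load_model() above:

import threading
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, prompt, max_new_tokens, temperature, top_k, top_p, repeat_penalty):
    inputs = tokenizer(prompt, return_tensors="pt")
    # skip_prompt=True makes the streamer yield only newly generated text.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repeat_penalty,
        do_sample=True,
        streamer=streamer,
    )
    # generate() blocks until it finishes, so it runs in a worker thread
    # while the caller consumes decoded fragments as they are produced.
    threading.Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for fragment in streamer:
        partial += fragment
        yield partial

True mid-generation cancellation would additionally need a StoppingCriteria that checks cancel_event, since generate() has no cancel hook of its own.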
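Separately, the new prompt path tokenizes the raw concatenation of system prompt and user query, while every checkpoint listed in MODELS is an instruct model that expects its own chat markup. A hedged sketch of template-based prompt construction, assuming the same role/content message dicts the app already builds; apply_chat_template is the standard Transformers API for this:

messages = [
    {"role": "system", "content": system_prompt.strip()},
    {"role": "user", "content": user_message},
]
# apply_chat_template inserts the model-specific role markers and, with
# add_generation_prompt=True, appends the assistant turn header so
# generation starts in the right place.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
output_ids = model.generate(input_ids, max_new_tokens=max_tokens)
# Slicing off the prompt tokens is more robust than slicing the decoded
# string by len(augmented_user_input), as the commit does.
reply = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)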
requirements.txt CHANGED

@@ -5,7 +5,8 @@
 wheel
 jieba
 docopt
-llama-cpp-python --no-binary=:all: --global-option=build_ext --global-option="--cmake-args=-DGGML_CUDA=on"
 streamlit
 duckduckgo_search
-gradio
+gradio
+torch
+transformers