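"""DART-LLM Multi-Model Space: robot task planning with GGUF models.

Gradio app that loads QLoRA fine-tuned Llama checkpoints in GGUF format via
llama-cpp-python (on ZeroGPU), converts natural-language robot commands into
structured task sequences, and renders the resulting task dependency graph
(DAG) with DAGVisualizer.
"""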
import gradio as gr
import spaces # Import spaces module for ZeroGPU
from huggingface_hub import login
import os
from json_processor import JsonProcessor
from dag_visualizer import DAGVisualizer
import json
import re
# 1) Read Secrets
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected. Please check Space Settings → Secrets.")
# 2) Login to ensure all subsequent from_pretrained calls have proper permissions
login(hf_token)
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import warnings
warnings.filterwarnings("ignore")
# Model configurations for GGUF models
MODEL_CONFIGS = {
"1B": {
"name": "Dart-llm-model-1B",
"base_model": "meta-llama/Llama-3.2-1B", # For tokenizer
"gguf_model": "YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf",
"gguf_file": "llama_3.2_1b-lora-qlora-dart-llm_q5_k_m.gguf"
},
"3B": {
"name": "Dart-llm-model-3B",
"base_model": "meta-llama/Llama-3.2-3B", # For tokenizer
"gguf_model": "YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf",
"gguf_file": "llama_3.2_3b-lora-qlora-dart-llm_q4_k_m.gguf"
},
"8B": {
"name": "Dart-llm-model-8B",
"base_model": "meta-llama/Llama-3.1-8B", # For tokenizer
"gguf_model": "YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf",
"gguf_file": "llama3.1-8b-lora-qlora-dart-llm_q4_k_m_fp16.gguf"
}
}
DEFAULT_MODEL = "1B" # Set 1B as default
# Global variables to store model and tokenizer
llm_model = None
tokenizer = None
current_model_config = None
model_loaded = False
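# These globals act as a simple cache: a loaded model is reused across
# requests and only reloaded when the user switches to a different size.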
# Initialize DAG visualizer
dag_visualizer = DAGVisualizer()
def load_model_and_tokenizer(selected_model=DEFAULT_MODEL):
"""Load tokenizer - executed on CPU"""
global tokenizer, model_loaded, current_model_config
if model_loaded and current_model_config == selected_model:
return
print(f"πŸ”„ Loading tokenizer for {MODEL_CONFIGS[selected_model]['name']}...")
# Load tokenizer from base model
base_model = MODEL_CONFIGS[selected_model]["base_model"]
tokenizer = AutoTokenizer.from_pretrained(
base_model,
use_fast=False,
trust_remote_code=True
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
current_model_config = selected_model
model_loaded = True
print("βœ… Tokenizer loaded successfully!")
@spaces.GPU(duration=60) # Request GPU for loading model at startup
def load_gguf_model_on_gpu(selected_model=DEFAULT_MODEL):
"""Load GGUF model using llama-cpp-python"""
    global llm_model, current_model_config
# If model is already loaded and it's the same model, return it
if llm_model is not None and current_model_config == selected_model:
return llm_model
# Clear existing model if switching
if llm_model is not None:
print("πŸ—‘οΈ Clearing existing model from GPU...")
del llm_model
llm_model = None
model_config = MODEL_CONFIGS[selected_model]
print(f"πŸ”„ Loading {model_config['name']} GGUF model...")
try:
# Download GGUF model file from HuggingFace Hub
model_file = hf_hub_download(
repo_id=model_config["gguf_model"],
filename=model_config["gguf_file"],
cache_dir="./gguf_cache"
)
print(f"πŸ“¦ Downloaded GGUF file: {model_file}")
# Load GGUF model with llama-cpp-python
llm_model = Llama(
model_path=model_file,
n_ctx=2048, # Context length
n_gpu_layers=-1, # Use all GPU layers if available
verbose=False
)
print(f"βœ… {model_config['name']} GGUF model loaded successfully!")
return llm_model
except Exception as load_error:
print(f"❌ GGUF Model loading failed: {load_error}")
raise load_error
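# Sketch of the task-graph JSON this app expects the model to emit. Only the
# top-level "tasks" key is relied on below; the per-task fields are defined by
# JsonProcessor and the fine-tuning data, so the entry shown is hypothetical:
#
#   {
#     "tasks": [
#       {"task": "...", "dependencies": ["..."]}   # hypothetical fields
#     ]
#   }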
def process_json_in_response(response):
"""Process and format JSON content in the response, and generate DAG visualization"""
dag_image_path = None
try:
# Check if response contains JSON-like content
if '{' in response and '}' in response:
processor = JsonProcessor()
# Try to process the response for JSON content
processed_json = processor.process_response(response)
if processed_json:
# Format the JSON nicely
formatted_json = json.dumps(processed_json, indent=2, ensure_ascii=False)
# Generate DAG visualization if the JSON contains tasks
if "tasks" in processed_json and processed_json["tasks"]:
try:
dag_image_path = dag_visualizer.create_dag_visualization(
processed_json,
title="Robot Task Dependency Graph"
)
except Exception as e:
print(f"DAG visualization failed: {e}")
                # Replace the JSON part in the response
                json_pattern = r'\{.*\}'
match = re.search(json_pattern, response, re.DOTALL)
if match:
# Replace the matched JSON with the formatted version
response = response.replace(match.group(), formatted_json)
return response, dag_image_path
except Exception:
# If processing fails, return original response
return response, None
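# Example: process_json_in_response('Plan: {"tasks": [...]}') returns the text
# with the JSON pretty-printed, plus the path of the rendered DAG image (or
# None when no task graph could be drawn).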
@spaces.GPU(duration=60) # GPU inference
def generate_response_gpu(prompt, max_tokens=512, selected_model=DEFAULT_MODEL):
"""Generate response using GGUF model - executed on GPU"""
global llm_model
# Ensure model is loaded on GPU
if llm_model is None or current_model_config != selected_model:
llm_model = load_gguf_model_on_gpu(selected_model)
if llm_model is None:
return ("❌ GGUF Model failed to load. Please check the Space logs.", None)
try:
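        # Alpaca-style "### Instruction / ### Response" template; assumed to
        # match the prompt format used during QLoRA fine-tuning.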
formatted_prompt = (
"### Instruction:\n"
f"{prompt.strip()}\n\n"
"### Response:\n"
)
# Generate response using llama-cpp-python
output = llm_model(
formatted_prompt,
max_tokens=max_tokens,
stop=["### Instruction:", "###"],
echo=False,
temperature=0.1,
top_p=0.9,
repeat_penalty=1.1
)
# Extract the generated text
response = output['choices'][0]['text'].strip()
# Process JSON if present in response and generate DAG
response, dag_image_path = process_json_in_response(response)
return (response if response else "❌ No response generated. Please try again with a different prompt.", dag_image_path)
except Exception as generation_error:
return (f"❌ Generation Error: {str(generation_error)}", None)
def chat_interface(message, history, max_tokens, selected_model):
"""Chat interface - runs on CPU, calls GPU functions"""
if not message.strip():
return history, "", None
try:
# Call GPU function to generate response
response, dag_image_path = generate_response_gpu(message, max_tokens, selected_model)
history.append((message, response))
return history, "", dag_image_path
except Exception as chat_error:
error_msg = f"❌ Chat Error: {str(chat_error)}"
history.append((message, error_msg))
return history, "", None
# GGUF models bundle their own tokenizer, so no separate loading step is needed
# Create Gradio application
with gr.Blocks(
title="Robot Task Planning - DART-LLM Multi-Model",
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1200px;
margin: auto;
}
"""
) as app:
gr.Markdown("""
    # 🤖 DART-LLM Multi-Model - Robot Task Planning
    Choose from **three GGUF-quantized models** fine-tuned with QLoRA for **robot task planning**:
    - **🚀 Dart-llm-model-1B** (Default): Fastest inference, Q5_K_M quantization
    - **⚖️ Dart-llm-model-3B**: Balanced performance, Q4_K_M quantization
- **🎯 Dart-llm-model-8B**: Best quality output, Q4_K_M quantization
    **GGUF Implementation**: Uses the native GGUF format with llama-cpp-python for memory-efficient inference and GPU acceleration.
**Capabilities**:
- Convert natural language robot commands into structured task sequences
- **NEW: Automatic DAG Visualization** - Generates visual dependency graphs for robot task sequences
- Support for excavators, dump trucks, and other construction robots
**GGUF Models**:
- [YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf) (Default - Q5_K_M)
- [YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf) (Q4_K_M)
- [YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf) (Q4_K_M)
    ⚡ **Using ZeroGPU**: This Space uses dynamic GPU allocation (NVIDIA H200). The first generation may take longer while a GPU is allocated.
""")
with gr.Row():
with gr.Column(scale=2):
chatbot = gr.Chatbot(
label="Task Planning Results",
height=400,
show_label=True,
container=True,
bubble_full_width=False,
show_copy_button=True
)
msg = gr.Textbox(
label="Robot Command",
placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
lines=2,
max_lines=5,
show_label=True,
container=True
)
with gr.Row():
                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")
with gr.Column(scale=2):
dag_image = gr.Image(
label="Task Dependency Graph (DAG)",
show_label=True,
container=True,
height=400,
interactive=False
)
with gr.Column(scale=1):
gr.Markdown("### βš™οΈ Generation Settings")
model_selector = gr.Dropdown(
choices=[(config["name"], key) for key, config in MODEL_CONFIGS.items()],
value=DEFAULT_MODEL,
label="Model Size",
info="Select model size (1B = fastest, 8B = best quality)",
interactive=True
)
max_tokens = gr.Slider(
minimum=50,
                maximum=2000,  # keep within the n_ctx=2048 context window configured at load time
value=512,
step=10,
label="Max Tokens",
info="Maximum number of tokens to generate"
)
gr.Markdown("""
            ### 📊 Model Status
- **Hardware**: ZeroGPU (Dynamic Nvidia H200)
- **Status**: Ready
- **Note**: First generation allocates GPU resources
- **Dart-llm-model-1B**: Fastest inference (Default)
- **Dart-llm-model-3B**: Balanced speed/quality
- **Dart-llm-model-8B**: Best quality, slower
""")
# Example conversations
gr.Examples(
examples=[
"Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
"Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
"Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
"Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
"Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
"Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
],
inputs=msg,
label="πŸ’‘ Example Operator Commands"
)
# Event handling
msg.submit(
chat_interface,
inputs=[msg, chatbot, max_tokens, model_selector],
outputs=[chatbot, msg, dag_image]
)
send_btn.click(
chat_interface,
inputs=[msg, chatbot, max_tokens, model_selector],
outputs=[chatbot, msg, dag_image]
)
clear_btn.click(
lambda: ([], "", None),
outputs=[chatbot, msg, dag_image]
)
if __name__ == "__main__":
app.launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True
)