import gradio as gr
import spaces  # Import spaces module for ZeroGPU
from huggingface_hub import login
import os

from json_processor import JsonProcessor
from dag_visualizer import DAGVisualizer
import json

# 1) Read Secrets
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")

# 2) Login so that all subsequent from_pretrained calls have the proper permissions
login(hf_token)

from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import warnings

warnings.filterwarnings("ignore")
# Model configurations for the GGUF models
MODEL_CONFIGS = {
    "1B": {
        "name": "Dart-llm-model-1B",
        "base_model": "meta-llama/Llama-3.2-1B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama_3.2_1b-lora-qlora-dart-llm_q5_k_m.gguf"
    },
    "3B": {
        "name": "Dart-llm-model-3B",
        "base_model": "meta-llama/Llama-3.2-3B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama_3.2_3b-lora-qlora-dart-llm_q4_k_m.gguf"
    },
    "8B": {
        "name": "Dart-llm-model-8B",
        "base_model": "meta-llama/Llama-3.1-8B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama3.1-8b-lora-qlora-dart-llm_q4_k_m_fp16.gguf"
    }
}

DEFAULT_MODEL = "1B"  # Use the 1B model by default

# Global variables holding the model and tokenizer
llm_model = None
tokenizer = None
current_model_config = None
model_loaded = False

# Initialize the DAG visualizer
dag_visualizer = DAGVisualizer()
def load_model_and_tokenizer(selected_model=DEFAULT_MODEL):
    """Load the tokenizer - executed on CPU."""
    global tokenizer, model_loaded, current_model_config

    if model_loaded and current_model_config == selected_model:
        return

    print(f"🔄 Loading tokenizer for {MODEL_CONFIGS[selected_model]['name']}...")

    # Load the tokenizer from the base model
    base_model = MODEL_CONFIGS[selected_model]["base_model"]
    tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        use_fast=False,
        trust_remote_code=True
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    current_model_config = selected_model
    model_loaded = True
    print("✅ Tokenizer loaded successfully!")
def load_gguf_model_on_gpu(selected_model=DEFAULT_MODEL):
    """Load the GGUF model with llama-cpp-python (called lazily from the GPU inference path)."""
    global llm_model, current_model_config

    # If the requested model is already loaded, reuse it
    if llm_model is not None and current_model_config == selected_model:
        return llm_model

    # Clear the existing model when switching to a different one
    if llm_model is not None:
        print("🗑️ Clearing existing model from GPU...")
        del llm_model
        llm_model = None

    model_config = MODEL_CONFIGS[selected_model]
    print(f"🔄 Loading {model_config['name']} GGUF model...")
    try:
        # Download the GGUF model file from the Hugging Face Hub
        model_file = hf_hub_download(
            repo_id=model_config["gguf_model"],
            filename=model_config["gguf_file"],
            cache_dir="./gguf_cache"
        )
        print(f"📦 Downloaded GGUF file: {model_file}")

        # Load the GGUF model with llama-cpp-python
        llm_model = Llama(
            model_path=model_file,
            n_ctx=2048,        # Context length
            n_gpu_layers=-1,   # Offload all layers to GPU if available
            verbose=False
        )
        print(f"✅ {model_config['name']} GGUF model loaded successfully!")
        # Record which model is loaded so it is not reloaded on every call
        current_model_config = selected_model
        return llm_model
    except Exception as load_error:
        print(f"❌ GGUF model loading failed: {load_error}")
        raise
def process_json_in_response(response):
    """Format JSON content found in the response and generate a DAG visualization."""
    dag_image_path = None
    try:
        # Check whether the response contains JSON-like content
        if '{' in response and '}' in response:
            processor = JsonProcessor()
            # Try to extract structured JSON from the response
            processed_json = processor.process_response(response)
            if processed_json:
                # Pretty-print the JSON
                formatted_json = json.dumps(processed_json, indent=2, ensure_ascii=False)

                # Generate a DAG visualization if the JSON contains tasks
                if "tasks" in processed_json and processed_json["tasks"]:
                    try:
                        dag_image_path = dag_visualizer.create_dag_visualization(
                            processed_json,
                            title="Robot Task Dependency Graph"
                        )
                    except Exception as e:
                        print(f"DAG visualization failed: {e}")

                # Replace the raw JSON in the response with the formatted version
                import re
                json_pattern = r'\{.*\}'
                match = re.search(json_pattern, response, re.DOTALL)
                if match:
                    response = response.replace(match.group(), formatted_json)
        return response, dag_image_path
    except Exception:
        # If processing fails, return the original response
        return response, None
# GPU inference
@spaces.GPU  # ZeroGPU: request a GPU for the duration of this call (assumed intent, given the unused spaces import)
def generate_response_gpu(prompt, max_tokens=512, selected_model=DEFAULT_MODEL):
    """Generate a response with the GGUF model - executed on GPU."""
    global llm_model

    # Ensure the requested model is loaded on the GPU
    if llm_model is None or current_model_config != selected_model:
        llm_model = load_gguf_model_on_gpu(selected_model)

    if llm_model is None:
        return ("❌ GGUF model failed to load. Please check the Space logs.", None)

    try:
        formatted_prompt = (
            "### Instruction:\n"
            f"{prompt.strip()}\n\n"
            "### Response:\n"
        )

        # Generate a response with llama-cpp-python
        output = llm_model(
            formatted_prompt,
            max_tokens=max_tokens,
            stop=["### Instruction:", "###"],
            echo=False,
            temperature=0.1,
            top_p=0.9,
            repeat_penalty=1.1
        )

        # Extract the generated text
        response = output['choices'][0]['text'].strip()

        # Format any JSON in the response and build the DAG visualization
        response, dag_image_path = process_json_in_response(response)

        return (response if response else "❌ No response generated. Please try again with a different prompt.", dag_image_path)
    except Exception as generation_error:
        return (f"❌ Generation Error: {generation_error}", None)
def chat_interface(message, history, max_tokens, selected_model):
    """Chat interface - runs on CPU and calls the GPU functions."""
    if not message.strip():
        return history, "", None

    try:
        # Call the GPU function to generate a response
        response, dag_image_path = generate_response_gpu(message, max_tokens, selected_model)
        history.append((message, response))
        return history, "", dag_image_path
    except Exception as chat_error:
        error_msg = f"❌ Chat Error: {chat_error}"
        history.append((message, error_msg))
        return history, "", None

# GGUF models bundle their tokenizer, so no separate tokenizer loading is needed
# Create the Gradio application
with gr.Blocks(
    title="Robot Task Planning - DART-LLM Multi-Model",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    """
) as app:
    gr.Markdown("""
    # 🤖 DART-LLM Multi-Model - Robot Task Planning

    Choose from **three GGUF-quantized models** fine-tuned with QLoRA for **robot task planning**:

    - **🚀 Dart-llm-model-1B** (Default): Fastest inference, Q5_K_M quantization
    - **⚖️ Dart-llm-model-3B**: Balanced performance, Q4_K_M quantization
    - **🎯 Dart-llm-model-8B**: Best output quality, Q4_K_M quantization

    **GGUF implementation**: Uses the native GGUF format with llama-cpp-python for memory efficiency and GPU acceleration.

    **Capabilities**:
    - Convert natural-language robot commands into structured task sequences
    - **NEW: Automatic DAG visualization** - generates visual dependency graphs for robot task sequences
    - Support for excavators, dump trucks, and other construction robots

    **GGUF models**:
    - [YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf) (Default - Q5_K_M)
    - [YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf) (Q4_K_M)
    - [YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf) (Q4_K_M)

    ⚡ **ZeroGPU**: This Space uses dynamic GPU allocation (NVIDIA H200), so the first generation may take a bit longer.
    """)
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Task Planning Results",
                height=400,
                show_label=True,
                container=True,
                bubble_full_width=False,
                show_copy_button=True
            )
            msg = gr.Textbox(
                label="Robot Command",
                placeholder="Enter a robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
                lines=2,
                max_lines=5,
                show_label=True,
                container=True
            )
            with gr.Row():
                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")

        with gr.Column(scale=2):
            dag_image = gr.Image(
                label="Task Dependency Graph (DAG)",
                show_label=True,
                container=True,
                height=400,
                interactive=False
            )
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Settings")

            model_selector = gr.Dropdown(
                choices=[(config["name"], key) for key, config in MODEL_CONFIGS.items()],
                value=DEFAULT_MODEL,
                label="Model Size",
                info="Select a model size (1B = fastest, 8B = best quality)",
                interactive=True
            )

            max_tokens = gr.Slider(
                minimum=50,
                maximum=5000,
                value=512,
                step=10,
                label="Max Tokens",
                info="Maximum number of tokens to generate"
            )

            gr.Markdown("""
            ### 📊 Model Status
            - **Hardware**: ZeroGPU (dynamic NVIDIA H200)
            - **Status**: Ready
            - **Note**: The first generation allocates GPU resources
            - **Dart-llm-model-1B**: Fastest inference (Default)
            - **Dart-llm-model-3B**: Balanced speed/quality
            - **Dart-llm-model-8B**: Best quality, slower
            """)
    # Example conversations
    gr.Examples(
        examples=[
            "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
            "Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
            "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
            "Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
            "Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
            "Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
        ],
        inputs=msg,
        label="💡 Example Operator Commands"
    )
    # Event handling
    msg.submit(
        chat_interface,
        inputs=[msg, chatbot, max_tokens, model_selector],
        outputs=[chatbot, msg, dag_image]
    )
    send_btn.click(
        chat_interface,
        inputs=[msg, chatbot, max_tokens, model_selector],
        outputs=[chatbot, msg, dag_image]
    )
    clear_btn.click(
        lambda: ([], "", None),
        outputs=[chatbot, msg, dag_image]
    )
if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
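# Runtime dependencies implied by the imports above (a sketch of what requirements.txt would
# need, not the Space's actual file): gradio, spaces, huggingface_hub, transformers,
# llama-cpp-python. Note that llama-cpp-python must be a CUDA-enabled build for
# n_gpu_layers=-1 to actually use the GPU; otherwise it silently falls back to CPU.
# json_processor.py and dag_visualizer.py are local modules that must ship with this file.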