import gradio as gr
import spaces  # Import spaces module for ZeroGPU
from huggingface_hub import login
import os
from json_processor import JsonProcessor
from dag_visualizer import DAGVisualizer
import json

# 1) Read Secrets
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise RuntimeError("❌ HUGGINGFACE_TOKEN not detected, please check Space Settings → Secrets")

# 2) Log in so that all subsequent from_pretrained calls have proper permissions
login(hf_token)

from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import warnings

warnings.filterwarnings("ignore")

# Model configurations for GGUF models
MODEL_CONFIGS = {
    "1B": {
        "name": "Dart-llm-model-1B",
        "base_model": "meta-llama/Llama-3.2-1B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama_3.2_1b-lora-qlora-dart-llm_q5_k_m.gguf"
    },
    "3B": {
        "name": "Dart-llm-model-3B",
        "base_model": "meta-llama/Llama-3.2-3B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama_3.2_3b-lora-qlora-dart-llm_q4_k_m.gguf"
    },
    "8B": {
        "name": "Dart-llm-model-8B",
        "base_model": "meta-llama/Llama-3.1-8B",  # For tokenizer
        "gguf_model": "YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf",
        "gguf_file": "llama3.1-8b-lora-qlora-dart-llm_q4_k_m_fp16.gguf"
    }
}

DEFAULT_MODEL = "1B"  # Use the 1B model by default

# Global variables to store model and tokenizer
llm_model = None
tokenizer = None
current_model_config = None
model_loaded = False

# Initialize DAG visualizer
dag_visualizer = DAGVisualizer()


def load_model_and_tokenizer(selected_model=DEFAULT_MODEL):
    """Load tokenizer - executed on CPU."""
    global tokenizer, model_loaded, current_model_config

    if model_loaded and current_model_config == selected_model:
        return

    print(f"🔄 Loading tokenizer for {MODEL_CONFIGS[selected_model]['name']}...")

    # Load tokenizer from the base model
    base_model = MODEL_CONFIGS[selected_model]["base_model"]
    tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        use_fast=False,
        trust_remote_code=True
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    current_model_config = selected_model
    model_loaded = True
    print("✅ Tokenizer loaded successfully!")


@spaces.GPU(duration=60)  # Request GPU for loading the model
def load_gguf_model_on_gpu(selected_model=DEFAULT_MODEL):
    """Load GGUF model using llama-cpp-python."""
    global llm_model, current_model_config

    # If the same model is already loaded, return it
    if llm_model is not None and current_model_config == selected_model:
        return llm_model

    # Clear existing model if switching
    if llm_model is not None:
        print("🗑️ Clearing existing model from GPU...")
        del llm_model
        llm_model = None

    model_config = MODEL_CONFIGS[selected_model]
    print(f"🔄 Loading {model_config['name']} GGUF model...")

    try:
        # Download GGUF model file from the Hugging Face Hub
        model_file = hf_hub_download(
            repo_id=model_config["gguf_model"],
            filename=model_config["gguf_file"],
            cache_dir="./gguf_cache"
        )
        print(f"📦 Downloaded GGUF file: {model_file}")

        # Load GGUF model with llama-cpp-python
        llm_model = Llama(
            model_path=model_file,
            n_ctx=2048,        # Context length
            n_gpu_layers=-1,   # Offload all layers to GPU if available
            verbose=False
        )
        # Record which model is loaded so repeated calls can reuse it
        current_model_config = selected_model
        print(f"✅ {model_config['name']} GGUF model loaded successfully!")
        return llm_model

    except Exception as load_error:
        print(f"❌ GGUF Model loading failed: {load_error}")
        raise load_error
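
# --- Illustrative example (not used by the app) --------------------------------
# A minimal sketch of the kind of task-plan JSON that process_json_in_response()
# below looks for: a top-level "tasks" list that DAGVisualizer can turn into a
# dependency graph. The field names here are hypothetical placeholders; the
# exact schema emitted by the DART-LLM models may differ.
EXAMPLE_TASK_PLAN = {
    "tasks": [
        {
            "task": "task_1",                     # hypothetical task identifier
            "instruction": "move_to_soil_area",   # hypothetical action name
            "dependencies": []                    # tasks this entry waits on
        },
        {
            "task": "task_2",
            "instruction": "excavate_and_unload",
            "dependencies": ["task_1"]            # runs after task_1 completes
        }
    ]
}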
def process_json_in_response(response):
    """Process and format JSON content in the response, and generate DAG visualization."""
    dag_image_path = None
    try:
        # Check if response contains JSON-like content
        if '{' in response and '}' in response:
            processor = JsonProcessor()
            # Try to process the response for JSON content
            processed_json = processor.process_response(response)
            if processed_json:
                # Format the JSON nicely
                formatted_json = json.dumps(processed_json, indent=2, ensure_ascii=False)

                # Generate DAG visualization if the JSON contains tasks
                if "tasks" in processed_json and processed_json["tasks"]:
                    try:
                        dag_image_path = dag_visualizer.create_dag_visualization(
                            processed_json,
                            title="Robot Task Dependency Graph"
                        )
                    except Exception as e:
                        print(f"DAG visualization failed: {e}")

                # Replace the JSON part in the response
                import re
                json_pattern = r'\{.*\}'
                match = re.search(json_pattern, response, re.DOTALL)
                if match:
                    # Replace the matched JSON with the formatted version
                    response = response.replace(match.group(), formatted_json)

        return response, dag_image_path
    except Exception:
        # If processing fails, return the original response
        return response, None


@spaces.GPU(duration=60)  # GPU inference
def generate_response_gpu(prompt, max_tokens=512, selected_model=DEFAULT_MODEL):
    """Generate response using GGUF model - executed on GPU."""
    global llm_model

    # Ensure the requested model is loaded on GPU
    if llm_model is None or current_model_config != selected_model:
        llm_model = load_gguf_model_on_gpu(selected_model)

    if llm_model is None:
        return ("❌ GGUF Model failed to load. Please check the Space logs.", None)

    try:
        formatted_prompt = (
            "### Instruction:\n"
            f"{prompt.strip()}\n\n"
            "### Response:\n"
        )

        # Generate response using llama-cpp-python
        output = llm_model(
            formatted_prompt,
            max_tokens=max_tokens,
            stop=["### Instruction:", "###"],
            echo=False,
            temperature=0.1,
            top_p=0.9,
            repeat_penalty=1.1
        )

        # Extract the generated text
        response = output['choices'][0]['text'].strip()

        # Process JSON if present in the response and generate the DAG
        response, dag_image_path = process_json_in_response(response)

        return (
            response if response else "❌ No response generated. Please try again with a different prompt.",
            dag_image_path
        )

    except Exception as generation_error:
        return (f"❌ Generation Error: {str(generation_error)}", None)


def chat_interface(message, history, max_tokens, selected_model):
    """Chat interface - runs on CPU, calls GPU functions."""
    if not message.strip():
        return history, "", None

    try:
        # Call GPU function to generate a response
        response, dag_image_path = generate_response_gpu(message, max_tokens, selected_model)
        history.append((message, response))
        return history, "", dag_image_path
    except Exception as chat_error:
        error_msg = f"❌ Chat Error: {str(chat_error)}"
        history.append((message, error_msg))
        return history, "", None
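
# --- Quick local check (sketch, commented out) ----------------------------------
# Assuming llama-cpp-python can run on the local hardware, the generation path can
# be exercised without the Gradio UI. This block is illustrative only and is not
# used by the Space; uncomment to try it in a local environment.
#
# if __name__ == "__main__":
#     answer, dag_path = generate_response_gpu(
#         "Deploy Excavator 1 to Soil Area 1", max_tokens=256, selected_model="1B"
#     )
#     print(answer)
#     print("DAG image:", dag_path)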
# GGUF models bundle their tokenizer, so no separate tokenizer loading is needed

# Create Gradio application
with gr.Blocks(
    title="Robot Task Planning - DART-LLM Multi-Model",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px;
        margin: auto;
    }
    """
) as app:
    gr.Markdown("""
    # 🤖 DART-LLM Multi-Model - Robot Task Planning

    Choose from **three GGUF quantized models** specialized for **robot task planning** using QLoRA fine-tuning:

    - **🚀 Dart-llm-model-1B** (Default): Fastest inference, Q5_K_M quantization
    - **⚖️ Dart-llm-model-3B**: Balanced performance, Q4_K_M quantization
    - **🎯 Dart-llm-model-8B**: Best quality output, Q4_K_M quantization

    **GGUF Implementation**: Uses native GGUF format with llama-cpp-python for optimal memory efficiency and GPU acceleration.

    **Capabilities**:
    - Convert natural language robot commands into structured task sequences
    - **NEW: Automatic DAG Visualization** - Generates visual dependency graphs for robot task sequences
    - Support for excavators, dump trucks, and other construction robots

    **GGUF Models**:
    - [YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-1b-lora-qlora-dart-llm-gguf) (Default - Q5_K_M)
    - [YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.2-3b-lora-qlora-dart-llm-gguf) (Q4_K_M)
    - [YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf](https://huggingface.co/YongdongWang/llama-3.1-8b-lora-qlora-dart-llm-gguf) (Q4_K_M)

    ⚡ **Using ZeroGPU**: This Space uses dynamic GPU allocation (Nvidia H200). First generation might take a bit longer.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Task Planning Results",
                height=400,
                show_label=True,
                container=True,
                bubble_full_width=False,
                show_copy_button=True
            )

            msg = gr.Textbox(
                label="Robot Command",
                placeholder="Enter robot task command (e.g., 'Deploy Excavator 1 to Soil Area 1')...",
                lines=2,
                max_lines=5,
                show_label=True,
                container=True
            )

            with gr.Row():
                send_btn = gr.Button("🚀 Generate Tasks", variant="primary", size="sm")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", size="sm")

        with gr.Column(scale=2):
            dag_image = gr.Image(
                label="Task Dependency Graph (DAG)",
                show_label=True,
                container=True,
                height=400,
                interactive=False
            )

        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Settings")

            model_selector = gr.Dropdown(
                choices=[(config["name"], key) for key, config in MODEL_CONFIGS.items()],
                value=DEFAULT_MODEL,
                label="Model Size",
                info="Select model size (1B = fastest, 8B = best quality)",
                interactive=True
            )

            max_tokens = gr.Slider(
                minimum=50,
                maximum=5000,
                value=512,
                step=10,
                label="Max Tokens",
                info="Maximum number of tokens to generate"
            )

            gr.Markdown("""
            ### 📊 Model Status
            - **Hardware**: ZeroGPU (Dynamic Nvidia H200)
            - **Status**: Ready
            - **Note**: First generation allocates GPU resources
            - **Dart-llm-model-1B**: Fastest inference (Default)
            - **Dart-llm-model-3B**: Balanced speed/quality
            - **Dart-llm-model-8B**: Best quality, slower
            """)

    # Example conversations
    gr.Examples(
        examples=[
            "Dump truck 1 goes to the puddle for inspection, after which all robots avoid the puddle.",
            "Drive the Excavator 1 to the obstacle, and perform excavation to clear the obstacle.",
            "Send Excavator 1 and Dump Truck 1 to the soil area; Excavator 1 will excavate and unload, followed by Dump Truck 1 proceeding to the puddle for unloading.",
            "Move Excavator 1 and Dump Truck 1 to soil area 2; Excavator 1 will excavate and unload, then Dump Truck 1 returns to the starting position to unload.",
            "Excavator 1 is guided to the obstacle to excavate and unload to clear the obstacle, then excavator 1 and dump truck 1 are moved to the soil area, and the excavator excavates and unloads. Finally, dump truck 1 unloads the soil into the puddle.",
            "Excavator 1 goes to the obstacle to excavate and unload to clear the obstacle. Once the obstacle is cleared, mobilize all available robots to proceed to the puddle area for inspection.",
        ],
        inputs=msg,
        label="💡 Example Operator Commands"
    )
    # Event handling
    msg.submit(
        chat_interface,
        inputs=[msg, chatbot, max_tokens, model_selector],
        outputs=[chatbot, msg, dag_image]
    )

    send_btn.click(
        chat_interface,
        inputs=[msg, chatbot, max_tokens, model_selector],
        outputs=[chatbot, msg, dag_image]
    )

    clear_btn.click(
        lambda: ([], "", None),
        outputs=[chatbot, msg, dag_image]
    )


if __name__ == "__main__":
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )