Spaces:
Running
Running
| """ | |
| Modal Job Submission Module | |
| Handles submission of SMOLTRACE evaluation jobs to Modal's serverless compute platform. | |
| """ | |
| import os | |
| import sys | |
| import uuid | |
| from typing import Dict, Optional, List | |
| def submit_modal_job( | |
| model: str, | |
| provider: str, | |
| agent_type: str, | |
| hardware: str, | |
| dataset_name: str, | |
| split: str = "train", | |
| difficulty: str = "all", | |
| parallel_workers: int = 1, | |
| hf_token: Optional[str] = None, | |
| hf_inference_provider: Optional[str] = None, | |
| search_provider: str = "duckduckgo", | |
| enable_tools: Optional[List[str]] = None, | |
| output_format: str = "hub", | |
| output_dir: Optional[str] = None, | |
| enable_otel: bool = True, | |
| enable_gpu_metrics: bool = True, | |
| private: bool = False, | |
| debug: bool = False, | |
| quiet: bool = False, | |
| run_id: Optional[str] = None | |
| ) -> Dict: | |
| """ | |
| Submit an evaluation job to Modal | |
| Args: | |
| model: Model identifier (e.g., "openai/gpt-4") | |
| provider: Provider type ("litellm", "inference", "transformers") | |
| agent_type: Agent type ("tool", "code", "both") | |
| hardware: Hardware type (e.g., "auto", "gpu_a10", "gpu_h200") | |
| dataset_name: HuggingFace dataset for evaluation | |
| split: Dataset split to use | |
| difficulty: Difficulty filter | |
| parallel_workers: Number of parallel workers | |
| hf_token: HuggingFace token | |
| hf_inference_provider: HF Inference provider | |
| search_provider: Search provider for agents | |
| enable_tools: List of tools to enable | |
| output_format: Output format ("hub" or "json") | |
| output_dir: Output directory for JSON format | |
| enable_otel: Enable OpenTelemetry tracing | |
| enable_gpu_metrics: Enable GPU metrics collection | |
| private: Make datasets private | |
| debug: Enable debug mode | |
| quiet: Enable quiet mode | |
| run_id: Optional run ID (auto-generated if not provided) | |
| Returns: | |
| dict: Job submission result with job_id, status, and details | |
| """ | |
| try: | |
| import modal | |
| except ImportError: | |
| return { | |
| "success": False, | |
| "error": "Modal package not installed. Install with: pip install modal", | |
| "job_id": None | |
| } | |
| # Validate Modal credentials | |
| modal_token_id = os.environ.get("MODAL_TOKEN_ID") | |
| modal_token_secret = os.environ.get("MODAL_TOKEN_SECRET") | |
| if not modal_token_id or not modal_token_secret: | |
| return { | |
| "success": False, | |
| "error": "Modal credentials not configured. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in Settings.", | |
| "job_id": None | |
| } | |
| # Generate job ID | |
| job_id = run_id if run_id else f"job_{uuid.uuid4().hex[:8]}" | |
| # Map hardware to Modal GPU types | |
| hardware_map = { | |
| "auto": _auto_select_modal_hardware(provider, model), | |
| "cpu": None, # CPU only | |
| "gpu_t4": "T4", | |
| "gpu_l4": "L4", | |
| "gpu_a10": "A10G", | |
| "gpu_l40s": "L40S", | |
| "gpu_a100": "A100", | |
| "gpu_a100_80gb": "A100-80GB", | |
| "gpu_h100": "H100", | |
| "gpu_h200": "H200", | |
| "gpu_b200": "B200" | |
| } | |
| modal_gpu = hardware_map.get(hardware, "A10G") | |
| # Build environment variables | |
| env_vars = { | |
| "HF_TOKEN": hf_token or os.environ.get("HF_TOKEN", ""), | |
| } | |
| # Add LLM provider API keys from environment | |
| llm_key_names = [ | |
| "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY", | |
| "GEMINI_API_KEY", "COHERE_API_KEY", "MISTRAL_API_KEY", | |
| "TOGETHER_API_KEY", "GROQ_API_KEY", "REPLICATE_API_TOKEN", | |
| "ANYSCALE_API_KEY", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", | |
| "AWS_REGION", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT", | |
| "LITELLM_API_KEY" | |
| ] | |
| for key_name in llm_key_names: | |
| value = os.environ.get(key_name) | |
| if value: | |
| env_vars[key_name] = value | |
| # Build SMOLTRACE command | |
| cmd_parts = ["smoltrace-eval"] | |
| cmd_parts.append(f"--model {model}") | |
| cmd_parts.append(f"--provider {provider}") | |
| if hf_inference_provider: | |
| cmd_parts.append(f"--hf-inference-provider {hf_inference_provider}") | |
| cmd_parts.append(f"--search-provider {search_provider}") | |
| if enable_tools: | |
| cmd_parts.append(f"--enable-tools {','.join(enable_tools)}") | |
| cmd_parts.append(f"--agent-type {agent_type}") | |
| cmd_parts.append(f"--dataset-name {dataset_name}") | |
| cmd_parts.append(f"--split {split}") | |
| if difficulty != "all": | |
| cmd_parts.append(f"--difficulty {difficulty}") | |
| if parallel_workers > 1: | |
| cmd_parts.append(f"--parallel-workers {parallel_workers}") | |
| cmd_parts.append(f"--output-format {output_format}") | |
| if output_dir and output_format == "json": | |
| cmd_parts.append(f"--output-dir {output_dir}") | |
| if enable_otel: | |
| cmd_parts.append("--enable-otel") | |
| if not enable_gpu_metrics: | |
| cmd_parts.append("--disable-gpu-metrics") | |
| if private: | |
| cmd_parts.append("--private") | |
| if debug: | |
| cmd_parts.append("--debug") | |
| if quiet: | |
| cmd_parts.append("--quiet") | |
| cmd_parts.append(f"--run-id {job_id}") | |
| command = " ".join(cmd_parts) | |
| # Create Modal app dynamically | |
| try: | |
| app = modal.App(f"smoltrace-eval-{job_id}") | |
| # Detect current Python version dynamically (must match for serialized=True) | |
| python_version = f"{sys.version_info.major}.{sys.version_info.minor}" | |
| # Define Modal function with appropriate base image | |
| # Note: Must match local Python version when using serialized=True | |
| if modal_gpu: | |
| # Use GPU-optimized image with CUDA for GPU jobs (using latest stable CUDA) | |
| image = modal.Image.from_registry( | |
| "nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04", | |
| add_python=python_version # Dynamically match current environment | |
| ).pip_install([ | |
| "smoltrace", | |
| "ddgs", # DuckDuckGo search | |
| "litellm", | |
| "transformers", | |
| "torch", | |
| "accelerate", # Required for GPU device_map | |
| "bitsandbytes", # For quantization support | |
| "sentencepiece", # For some tokenizers | |
| "protobuf", # For some models | |
| "hf_transfer", # Fast HuggingFace downloads | |
| "nvidia-ml-py" # GPU metrics collection | |
| ]).env({ | |
| # Enable fast downloads and verbose logging | |
| "HF_HUB_ENABLE_HF_TRANSFER": "1", | |
| "TRANSFORMERS_VERBOSITY": "info", | |
| "HF_HUB_VERBOSITY": "info" | |
| }) | |
| else: | |
| # Use lightweight image for CPU jobs | |
| image = modal.Image.debian_slim(python_version=python_version).pip_install([ | |
| "smoltrace", | |
| "ddgs", # DuckDuckGo search | |
| "litellm" | |
| ]) | |
| def run_evaluation(command_to_run: str): | |
| """Run SMOLTRACE evaluation on Modal""" | |
| import subprocess | |
| import sys | |
| import os | |
| print("=" * 80) | |
| print(f"Starting SMOLTRACE evaluation on Modal") | |
| print(f"Command: {command_to_run}") | |
| print(f"Python version: {sys.version}") | |
| # Show GPU info if available | |
| try: | |
| import torch | |
| if torch.cuda.is_available(): | |
| print(f"GPU: {torch.cuda.get_device_name(0)}") | |
| print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") | |
| except: | |
| pass | |
| print("=" * 80) | |
| print("\nNote: Model download may take several minutes for large models (14B = ~28GB)") | |
| print("Downloading and initializing model...\n") | |
| try: | |
| # Run with live output instead of capture_output so we can see progress | |
| result = subprocess.run( | |
| command_to_run, | |
| shell=True, | |
| capture_output=False, # Stream output in real-time | |
| text=True | |
| ) | |
| # Since we're not capturing, create a success message | |
| print("\n" + "=" * 80) | |
| print("EVALUATION COMPLETED") | |
| print(f"Return code: {result.returncode}") | |
| print("=" * 80) | |
| return { | |
| "returncode": result.returncode, | |
| "stdout": "Check Modal logs for full output (streaming mode)", | |
| "stderr": "" | |
| } | |
| except Exception as e: | |
| error_msg = f"Error running evaluation: {str(e)}" | |
| print("\n" + "=" * 80) | |
| print("EVALUATION FAILED") | |
| print(error_msg) | |
| print("=" * 80) | |
| import traceback | |
| traceback.print_exc() | |
| return { | |
| "returncode": -1, | |
| "stdout": "", | |
| "stderr": error_msg | |
| } | |
| # Submit the job using Modal's remote() in a background thread | |
| # Note: spawn() doesn't work well with dynamically created apps | |
| # remote() ensures the job actually executes, threading keeps UI responsive | |
| import threading | |
| # Store result in a shared dict since we're using threading | |
| result_container = {"modal_call_id": None, "started": False} | |
| def run_job_on_modal(): | |
| """Run the Modal job in background thread""" | |
| try: | |
| with app.run(): | |
| # Use remote() instead of spawn() for dynamic apps | |
| # This ensures the function actually executes | |
| function_call = run_evaluation.remote(command) | |
| result_container["started"] = True | |
| print(f"Modal job completed with return code: {function_call.get('returncode', 'unknown')}") | |
| except Exception as e: | |
| print(f"Error running Modal job: {e}") | |
| result_container["error"] = str(e) | |
| # Start the job in a background thread so we don't block the UI | |
| job_thread = threading.Thread(target=run_job_on_modal, daemon=True) | |
| job_thread.start() | |
| # Give Modal a moment to start the job and capture any immediate errors | |
| import time | |
| time.sleep(2) | |
| # Use job_id as the tracking ID since remote() doesn't give us a call_id | |
| modal_call_id = f"modal-{job_id}" | |
| return { | |
| "success": True, | |
| "job_id": job_id, | |
| "modal_call_id": modal_call_id, # Modal's internal function call ID | |
| "platform": "Modal", | |
| "hardware": modal_gpu or "CPU", | |
| "command": command, | |
| "status": "submitted", | |
| "message": f"Job successfully submitted to Modal (hardware: {modal_gpu or 'CPU'})", | |
| "instructions": f""" | |
| ✅ Job submitted successfully! | |
| **Job Details:** | |
| - Run ID: {job_id} | |
| - Modal Call ID: {modal_call_id} | |
| - Hardware: {modal_gpu or "CPU"} | |
| - Platform: Modal (serverless compute) | |
| **What happens next:** | |
| 1. Job starts running on Modal infrastructure | |
| 2. For GPU jobs: Model downloads first (14B models = ~28GB, can take 10-15 min) | |
| 3. SMOLTRACE evaluates your model | |
| 4. Results are automatically pushed to HuggingFace datasets | |
| 5. They will appear in TraceMind leaderboard when complete | |
| **Monitoring**: Check Modal dashboard for real-time logs and progress: | |
| https://modal.com/apps | |
| **Expected Duration**: | |
| - CPU jobs (API models): 2-5 minutes | |
| - GPU jobs (local models): 15-30 minutes (includes model download) | |
| **Cost**: Modal charges per-second usage. Estimated cost: $0.01-1.00 depending on model size and hardware. | |
| """.strip() | |
| } | |
| except Exception as e: | |
| error_msg = str(e) | |
| # Check for common Modal errors | |
| if "MODAL_TOKEN_ID" in error_msg or "authentication" in error_msg.lower(): | |
| return { | |
| "success": False, | |
| "error": "Modal authentication failed. Please verify your MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in Settings.", | |
| "job_id": job_id, | |
| "troubleshooting": """ | |
| **Steps to fix:** | |
| 1. Go to https://modal.com/settings/tokens | |
| 2. Create a new token | |
| 3. Copy Token ID (starts with 'ak-') and Token Secret (starts with 'as-') | |
| 4. Add them to Settings in TraceMind | |
| 5. Try again | |
| """ | |
| } | |
| else: | |
| return { | |
| "success": False, | |
| "error": f"Failed to submit Modal job: {error_msg}", | |
| "job_id": job_id, | |
| "command": command | |
| } | |
| def _auto_select_modal_hardware(provider: str, model: str) -> Optional[str]: | |
| """ | |
| Automatically select Modal hardware based on model and provider. | |
| Memory estimation for agentic workloads: | |
| - Model weights (FP16): ~2GB per 1B params | |
| - KV cache for long contexts: ~1.5-2x model size for agentic tasks | |
| - Inference overhead: ~20-30% additional | |
| - Total: ~4-5GB per 1B params for safe agentic execution | |
| Args: | |
| provider: Provider type | |
| model: Model identifier | |
| Returns: | |
| str: Modal GPU type or None for CPU | |
| """ | |
| # API models don't need GPU | |
| if provider in ["litellm", "inference"]: | |
| return None | |
| # Local models need GPU - select based on model size | |
| # Conservative allocation for agentic tasks (model weights + KV cache + inference overhead) | |
| # Memory estimation: ~4-5GB per 1B params for safe agentic execution | |
| model_lower = model.lower() | |
| # Extract model size using regex to capture the number before 'b' | |
| import re | |
| size_match = re.search(r'(\d+\.?\d*)b', model_lower) | |
| if size_match: | |
| model_size = float(size_match.group(1)) | |
| # Complete coverage from 0.5B to 100B+ with no gaps | |
| if model_size >= 49: | |
| # 49B-100B+: H200 (140GB VRAM) | |
| return "H200" | |
| elif model_size >= 25: | |
| # 25B-48B: A100-80GB (e.g., Gemma-27B, Kimi-48B, 30B, 34B) | |
| return "A100-80GB" | |
| elif model_size >= 13: | |
| # 13B-24B: A100-80GB (e.g., 13B, 14B, 15B, 20B, 22B) | |
| return "A100-80GB" | |
| elif model_size >= 6: | |
| # 6B-12B: L40S 48GB (e.g., 6B, 7B, 8B, 9B, 10B, 11B, 12B) | |
| return "L40S" | |
| elif model_size >= 1: | |
| # 1B-5B: T4 16GB (e.g., 1B, 2B, 3B, 4B, 5B) | |
| return "T4" | |
| else: | |
| # < 1B: T4 16GB | |
| return "T4" | |
| else: | |
| # No size detected in model name - default to L40S (safe middle ground) | |
| return "L40S" | |