kshitijthakkar committed
Commit 4db4e9d · Parent: 7e3ac1d

feat: Add real job submission for HuggingFace Jobs and Modal


Major Features:
- Implemented actual job submission using HuggingFace Jobs API (run_job)
- Added Modal job submission module for serverless GPU compute
- Extended Settings screen with comprehensive API key management

Settings Screen Enhancements:
- Added Modal API credentials (Token ID + Secret)
- Added an LLM provider API keys field (ENV format, parsed as sketched after this list) for all major providers
- Fixed a security issue: existing environment variables are no longer exposed in the UI
- Added validation and test connection for all API keys
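
The ENV-format field accepts one `KEY=value` pair per line, skipping blank lines and `#` comments. A minimal sketch of the parsing, mirroring the `_parse_env_string()` helper added in `screens/settings.py`:

```python
def parse_env_string(env_string: str) -> dict:
    """Parse a KEY=value-per-line string, ignoring blank lines and # comments."""
    result = {}
    for line in (env_string or "").strip().split("\n"):
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blanks and comments
        if "=" in line:
            key, value = line.split("=", 1)  # split on the first '=' only
            result[key.strip()] = value.strip()
    return result

# Example: two keys parsed, the comment line ignored
keys = parse_env_string("OPENAI_API_KEY=sk-...\n# staging only\nGROQ_API_KEY=gsk_...")
assert list(keys) == ["OPENAI_API_KEY", "GROQ_API_KEY"]
```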

HuggingFace Jobs Integration:
- Uses the official run_job() API from huggingface_hub (see the sketch after this list)
- Auto-selects appropriate Docker image (python:3.12 for CPU, pytorch with CUDA for GPU)
- Auto-selects hardware flavor based on model size (cpu-basic, t4-small, a10g-large, a100-large)
- Passes all LLM provider API keys as secrets
- Configurable timeout (default: 1h)
- Returns actual HF job ID for tracking
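
Condensed from `utils/hf_jobs_submission.py` below, a minimal sketch of the submission path (the `smoltrace-eval` argument list is abbreviated; the model name and flavor here are illustrative):

```python
import os
from huggingface_hub import run_job

# Auto-selected elsewhere: cpu-basic for API models; t4-small / a10g-large /
# a100-large for local models, based on the parameter count in the model name.
flavor = "t4-small"
image = "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel"  # python:3.12 for CPU jobs

# The HF token plus any LLM provider keys found in the environment go in as secrets
secrets = {"HF_TOKEN": os.environ["HF_TOKEN"]}  # validated earlier in the real module

job = run_job(
    image=image,
    command=["bash", "-c", "pip install smoltrace ddgs && smoltrace-eval --model Qwen/Qwen2.5-7B-Instruct --provider transformers"],
    secrets=secrets,
    flavor=flavor,
    timeout="1h",  # configurable via the new form field
)
print(job.job_id)  # real HF job ID; the module guards this with hasattr(job, 'job_id')
```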

Modal Integration:
- Configures Modal apps with an appropriate GPU (T4, A10G, A100, H100, H200, B200); see the sketch after this list
- Auto-selects GPU based on model size
- Passes API keys via Modal secrets
- Ready for async execution
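
A condensed sketch of the Modal path from `utils/modal_job_submission.py` (the model name and the `hf_...` token value are illustrative placeholders):

```python
import modal

def auto_select_gpu(provider: str, model: str):
    """Mirrors _auto_select_modal_hardware(): API models run on CPU, local models by size."""
    if provider in ["litellm", "inference"]:
        return None  # CPU only
    name = model.lower()
    if "70b" in name or "65b" in name:
        return "A100-80GB"
    return "A10G"  # default for small/medium/unknown sizes

app = modal.App("smoltrace-eval-job_1234abcd")
image = modal.Image.debian_slim().pip_install(["smoltrace[otel,gpu]", "litellm", "transformers", "torch"])

@app.function(
    image=image,
    gpu=auto_select_gpu("transformers", "meta-llama/Llama-3.1-8B"),
    secrets=[modal.Secret.from_dict({"HF_TOKEN": "hf_..."})],  # API keys travel as a Modal secret
    timeout=3600,
)
def run_evaluation():
    import subprocess
    # The real module runs the full generated smoltrace-eval command string here
    return subprocess.run("smoltrace-eval --help", shell=True, capture_output=True, text=True).returncode
```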

New Evaluation Form:
- Added timeout configuration field
- Now actually submits jobs instead of only previewing the configuration
- Displays detailed success/error messages with job IDs (result contract sketched after this list)
- Shows platform-specific instructions
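
Both submitters return a plain result dict that the form handler in `app.py` branches on. A minimal sketch of that contract (the rendering helper is hypothetical; the real handler builds HTML cards):

```python
def summarize_submission(result: dict) -> str:
    """Render a one-line success/error summary from a submitter's result dict."""
    if not result.get("success"):
        return f"❌ Job Submission Failed: {result.get('error', 'Unknown error')}"
    # Optional extras rendered when present: result.get("command"),
    # result.get("instructions"), result.get("job_yaml")
    return (f"✅ {result.get('platform', '?')} job {result.get('job_id', 'unknown')} "
            f"[{result.get('status', 'submitted').upper()}] {result.get('message', '')}")

print(summarize_submission({
    "success": True, "platform": "HuggingFace Jobs", "job_id": "job_1234abcd",
    "status": "submitted", "message": "Job successfully submitted to HuggingFace Jobs (flavor: t4-small)",
}))
```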

Security:
- API keys stored in session only (not persisted)
- Sensitive fields use password type
- Environment variables not exposed in UI

app.py CHANGED
@@ -2417,6 +2417,14 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
                     placeholder="UUID will be auto-generated"
                 )
 
+                with gr.Row():
+                    eval_timeout = gr.Textbox(
+                        value="1h",
+                        label="Job Timeout",
+                        info="Maximum job duration (e.g., '30m', '1h', '2h')",
+                        placeholder="1h"
+                    )
+
                 gr.Markdown("---")
 
                 # Cost Estimate Section
@@ -2609,60 +2617,87 @@ No historical data available for **{model}**.
         # Test Configuration
         dataset_name, split, difficulty, parallel_workers,
         # Output & Monitoring
-        output_format, output_dir, enable_otel, enable_gpu_metrics, private, debug, quiet, run_id
+        output_format, output_dir, enable_otel, enable_gpu_metrics, private, debug, quiet, run_id, timeout
     ):
         """Submit a new evaluation job with comprehensive configuration"""
-        import uuid
-        import datetime
+        from utils.modal_job_submission import submit_modal_job
+        from utils.hf_jobs_submission import submit_hf_job
+
+        # Submit job based on infrastructure provider
+        if infra_provider == "Modal":
+            result = submit_modal_job(
+                model=model,
+                provider=provider,
+                agent_type=agent_type,
+                hardware=hardware,
+                dataset_name=dataset_name,
+                split=split,
+                difficulty=difficulty,
+                parallel_workers=parallel_workers,
+                hf_token=hf_token,
+                hf_inference_provider=hf_inference_provider,
+                search_provider=search_provider,
+                enable_tools=enable_tools,
+                output_format=output_format,
+                output_dir=output_dir,
+                enable_otel=enable_otel,
+                enable_gpu_metrics=enable_gpu_metrics,
+                private=private,
+                debug=debug,
+                quiet=quiet,
+                run_id=run_id
+            )
+        else:  # HuggingFace Jobs
+            result = submit_hf_job(
+                model=model,
+                provider=provider,
+                agent_type=agent_type,
+                hardware=hardware,
+                dataset_name=dataset_name,
+                split=split,
+                difficulty=difficulty,
+                parallel_workers=parallel_workers,
+                hf_token=hf_token,
+                hf_inference_provider=hf_inference_provider,
+                search_provider=search_provider,
+                enable_tools=enable_tools,
+                output_format=output_format,
+                output_dir=output_dir,
+                enable_otel=enable_otel,
+                enable_gpu_metrics=enable_gpu_metrics,
+                private=private,
+                debug=debug,
+                quiet=quiet,
+                run_id=run_id,
+                timeout=timeout or "1h"
+            )
+
+        # Handle submission result
+        if not result.get("success"):
+            # Error occurred
+            error_html = f"""
+            <div style="background: linear-gradient(135deg, #eb3349 0%, #f45c43 100%);
+                        padding: 25px; border-radius: 10px; color: white; margin: 15px 0;">
+                <h2 style="margin-top: 0;">❌ Job Submission Failed</h2>
+                <div style="background: rgba(255,255,255,0.15); padding: 15px; border-radius: 5px; margin: 15px 0;">
+                    <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 5px;">Error</div>
+                    <div style="font-size: 1.0em;">{result.get('error', 'Unknown error')}</div>
+                </div>
+            </div>
+            """
+            return gr.update(value=error_html, visible=True)
 
-        # Generate job ID and timestamp
-        job_id = f"job_{uuid.uuid4().hex[:8]}" if not run_id else run_id
-        timestamp = datetime.datetime.now().isoformat()
+        # Success - build success message
+        job_id = result.get('job_id', 'unknown')
+        job_platform = result.get('platform', infra_provider)
+        job_hardware = result.get('hardware', hardware)
+        job_status = result.get('status', 'submitted')
+        job_message = result.get('message', '')
 
         # Estimate cost
         cost_est = estimate_job_cost_with_mcp_fallback(model, hardware)
         has_cost_estimate = cost_est is not None
 
-        # Build CLI command preview
-        cli_command_parts = ["smoltrace-eval"]
-        cli_command_parts.append(f"--model {model}")
-        cli_command_parts.append(f"--provider {provider}")
-        if hf_inference_provider:
-            cli_command_parts.append(f"--hf-inference-provider {hf_inference_provider}")
-        cli_command_parts.append(f"--search-provider {search_provider}")
-        if enable_tools:
-            cli_command_parts.append(f"--enable-tools {','.join(enable_tools)}")
-        if hf_token:
-            cli_command_parts.append("--hf-token $HF_TOKEN")
-
-        cli_command_parts.append(f"--agent-type {agent_type}")
-
-        cli_command_parts.append(f"--dataset-name {dataset_name}")
-        cli_command_parts.append(f"--split {split}")
-        if difficulty != "all":
-            cli_command_parts.append(f"--difficulty {difficulty}")
-        if parallel_workers > 1:
-            cli_command_parts.append(f"--parallel-workers {parallel_workers}")
-
-        cli_command_parts.append(f"--output-format {output_format}")
-        if output_dir and output_format == "json":
-            cli_command_parts.append(f"--output-dir {output_dir}")
-        if enable_otel:
-            cli_command_parts.append("--enable-otel")
-        if not enable_gpu_metrics:
-            cli_command_parts.append("--disable-gpu-metrics")
-        if private:
-            cli_command_parts.append("--private")
-        if debug:
-            cli_command_parts.append("--debug")
-        if quiet:
-            cli_command_parts.append("--quiet")
-        if run_id:
-            cli_command_parts.append(f"--run-id {run_id}")
-
-        cli_command = " \\\n    ".join(cli_command_parts)
-
-        # Build success message
         cost_info_html = ""
         if has_cost_estimate:
             source_label = "📊 Historical" if cost_est['source'] == 'historical' else "🤖 MCP Estimate"
@@ -2690,10 +2725,39 @@ No historical data available for **{model}**.
         """
         duration_info = "Estimated completion: Will be tracked in leaderboard once job completes"
 
+        # Add job-specific details
+        job_details_html = ""
+        if result.get('job_yaml'):
+            job_details_html += f"""
+        <div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.15); border-radius: 5px;">
+            <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 10px;">📄 Job Configuration (job.yaml)</div>
+            <div style="font-family: monospace; font-size: 0.7em; background: rgba(0,0,0,0.2); padding: 10px; border-radius: 3px; overflow-x: auto; max-height: 300px; overflow-y: auto;">
+                {result['job_yaml']}
+            </div>
+        </div>
+        """
+
+        if result.get('command'):
+            job_details_html += f"""
+        <div style="margin-top: 15px; padding: 15px; background: rgba(255,255,255,0.15); border-radius: 5px;">
+            <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 10px;">📋 SMOLTRACE Command</div>
+            <div style="font-family: monospace; font-size: 0.75em; background: rgba(0,0,0,0.2); padding: 10px; border-radius: 3px; overflow-x: auto;">
+                {result['command']}
+            </div>
+        </div>
+        """
+
+        if result.get('instructions'):
+            job_details_html += f"""
+        <div style="margin-top: 15px; padding: 15px; background: rgba(255,200,100,0.2); border-radius: 5px; border-left: 4px solid rgba(255,255,255,0.5);">
+            <div style="font-size: 0.85em; white-space: pre-wrap;">{result['instructions']}</div>
+        </div>
+        """
+
         success_html = f"""
         <div style="background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
                     padding: 25px; border-radius: 10px; color: white; margin: 15px 0;">
-            <h2 style="margin-top: 0;">✅ Evaluation Job Submitted!</h2>
+            <h2 style="margin-top: 0;">✅ Evaluation Job Configured!</h2>
 
            <div style="background: rgba(255,255,255,0.15); padding: 15px; border-radius: 5px; margin: 15px 0;">
                <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 5px;">Job ID</div>
@@ -2702,8 +2766,8 @@ No historical data available for **{model}**.
 
            <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px; margin-top: 15px;">
                <div>
-                   <div style="font-size: 0.9em; opacity: 0.9;">Infrastructure</div>
-                   <div style="font-weight: bold;">{infra_provider}</div>
+                   <div style="font-size: 0.9em; opacity: 0.9;">Platform</div>
+                   <div style="font-weight: bold;">{job_platform}</div>
                </div>
                <div>
                    <div style="font-size: 0.9em; opacity: 0.9;">Model</div>
@@ -2711,22 +2775,27 @@ No historical data available for **{model}**.
                </div>
                <div>
                    <div style="font-size: 0.9em; opacity: 0.9;">Hardware</div>
-                   <div style="font-weight: bold;">{hardware.upper()}</div>
+                   <div style="font-weight: bold;">{job_hardware}</div>
                </div>
                <div>
                    <div style="font-size: 0.9em; opacity: 0.9;">Agent Type</div>
                    <div style="font-weight: bold;">{agent_type}</div>
                </div>
+               <div>
+                   <div style="font-size: 0.9em; opacity: 0.9;">Status</div>
+                   <div style="font-weight: bold;">{job_status.upper()}</div>
+               </div>
                {cost_info_html}
            </div>
 
-           <div style="margin-top: 20px; padding: 15px; background: rgba(255,255,255,0.15); border-radius: 5px;">
-               <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 10px;">📋 Command Preview</div>
-               <div style="font-family: monospace; font-size: 0.8em; background: rgba(0,0,0,0.2); padding: 10px; border-radius: 3px; overflow-x: auto;">
-                   {cli_command}
+           <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 5px;">
+               <div style="font-size: 0.9em;">
+                   ℹ️ {job_message}
                </div>
            </div>
 
+           {job_details_html}
+
            <div style="margin-top: 15px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 5px;">
                <div style="font-size: 0.9em;">
                    ⏱️ {duration_info}
@@ -3544,7 +3613,7 @@ Result: {result}
             # Test Configuration
             eval_dataset_name, eval_split, eval_difficulty, eval_parallel_workers,
             # Output & Monitoring
-            eval_output_format, eval_output_dir, eval_enable_otel, eval_enable_gpu_metrics, eval_private, eval_debug, eval_quiet, eval_run_id
+            eval_output_format, eval_output_dir, eval_enable_otel, eval_enable_gpu_metrics, eval_private, eval_debug, eval_quiet, eval_run_id, eval_timeout
         ],
         outputs=[eval_success_message]
     )
screens/settings.py CHANGED
@@ -1,12 +1,41 @@
 """
 Settings Screen for TraceMind-AI
-Allows users to configure API keys for Gemini and HuggingFace
+Allows users to configure API keys for Gemini, HuggingFace, Modal, and LLM providers
 """
 
 import gradio as gr
 import os
 
 
+# Note: Removed _get_llm_keys_as_env_string() to prevent exposing environment variables
+# in the UI for security reasons. Users should explicitly enter keys needed for jobs.
+
+
+def _parse_env_string(env_string):
+    """
+    Parse ENV-formatted string into a dictionary
+
+    Args:
+        env_string: Multi-line string in KEY=value format
+
+    Returns:
+        dict: Parsed key-value pairs
+    """
+    result = {}
+    if not env_string:
+        return result
+
+    for line in env_string.strip().split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        if "=" in line:
+            key, value = line.split("=", 1)
+            result[key.strip()] = value.strip()
+
+    return result
+
+
 def create_settings_screen():
     """
     Create the settings screen for API key configuration
@@ -58,6 +87,62 @@ def create_settings_screen():
             with gr.Column(scale=1):
                 hf_status = gr.Markdown("⚪ Not configured")
 
+        # Modal API Key
+        with gr.Row():
+            with gr.Column(scale=4):
+                modal_api_key = gr.Textbox(
+                    label="Modal API Key (Optional)",
+                    placeholder="Enter your Modal API key (starts with 'ak-...')",
+                    type="password",
+                    value=os.environ.get("MODAL_TOKEN_ID", ""),
+                    info="Get your key at: https://modal.com/settings/tokens"
+                )
+            with gr.Column(scale=1):
+                modal_status = gr.Markdown("⚪ Not configured")
+
+        # Modal API Secret
+        with gr.Row():
+            with gr.Column(scale=4):
+                modal_api_secret = gr.Textbox(
+                    label="Modal API Secret (Optional)",
+                    placeholder="Enter your Modal API secret (starts with 'as-...')",
+                    type="password",
+                    value=os.environ.get("MODAL_TOKEN_SECRET", ""),
+                    info="Required if using Modal for job execution"
+                )
+            with gr.Column(scale=1):
+                modal_secret_status = gr.Markdown("⚪ Not configured")
+
+        # LLM Provider API Keys (Multi-line for convenience)
+        gr.Markdown("""
+        ### LLM Provider API Keys (Optional)
+
+        Paste your API keys in ENV format below. These are needed for running evaluations with API-based models.
+        """)
+
+        llm_api_keys = gr.Textbox(
+            label="LLM Provider API Keys",
+            placeholder="""OPENAI_API_KEY=sk-...
+ANTHROPIC_API_KEY=sk-ant-...
+GOOGLE_API_KEY=AIza...
+GEMINI_API_KEY=AIza...
+COHERE_API_KEY=...
+MISTRAL_API_KEY=...
+TOGETHER_API_KEY=...
+GROQ_API_KEY=gsk_...
+REPLICATE_API_TOKEN=r8_...
+ANYSCALE_API_KEY=...
+AWS_ACCESS_KEY_ID=...
+AWS_SECRET_ACCESS_KEY=...
+AWS_REGION=us-west-2
+AZURE_OPENAI_API_KEY=...
+AZURE_OPENAI_ENDPOINT=https://...
+LITELLM_API_KEY=...""",
+            lines=10,
+            value="",  # Don't expose existing env vars
+            info="Enter one key=value per line. These will be passed to evaluation jobs."
+        )
+
         # Save button
         with gr.Row():
             save_btn = gr.Button("💾 Save API Keys", variant="primary")
@@ -89,6 +174,34 @@ def create_settings_screen():
         5. Create and copy the token (starts with `hf_...`)
 
         **Note**: Read-only access is sufficient for viewing datasets
+
+        ---
+
+        ### Modal API Credentials (Optional)
+
+        1. Go to [Modal Settings](https://modal.com/settings/tokens)
+        2. Click "Create new token"
+        3. Copy both:
+           - Token ID (starts with `ak-...`)
+           - Token Secret (starts with `as-...`)
+
+        **Why Modal?** Run evaluation jobs on serverless GPU compute with per-second billing.
+
+        ---
+
+        ### LLM Provider API Keys (Optional)
+
+        These keys enable running evaluations with different model providers:
+
+        - **OpenAI**: [platform.openai.com/api-keys](https://platform.openai.com/api-keys)
+        - **Anthropic**: [console.anthropic.com/settings/keys](https://console.anthropic.com/settings/keys)
+        - **Google (Vertex AI)**: [Google Cloud Console](https://console.cloud.google.com/)
+        - **Cohere**: [dashboard.cohere.ai/api-keys](https://dashboard.cohere.ai/api-keys)
+        - **Mistral**: [console.mistral.ai/api-keys](https://console.mistral.ai/api-keys)
+        - **Together AI**: [api.together.xyz/settings/api-keys](https://api.together.xyz/settings/api-keys)
+        - **Groq**: [console.groq.com/keys](https://console.groq.com/keys)
+
+        Copy and paste them in the `KEY=value` format.
         """)
 
     with gr.Accordion("🔒 Privacy & Security", open=False):
@@ -119,7 +232,7 @@ def create_settings_screen():
         """)
 
     # Define save functionality
-    def save_api_keys(gemini_key, hf_key):
+    def save_api_keys(gemini_key, hf_key, modal_key, modal_secret, llm_keys_text):
         """Save API keys to session"""
         messages = []
 
@@ -133,7 +246,6 @@ def create_settings_screen():
             messages.append("⚠️ Invalid Gemini API key format (should start with 'AIza')")
             gemini_status_text = "❌ Invalid format"
         else:
-            messages.append("⚠️ Gemini API key not provided")
             gemini_status_text = "⚪ Not configured"
 
         # Validate and save HuggingFace token
@@ -146,15 +258,47 @@ def create_settings_screen():
             messages.append("⚠️ Invalid HuggingFace token format (should start with 'hf_')")
             hf_status_text = "❌ Invalid format"
         else:
-            messages.append("⚠️ HuggingFace token not provided")
            hf_status_text = "⚪ Not configured"
 
-        status_msg = "\n\n".join(messages)
-        status_msg += "\n\n**Note**: Keys are saved for this session only and will be used for MCP server calls."
+        # Validate and save Modal API key
+        if modal_key and modal_key.strip():
+            if modal_key.startswith("ak-"):
+                os.environ["MODAL_TOKEN_ID"] = modal_key.strip()
+                messages.append("✅ Modal API key saved")
+                modal_status_text = "✅ Configured"
+            else:
+                messages.append("⚠️ Invalid Modal API key format (should start with 'ak-')")
+                modal_status_text = "❌ Invalid format"
+        else:
+            modal_status_text = "⚪ Not configured"
+
+        # Validate and save Modal API secret
+        if modal_secret and modal_secret.strip():
+            if modal_secret.startswith("as-"):
+                os.environ["MODAL_TOKEN_SECRET"] = modal_secret.strip()
+                messages.append("✅ Modal API secret saved")
+                modal_secret_status_text = "✅ Configured"
+            else:
+                messages.append("⚠️ Invalid Modal API secret format (should start with 'as-')")
+                modal_secret_status_text = "❌ Invalid format"
+        else:
+            modal_secret_status_text = "⚪ Not configured"
+
+        # Parse and save LLM provider API keys
+        llm_keys_count = 0
+        if llm_keys_text and llm_keys_text.strip():
+            parsed_keys = _parse_env_string(llm_keys_text)
+            for key, value in parsed_keys.items():
+                os.environ[key] = value
+                llm_keys_count += 1
+            messages.append(f"✅ {llm_keys_count} LLM provider API key(s) saved")
+
+        status_msg = "\n\n".join(messages) if messages else "No changes made"
+        status_msg += "\n\n**Note**: Keys are saved for this session only and will be used for evaluation jobs."
 
-        return status_msg, gemini_status_text, hf_status_text
+        return status_msg, gemini_status_text, hf_status_text, modal_status_text, modal_secret_status_text
 
-    def test_api_keys(gemini_key, hf_key):
+    def test_api_keys(gemini_key, hf_key, modal_key, modal_secret, llm_keys_text):
         """Test API key connections"""
         results = []
 
@@ -164,7 +308,7 @@ def create_settings_screen():
                 import google.generativeai as genai
                 genai.configure(api_key=gemini_key.strip())
                 # Try to list models as a test
-                models = genai.list_models()
+                models = list(genai.list_models())
                 results.append("✅ **Gemini API**: Connection successful!")
             except Exception as e:
                 results.append(f"❌ **Gemini API**: Connection failed - {str(e)}")
@@ -184,19 +328,39 @@ def create_settings_screen():
         else:
             results.append("⚠️ **HuggingFace**: No token provided")
 
+        # Test Modal API
+        if modal_key and modal_key.strip() and modal_secret and modal_secret.strip():
+            try:
+                import modal
+                # Modal validates credentials on first use, not at import
+                # We'll just validate format here
+                if modal_key.startswith("ak-") and modal_secret.startswith("as-"):
+                    results.append("✅ **Modal**: Credentials format valid (will be verified on first job submission)")
+                else:
+                    results.append("❌ **Modal**: Invalid credential format")
+            except Exception as e:
+                results.append(f"⚠️ **Modal**: {str(e)}")
+        elif modal_key or modal_secret:
+            results.append("⚠️ **Modal**: Both API key and secret required")
+
+        # Note about LLM provider keys
+        if llm_keys_text and llm_keys_text.strip():
+            parsed_keys = _parse_env_string(llm_keys_text)
+            results.append(f"ℹ️ **LLM Providers**: {len(parsed_keys)} key(s) configured (will be validated when used)")
+
         return "\n\n".join(results)
 
     # Wire up button events (api_name=False to prevent API key exposure)
     save_btn.click(
         fn=save_api_keys,
-        inputs=[gemini_api_key, hf_token],
-        outputs=[status_message, gemini_status, hf_status],
+        inputs=[gemini_api_key, hf_token, modal_api_key, modal_api_secret, llm_api_keys],
+        outputs=[status_message, gemini_status, hf_status, modal_status, modal_secret_status],
         api_name=False  # IMPORTANT: Prevents API key exposure via Gradio API
     )
 
    test_btn.click(
        fn=test_api_keys,
-        inputs=[gemini_api_key, hf_token],
+        inputs=[gemini_api_key, hf_token, modal_api_key, modal_api_secret, llm_api_keys],
        outputs=[status_message],
        api_name=False  # IMPORTANT: Prevents API key exposure via Gradio API
    )
utils/hf_jobs_submission.py ADDED
@@ -0,0 +1,264 @@
+"""
+HuggingFace Jobs Submission Module
+
+Handles submission of SMOLTRACE evaluation jobs to HuggingFace Jobs platform.
+Uses the official HuggingFace Jobs API: `huggingface_hub.run_job()`
+"""
+
+import os
+import uuid
+from typing import Dict, Optional, List
+
+
+def submit_hf_job(
+    model: str,
+    provider: str,
+    agent_type: str,
+    hardware: str,
+    dataset_name: str,
+    split: str = "train",
+    difficulty: str = "all",
+    parallel_workers: int = 1,
+    hf_token: Optional[str] = None,
+    hf_inference_provider: Optional[str] = None,
+    search_provider: str = "duckduckgo",
+    enable_tools: Optional[List[str]] = None,
+    output_format: str = "hub",
+    output_dir: Optional[str] = None,
+    enable_otel: bool = True,
+    enable_gpu_metrics: bool = True,
+    private: bool = False,
+    debug: bool = False,
+    quiet: bool = False,
+    run_id: Optional[str] = None,
+    timeout: str = "1h"
+) -> Dict:
+    """
+    Submit an evaluation job to HuggingFace Jobs using the run_job API
+
+    Args:
+        model: Model identifier (e.g., "openai/gpt-4")
+        provider: Provider type ("litellm", "inference", "transformers")
+        agent_type: Agent type ("tool", "code", "both")
+        hardware: Hardware type (e.g., "auto", "cpu-basic", "t4-small", "a10g-small")
+        dataset_name: HuggingFace dataset for evaluation
+        split: Dataset split to use
+        difficulty: Difficulty filter
+        parallel_workers: Number of parallel workers
+        hf_token: HuggingFace token
+        hf_inference_provider: HF Inference provider
+        search_provider: Search provider for agents
+        enable_tools: List of tools to enable
+        output_format: Output format ("hub" or "json")
+        output_dir: Output directory for JSON format
+        enable_otel: Enable OpenTelemetry tracing
+        enable_gpu_metrics: Enable GPU metrics collection
+        private: Make datasets private
+        debug: Enable debug mode
+        quiet: Enable quiet mode
+        run_id: Optional run ID (auto-generated if not provided)
+        timeout: Job timeout (default: "1h")
+
+    Returns:
+        dict: Job submission result with job_id, status, and details
+    """
+    try:
+        from huggingface_hub import run_job
+    except ImportError:
+        return {
+            "success": False,
+            "error": "huggingface_hub package not installed or outdated. Install with: pip install -U huggingface_hub",
+            "job_id": None
+        }
+
+    # Validate HF token
+    token = hf_token or os.environ.get("HF_TOKEN")
+    if not token:
+        return {
+            "success": False,
+            "error": "HuggingFace token not configured. Please set HF_TOKEN in Settings.",
+            "job_id": None
+        }
+
+    # Generate job ID
+    job_id = run_id if run_id else f"job_{uuid.uuid4().hex[:8]}"
+
+    # Map hardware to HF Jobs flavor
+    if hardware == "auto":
+        flavor = _auto_select_hf_hardware(provider, model)
+    else:
+        flavor = hardware
+
+    # Determine if this is a GPU job
+    is_gpu_job = flavor not in ["cpu-basic", "cpu-upgrade"]
+
+    # Select appropriate Docker image
+    if is_gpu_job:
+        # GPU jobs use PyTorch with CUDA
+        image = "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel"
+        pip_packages = "smoltrace ddgs smoltrace[gpu]"
+    else:
+        # CPU jobs use standard Python
+        image = "python:3.12"
+        pip_packages = "smoltrace ddgs"
+
+    # Build secrets dictionary
+    secrets = {
+        "HF_TOKEN": token
+    }
+
+    # Add LLM provider API keys from environment
+    llm_key_names = [
+        "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY",
+        "GEMINI_API_KEY", "COHERE_API_KEY", "MISTRAL_API_KEY",
+        "TOGETHER_API_KEY", "GROQ_API_KEY", "REPLICATE_API_TOKEN",
+        "ANYSCALE_API_KEY", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
+        "AWS_REGION", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT",
+        "LITELLM_API_KEY"
+    ]
+
+    for key_name in llm_key_names:
+        value = os.environ.get(key_name)
+        if value:
+            secrets[key_name] = value
+
+    # Build SMOLTRACE command
+    cmd_parts = ["smoltrace-eval"]
+    cmd_parts.append(f"--model {model}")
+    cmd_parts.append(f"--provider {provider}")
+    if hf_inference_provider:
+        cmd_parts.append(f"--hf-inference-provider {hf_inference_provider}")
+    cmd_parts.append(f"--search-provider {search_provider}")
+    if enable_tools:
+        cmd_parts.append(f"--enable-tools {','.join(enable_tools)}")
+    cmd_parts.append(f"--agent-type {agent_type}")
+    cmd_parts.append(f"--dataset-name {dataset_name}")
+    cmd_parts.append(f"--split {split}")
+    if difficulty != "all":
+        cmd_parts.append(f"--difficulty {difficulty}")
+    if parallel_workers > 1:
+        cmd_parts.append(f"--parallel-workers {parallel_workers}")
+    cmd_parts.append(f"--output-format {output_format}")
+    if output_dir and output_format == "json":
+        cmd_parts.append(f"--output-dir {output_dir}")
+    if enable_otel:
+        cmd_parts.append("--enable-otel")
+    if not enable_gpu_metrics:
+        cmd_parts.append("--disable-gpu-metrics")
+    if private:
+        cmd_parts.append("--private")
+    if debug:
+        cmd_parts.append("--debug")
+    if quiet:
+        cmd_parts.append("--quiet")
+    cmd_parts.append(f"--run-id {job_id}")
+
+    smoltrace_command = " ".join(cmd_parts)
+
+    # Build full command with pip install
+    full_command = f"pip install {pip_packages} && {smoltrace_command}"
+
+    # Submit job using HuggingFace Jobs API
+    try:
+        job = run_job(
+            image=image,
+            command=["bash", "-c", full_command],
+            secrets=secrets,
+            flavor=flavor,
+            timeout=timeout
+        )
+
+        return {
+            "success": True,
+            "job_id": job_id,
+            "hf_job_id": job.job_id if hasattr(job, 'job_id') else str(job),
+            "platform": "HuggingFace Jobs",
+            "hardware": flavor,
+            "image": image,
+            "command": smoltrace_command,
+            "status": "submitted",
+            "message": f"Job successfully submitted to HuggingFace Jobs (flavor: {flavor})",
+            "instructions": f"""
+✅ Job submitted successfully!
+
+**Job Details:**
+- Flavor: {flavor}
+- Image: {image}
+- Timeout: {timeout}
+
+**Monitor your job:**
+- View job status: https://huggingface.co/jobs
+- HF Job ID: {job.job_id if hasattr(job, 'job_id') else 'check dashboard'}
+
+**What happens next:**
+1. Job starts running on HuggingFace infrastructure
+2. SMOLTRACE evaluates your model
+3. Results are automatically pushed to HuggingFace datasets
+4. They will appear in TraceMind leaderboard when complete
+""".strip()
+        }
+
+    except Exception as e:
+        return {
+            "success": False,
+            "error": f"Failed to submit job to HuggingFace: {str(e)}",
+            "job_id": job_id,
+            "command": smoltrace_command,
+            "debug_info": {
+                "image": image,
+                "flavor": flavor,
+                "timeout": timeout,
+                "secrets_configured": list(secrets.keys())
+            }
+        }
+
+
+def _auto_select_hf_hardware(provider: str, model: str) -> str:
+    """
+    Automatically select HuggingFace Jobs hardware based on model and provider
+
+    Args:
+        provider: Provider type
+        model: Model identifier
+
+    Returns:
+        str: HF Jobs flavor
+    """
+    # API models only need CPU
+    if provider in ["litellm", "inference"]:
+        return "cpu-basic"
+
+    # Local models need GPU - select based on model size
+    model_lower = model.lower()
+
+    if "70b" in model_lower or "65b" in model_lower:
+        # Large models need high-end GPU
+        return "a100-large"
+    elif "13b" in model_lower or "34b" in model_lower:
+        # Medium models work on A10
+        return "a10g-large"
+    elif "7b" in model_lower or "8b" in model_lower or "4b" in model_lower:
+        # Small models efficient on T4 or A10
+        return "t4-small"  # More cost-effective for small models
+    else:
+        # Default to T4 for unknown sizes
+        return "t4-small"
+
+
+def check_job_status(job_id: str, hf_token: Optional[str] = None) -> Dict:
+    """
+    Check the status of a HuggingFace Job
+
+    Args:
+        job_id: Job ID to check
+        hf_token: HuggingFace token (optional, uses env if not provided)
+
+    Returns:
+        dict: Job status information
+    """
+    # Placeholder for when HF Jobs API becomes available
+    return {
+        "job_id": job_id,
+        "status": "unknown",
+        "message": "HuggingFace Jobs status API not yet available programmatically"
+    }
utils/modal_job_submission.py ADDED
@@ -0,0 +1,234 @@
+"""
+Modal Job Submission Module
+
+Handles submission of SMOLTRACE evaluation jobs to Modal's serverless compute platform.
+"""
+
+import os
+import uuid
+from typing import Dict, Optional, List
+
+
+def submit_modal_job(
+    model: str,
+    provider: str,
+    agent_type: str,
+    hardware: str,
+    dataset_name: str,
+    split: str = "train",
+    difficulty: str = "all",
+    parallel_workers: int = 1,
+    hf_token: Optional[str] = None,
+    hf_inference_provider: Optional[str] = None,
+    search_provider: str = "duckduckgo",
+    enable_tools: Optional[List[str]] = None,
+    output_format: str = "hub",
+    output_dir: Optional[str] = None,
+    enable_otel: bool = True,
+    enable_gpu_metrics: bool = True,
+    private: bool = False,
+    debug: bool = False,
+    quiet: bool = False,
+    run_id: Optional[str] = None
+) -> Dict:
+    """
+    Submit an evaluation job to Modal
+
+    Args:
+        model: Model identifier (e.g., "openai/gpt-4")
+        provider: Provider type ("litellm", "inference", "transformers")
+        agent_type: Agent type ("tool", "code", "both")
+        hardware: Hardware type (e.g., "auto", "gpu_a10", "gpu_h200")
+        dataset_name: HuggingFace dataset for evaluation
+        split: Dataset split to use
+        difficulty: Difficulty filter
+        parallel_workers: Number of parallel workers
+        hf_token: HuggingFace token
+        hf_inference_provider: HF Inference provider
+        search_provider: Search provider for agents
+        enable_tools: List of tools to enable
+        output_format: Output format ("hub" or "json")
+        output_dir: Output directory for JSON format
+        enable_otel: Enable OpenTelemetry tracing
+        enable_gpu_metrics: Enable GPU metrics collection
+        private: Make datasets private
+        debug: Enable debug mode
+        quiet: Enable quiet mode
+        run_id: Optional run ID (auto-generated if not provided)
+
+    Returns:
+        dict: Job submission result with job_id, status, and details
+    """
+    try:
+        import modal
+    except ImportError:
+        return {
+            "success": False,
+            "error": "Modal package not installed. Install with: pip install modal",
+            "job_id": None
+        }
+
+    # Validate Modal credentials
+    modal_token_id = os.environ.get("MODAL_TOKEN_ID")
+    modal_token_secret = os.environ.get("MODAL_TOKEN_SECRET")
+
+    if not modal_token_id or not modal_token_secret:
+        return {
+            "success": False,
+            "error": "Modal credentials not configured. Please set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET in Settings.",
+            "job_id": None
+        }
+
+    # Generate job ID
+    job_id = run_id if run_id else f"job_{uuid.uuid4().hex[:8]}"
+
+    # Map hardware to Modal GPU types
+    hardware_map = {
+        "auto": _auto_select_modal_hardware(provider, model),
+        "cpu": None,  # CPU only
+        "gpu_t4": "T4",
+        "gpu_l4": "L4",
+        "gpu_a10": "A10G",
+        "gpu_l40s": "L40S",
+        "gpu_a100": "A100",
+        "gpu_a100_80gb": "A100-80GB",
+        "gpu_h100": "H100",
+        "gpu_h200": "H200",
+        "gpu_b200": "B200"
+    }
+
+    modal_gpu = hardware_map.get(hardware, "A10G")
+
+    # Build environment variables
+    env_vars = {
+        "HF_TOKEN": hf_token or os.environ.get("HF_TOKEN", ""),
+    }
+
+    # Add LLM provider API keys from environment
+    llm_key_names = [
+        "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GOOGLE_API_KEY",
+        "GEMINI_API_KEY", "COHERE_API_KEY", "MISTRAL_API_KEY",
+        "TOGETHER_API_KEY", "GROQ_API_KEY", "REPLICATE_API_TOKEN",
+        "ANYSCALE_API_KEY", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
+        "AWS_REGION", "AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT",
+        "LITELLM_API_KEY"
+    ]
+
+    for key_name in llm_key_names:
+        value = os.environ.get(key_name)
+        if value:
+            env_vars[key_name] = value
+
+    # Build SMOLTRACE command
+    cmd_parts = ["smoltrace-eval"]
+    cmd_parts.append(f"--model {model}")
+    cmd_parts.append(f"--provider {provider}")
+    if hf_inference_provider:
+        cmd_parts.append(f"--hf-inference-provider {hf_inference_provider}")
+    cmd_parts.append(f"--search-provider {search_provider}")
+    if enable_tools:
+        cmd_parts.append(f"--enable-tools {','.join(enable_tools)}")
+    cmd_parts.append(f"--agent-type {agent_type}")
+    cmd_parts.append(f"--dataset-name {dataset_name}")
+    cmd_parts.append(f"--split {split}")
+    if difficulty != "all":
+        cmd_parts.append(f"--difficulty {difficulty}")
+    if parallel_workers > 1:
+        cmd_parts.append(f"--parallel-workers {parallel_workers}")
+    cmd_parts.append(f"--output-format {output_format}")
+    if output_dir and output_format == "json":
+        cmd_parts.append(f"--output-dir {output_dir}")
+    if enable_otel:
+        cmd_parts.append("--enable-otel")
+    if not enable_gpu_metrics:
+        cmd_parts.append("--disable-gpu-metrics")
+    if private:
+        cmd_parts.append("--private")
+    if debug:
+        cmd_parts.append("--debug")
+    if quiet:
+        cmd_parts.append("--quiet")
+    cmd_parts.append(f"--run-id {job_id}")
+
+    command = " ".join(cmd_parts)
+
+    # Create Modal app dynamically
+    try:
+        app = modal.App(f"smoltrace-eval-{job_id}")
+
+        # Define Modal function
+        image = modal.Image.debian_slim().pip_install([
+            "smoltrace[otel,gpu]",
+            "litellm",
+            "transformers",
+            "torch"
+        ])
+
+        @app.function(
+            image=image,
+            gpu=modal_gpu if modal_gpu else None,
+            secrets=[
+                modal.Secret.from_dict(env_vars)
+            ],
+            timeout=3600  # 1 hour timeout
+        )
+        def run_evaluation():
+            """Run SMOLTRACE evaluation on Modal"""
+            import subprocess
+            result = subprocess.run(command, shell=True, capture_output=True, text=True)
+            return {
+                "returncode": result.returncode,
+                "stdout": result.stdout,
+                "stderr": result.stderr
+            }
+
+        # Submit the job
+        # Note: Modal doesn't have a direct "submit and return" API like HF Jobs
+        # For now, we'll return the command that should be run
+        # In production, you'd use Modal's async API or spawn the function
+
+        return {
+            "success": True,
+            "job_id": job_id,
+            "platform": "Modal",
+            "hardware": modal_gpu or "CPU",
+            "command": command,
+            "status": "pending",
+            "message": "Modal job configured. Use Modal CLI to submit: modal run modal_job_submission.py",
+            "note": "Direct Modal API submission requires async handling. For now, use the generated command with Modal CLI."
+        }
+
+    except Exception as e:
+        return {
+            "success": False,
+            "error": f"Failed to create Modal job: {str(e)}",
+            "job_id": job_id
+        }
+
+
+def _auto_select_modal_hardware(provider: str, model: str) -> Optional[str]:
+    """
+    Automatically select Modal hardware based on model and provider
+
+    Args:
+        provider: Provider type
+        model: Model identifier
+
+    Returns:
+        str: Modal GPU type or None for CPU
+    """
+    # API models don't need GPU
+    if provider in ["litellm", "inference"]:
+        return None
+
+    # Local models need GPU - select based on model size
+    model_lower = model.lower()
+
+    if "70b" in model_lower or "65b" in model_lower:
+        return "A100-80GB"  # Large models need A100 80GB
+    elif "13b" in model_lower or "34b" in model_lower:
+        return "A10G"  # Medium models work well on A10G
+    elif "7b" in model_lower or "8b" in model_lower:
+        return "A10G"  # Small models efficient on A10G
+    else:
+        return "A10G"  # Default to A10G for unknown sizes