""" Job Monitoring Screen for TraceMind-AI Allows users to monitor HuggingFace Jobs status and view logs """ import gradio as gr import os from typing import Optional def create_job_monitoring_screen(): """ Create the job monitoring screen for HF Jobs Returns: gr.Column: Gradio Column component for job monitoring """ with gr.Column(visible=False) as job_monitoring_interface: gr.Markdown(""" # 🔍 Job Monitoring Monitor your HuggingFace Jobs in real-time. Check job status, view logs, and track evaluation progress. """) with gr.Tabs(): # Tab 1: Single Job Inspection with gr.Tab("📋 Inspect Job"): gr.Markdown(""" ### Inspect a Specific Job Enter a HuggingFace Job ID to view its status and logs. """) with gr.Row(): job_id_input = gr.Textbox( label="HF Job ID", placeholder="e.g., kshitijthakkar/691eb073748f86bfa7144fcc", info="Format: username/job_hash" ) with gr.Row(): inspect_btn = gr.Button("🔍 Inspect Job", variant="primary") refresh_btn = gr.Button("🔄 Refresh", variant="secondary") # Job Status Section with gr.Accordion("📊 Job Status", open=True): job_status_display = gr.Markdown("Enter a Job ID and click 'Inspect Job' to view status") # Job Logs Section with gr.Accordion("📜 Job Logs", open=True): with gr.Row(): show_logs_btn = gr.Button("đŸ“Ĩ Load Logs", variant="secondary") auto_refresh_logs = gr.Checkbox( label="Auto-refresh logs (every 5s)", value=False ) job_logs_display = gr.Code( label="Job Logs", language="shell", value="Click 'Load Logs' to view job output", lines=20 ) # Tab 2: Recent Jobs List with gr.Tab("📑 Recent Jobs"): gr.Markdown(""" ### Your Recent Jobs View a list of your recent HuggingFace Jobs. """) with gr.Row(): list_jobs_btn = gr.Button("📋 Load Recent Jobs", variant="primary") jobs_limit = gr.Slider( minimum=5, maximum=50, value=10, step=5, label="Number of jobs to fetch" ) recent_jobs_display = gr.Markdown("Click 'Load Recent Jobs' to view your jobs") # Tab 3: Job Monitoring Guide with gr.Tab("📖 Guide"): gr.Markdown(""" ### Using Job Monitoring #### How to Get Your Job ID After submitting an evaluation from the "New Evaluation" tab, you'll receive: - **Run ID (SMOLTRACE)**: Used for tracking results in datasets (e.g., `job_3a22ceca`) - **HF Job ID**: Used for monitoring the actual job (e.g., `kshitijthakkar/691eb073748f86bfa7144fcc`) Use the **HF Job ID** here to monitor your job. #### Job Status Values - **QUEUED**: Job is waiting to start - **STARTING**: Job is being initialized - **RUNNING**: Job is currently executing - **SUCCEEDED**: Job completed successfully - **FAILED**: Job encountered an error - **CANCELLED**: Job was manually cancelled - **STOPPED**: Job was stopped by the system #### CLI Commands Reference You can also use the HuggingFace CLI to monitor jobs: ```bash # List your running jobs hf jobs ps # Inspect a specific job hf jobs inspect # View logs from a job hf jobs logs # Follow logs in real-time hf jobs logs --follow # Cancel a job hf jobs cancel ``` #### Tips - 💡 **Bookmark your Job ID** after submission for easy access - 🔄 **Use auto-refresh** for logs when job is running - 📊 **Check status regularly** to catch any issues early - 📝 **Review logs** if your job fails to understand what went wrong - đŸŽ¯ **Results appear in leaderboard** once job succeeds and uploads datasets """) # Functions for job monitoring def inspect_job(job_id: str): """Inspect a specific job's status""" import os if not job_id or not job_id.strip(): return gr.update(value="❌ Please enter a Job ID") # Check if token is configured before making API call token = os.environ.get("HF_TOKEN") if not token or not token.strip(): return gr.update( value=""" ### âš ī¸ HuggingFace Token Not Configured **Action Required**: 1. Go to "âš™ī¸ Settings" in the sidebar 2. Enter your HuggingFace token (must have "Run Jobs" permission) 3. Click "💾 Save API Keys" 4. Return to this tab and try again """ ) from utils.hf_jobs_submission import check_job_status result = check_job_status(job_id.strip()) if not result.get("success"): error_msg = result.get('error', 'Unknown error') return gr.update( value=f""" ### ❌ Failed to Fetch Job Status **Error**: {error_msg} **Job ID**: `{job_id}` **Troubleshooting**: - Verify the Job ID format is correct (format: `username/job_hash`) - Check that the job exists in your account - Ensure your HF token has the correct permissions - Token must have **Run Jobs** permission enabled """ ) # Format status with emoji status = result.get("status", "unknown") # Convert status to string if it's an enum status_str = str(status).upper() if status else "UNKNOWN" status_emoji = { "QUEUED": "âŗ", "STARTING": "🔄", "RUNNING": "â–ļī¸", "SUCCEEDED": "✅", "COMPLETED": "✅", # Alternative success status "FAILED": "❌", "ERROR": "❌", # Alternative failure status "CANCELLED": "đŸšĢ", "CANCELED": "đŸšĢ", # US spelling variant "STOPPED": "âšī¸", "TIMEOUT": "âąī¸" }.get(status_str, "❓") status_color = { "QUEUED": "#FFA500", "STARTING": "#1E90FF", "RUNNING": "#00CED1", "SUCCEEDED": "#32CD32", "COMPLETED": "#32CD32", # Alternative success status "FAILED": "#DC143C", "ERROR": "#DC143C", # Alternative failure status "CANCELLED": "#696969", "CANCELED": "#696969", # US spelling variant "STOPPED": "#A9A9A9", "TIMEOUT": "#FF8C00" }.get(status_str, "#888888") created_at = result.get("created_at", "N/A") flavor = result.get("flavor", "N/A") job_url = result.get("url", None) # Format job URL as clickable link job_url_display = f"[Open in HuggingFace]({job_url})" if job_url else "N/A" return gr.update( value=f""" ### {status_emoji} Job Status: {status_str} **Job ID**: `{job_id}` #### Details - **Created**: {created_at} - **Hardware**: {flavor} - **Job URL**: {job_url_display} #### Next Steps {_get_next_steps(status_str)} --- 💡 **Tip**: Use "đŸ“Ĩ Load Logs" button below to view detailed execution logs and check progress. """ ) def _get_next_steps(status: str) -> str: """Get next steps based on job status""" status_upper = str(status).upper() if status else "UNKNOWN" if status_upper == "QUEUED": return "âŗ Your job is waiting in the queue. It will start soon." elif status_upper == "STARTING": return "🔄 Your job is being initialized. This usually takes 1-2 minutes." elif status_upper == "RUNNING": return "â–ļī¸ Your job is running! Click 'Load Logs' below to view progress." elif status_upper in ["SUCCEEDED", "COMPLETED"]: return "✅ Your job completed successfully! Check the Leaderboard tab for results." elif status_upper in ["FAILED", "ERROR"]: return "❌ Your job failed. Click 'Load Logs' below to see what went wrong." elif status_upper in ["CANCELLED", "CANCELED", "STOPPED"]: return "đŸšĢ Your job was stopped. You can submit a new job from the 'New Evaluation' tab." elif status_upper == "TIMEOUT": return "âąī¸ Your job exceeded the time limit. Consider optimizing your model or increasing the timeout." else: return "❓ Unknown status. Try refreshing or check the HF Jobs dashboard." def load_job_logs(job_id: str): """Load logs for a specific job""" import os if not job_id or not job_id.strip(): return gr.update(value="❌ Please enter a Job ID first") # Check if token is configured before making API call token = os.environ.get("HF_TOKEN") if not token or not token.strip(): return gr.update( value="âš ī¸ HuggingFace Token Not Configured\n\nPlease configure your HF token in Settings first." ) from utils.hf_jobs_submission import get_job_logs result = get_job_logs(job_id.strip()) if not result.get("success"): return gr.update( value=f"❌ Failed to fetch logs: {result.get('error', 'Unknown error')}\n\nEnsure your HF token has 'Run Jobs' permission." ) logs = result.get("logs", "") if not logs or not logs.strip(): return gr.update(value="â„šī¸ No logs available yet. Job may not have started.\n\nTry refreshing after a minute.") return gr.update(value=logs) def list_recent_jobs(limit: int): """List user's recent jobs""" import os from utils.hf_jobs_submission import list_user_jobs # Check if token is configured before making API call token = os.environ.get("HF_TOKEN") if not token or not token.strip(): return gr.update( value=""" ### âš ī¸ HuggingFace Token Not Configured **Action Required**: 1. Go to "âš™ī¸ Settings" in the sidebar 2. Enter your HuggingFace token (must have "Run Jobs" permission) 3. Click "💾 Save API Keys" 4. Return to this tab and try again **Note**: Your HF token must: - Start with `hf_` - Have **Read**, **Write**, AND **Run Jobs** permissions - Be from a HuggingFace Pro account ($9/month) Get your token at: https://huggingface.co/settings/tokens """ ) result = list_user_jobs(limit=int(limit)) if not result.get("success"): error_msg = result.get('error', 'Unknown error') # Check for common error patterns if "invalid" in error_msg.lower() or "token" in error_msg.lower(): troubleshooting = """ **Troubleshooting**: - âš ī¸ **Token may be invalid** - Regenerate your token at HuggingFace settings - ✅ Ensure token has **Run Jobs** permission (not just Read/Write) - ✅ Verify you have an active **HuggingFace Pro account** - ✅ Token should start with `hf_` """ else: troubleshooting = """ **Troubleshooting**: - Refresh this page and try again - Check your internet connection - Verify HuggingFace services are operational """ return gr.update( value=f""" ### ❌ Failed to Fetch Jobs **Error**: {error_msg} {troubleshooting} """ ) jobs = result.get("jobs", []) if not jobs: return gr.update( value=""" ### â„šī¸ No Jobs Found You haven't submitted any jobs yet. **Get Started**: 1. Go to the "New Evaluation" tab 2. Configure your model and settings 3. Submit an evaluation job 4. Come back here to monitor progress! """ ) # Build jobs table jobs_table = "### 📋 Your Recent Jobs\n\n" jobs_table += "| Job ID | Status | Created At |\n" jobs_table += "|--------|--------|------------|\n" for job in jobs: job_id = job.get("job_id", "N/A") status = job.get("status", "unknown") created = job.get("created_at", "N/A") # Convert status to string if it's an enum status_str = str(status).upper() if status else "UNKNOWN" status_emoji = { "QUEUED": "âŗ", "STARTING": "🔄", "RUNNING": "â–ļī¸", "SUCCEEDED": "✅", "COMPLETED": "✅", # Alternative success status "FAILED": "❌", "ERROR": "❌", # Alternative failure status "CANCELLED": "đŸšĢ", "CANCELED": "đŸšĢ", # US spelling variant "STOPPED": "âšī¸", "TIMEOUT": "âąī¸" }.get(status_str, "❓") jobs_table += f"| `{job_id}` | {status_emoji} {status} | {created} |\n" jobs_table += f"\n**Total Jobs**: {len(jobs)}\n\n" jobs_table += "💡 **Tip**: Copy a Job ID and paste it in the 'Inspect Job' tab to view details and logs." return gr.update(value=jobs_table) # Wire up button events inspect_btn.click( fn=inspect_job, inputs=[job_id_input], outputs=[job_status_display] ) refresh_btn.click( fn=inspect_job, inputs=[job_id_input], outputs=[job_status_display] ) show_logs_btn.click( fn=load_job_logs, inputs=[job_id_input], outputs=[job_logs_display] ) list_jobs_btn.click( fn=list_recent_jobs, inputs=[jobs_limit], outputs=[recent_jobs_display] ) # Auto-refresh functionality (handled by Gradio's auto-update) # Note: For production, consider using gr.Timer or similar for automatic refreshes return job_monitoring_interface if __name__ == "__main__": # For standalone testing with gr.Blocks() as demo: job_monitoring = create_job_monitoring_screen() # Make it visible for standalone testing job_monitoring.visible = True demo.launch()