kshitijthakkar committed on
Commit
8c679b3
·
1 Parent(s): 4449927

feat: Add comprehensive HF Jobs monitoring system

Browse files

Implemented full job monitoring capabilities for HuggingFace Jobs:

**New Features:**
- Job Monitoring screen with 3 tabs (Inspect Job, Recent Jobs, Guide)
- Real-time job status inspection with color-coded indicators
- Job logs viewing directly in the UI
- Recent jobs list with pagination
- HF token validation and helpful error messages

**Technical Implementation:**
- Added screens/job_monitoring.py - Complete job monitoring UI
- Enhanced utils/hf_jobs_submission.py with 3 new API functions:
  - check_job_status() - Inspect job details via the HF API
  - get_job_logs() - Fetch job logs
  - list_user_jobs() - List the user's recent jobs
- Updated app.py - Integrated job monitoring into navigation system

**API Integration:**
- Uses official HuggingFace Hub API (inspect_job, fetch_job_logs, list_jobs)
- Proper handling of JobInfo and JobStatus objects
- Supports both username/job_hash and job_hash formats

**UI Improvements:**
- Fixed job ID display (now shows actual HF Job ID: username/hash)
- Status emojis for all job states (QUEUED, RUNNING, SUCCEEDED, CANCELED, etc.)
- Clickable job URLs to HF dashboard
- Hardware flavor display (cpu-basic, a10g-small, etc.)
- Comprehensive troubleshooting guides

**Bug Fixes:**
- Fixed JobStatus handling (read status.stage instead of calling status.upper() on the status object)
- Removed non-existent timing fields (started_at, finished_at, runtime)
- Added all status variants (CANCELED/CANCELLED, COMPLETED/SUCCEEDED, ERROR/FAILED)
- Proper token checking before API calls

Files changed (3) hide show
  1. app.py +87 -24
  2. screens/job_monitoring.py +442 -0
  3. utils/hf_jobs_submission.py +180 -9
app.py CHANGED
@@ -61,6 +61,7 @@ from screens.chat import (
61
  )
62
  from screens.documentation import create_documentation_screen
63
  from screens.settings import create_settings_screen
 
64
  from screens.mcp_helpers import (
65
  call_analyze_leaderboard_sync,
66
  call_debug_trace_sync,
@@ -1593,6 +1594,7 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
1593
  new_eval_nav_btn = gr.Button("▶️ New Evaluation", variant="secondary", size="lg")
1594
  compare_nav_btn = gr.Button("⚖️ Compare", variant="secondary", size="lg")
1595
  chat_nav_btn = gr.Button("🤖 Agent Chat", variant="secondary", size="lg")
 
1596
  synthetic_data_nav_btn = gr.Button("🔬 Synthetic Data", variant="secondary", size="lg")
1597
  docs_nav_btn = gr.Button("📚 Documentation", variant="secondary", size="lg")
1598
  settings_nav_btn = gr.Button("⚙️ Settings", variant="secondary", size="lg")
@@ -2451,6 +2453,11 @@ with gr.Blocks(title="TraceMind-AI", theme=theme) as app:
2451
  # ============================================================================
2452
  settings_screen = create_settings_screen()
2453
 
 
 
 
 
 
2454
  # ============================================================================
2455
  # Evaluation Helper Functions
2456
  # ============================================================================
@@ -2689,6 +2696,7 @@ No historical data available for **{model}**.
2689
 
2690
  # Success - build success message
2691
  job_id = result.get('job_id', 'unknown')
 
2692
  job_platform = result.get('platform', infra_provider)
2693
  job_hardware = result.get('hardware', hardware)
2694
  job_status = result.get('status', 'submitted')
@@ -2760,8 +2768,11 @@ No historical data available for **{model}**.
2760
  <h2 style="margin-top: 0;">✅ Evaluation Job Configured!</h2>
2761
 
2762
  <div style="background: rgba(255,255,255,0.15); padding: 15px; border-radius: 5px; margin: 15px 0;">
2763
- <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 5px;">Job ID</div>
2764
- <div style="font-family: monospace; font-size: 1.1em; font-weight: bold;">{job_id}</div>
 
 
 
2765
  </div>
2766
 
2767
  <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px; margin-top: 15px;">
@@ -2884,11 +2895,13 @@ No historical data available for **{model}**.
2884
  new_evaluation_screen: gr.update(visible=False),
2885
  documentation_screen: gr.update(visible=False),
2886
  settings_screen: gr.update(visible=False),
 
2887
  dashboard_nav_btn: gr.update(variant="primary"),
2888
  leaderboard_nav_btn: gr.update(variant="secondary"),
2889
  new_eval_nav_btn: gr.update(variant="secondary"),
2890
  compare_nav_btn: gr.update(variant="secondary"),
2891
  chat_nav_btn: gr.update(variant="secondary"),
 
2892
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2893
  docs_nav_btn: gr.update(variant="secondary"),
2894
  settings_nav_btn: gr.update(variant="secondary"),
@@ -2909,11 +2922,13 @@ No historical data available for **{model}**.
2909
  new_evaluation_screen: gr.update(visible=False),
2910
  documentation_screen: gr.update(visible=False),
2911
  settings_screen: gr.update(visible=False),
 
2912
  dashboard_nav_btn: gr.update(variant="secondary"),
2913
  leaderboard_nav_btn: gr.update(variant="primary"),
2914
  new_eval_nav_btn: gr.update(variant="secondary"),
2915
  compare_nav_btn: gr.update(variant="secondary"),
2916
  chat_nav_btn: gr.update(variant="secondary"),
 
2917
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2918
  docs_nav_btn: gr.update(variant="secondary"),
2919
  settings_nav_btn: gr.update(variant="secondary"),
@@ -2932,11 +2947,13 @@ No historical data available for **{model}**.
2932
  new_evaluation_screen: gr.update(visible=True),
2933
  documentation_screen: gr.update(visible=False),
2934
  settings_screen: gr.update(visible=False),
 
2935
  dashboard_nav_btn: gr.update(variant="secondary"),
2936
  leaderboard_nav_btn: gr.update(variant="secondary"),
2937
  new_eval_nav_btn: gr.update(variant="primary"),
2938
  compare_nav_btn: gr.update(variant="secondary"),
2939
  chat_nav_btn: gr.update(variant="secondary"),
 
2940
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2941
  docs_nav_btn: gr.update(variant="secondary"),
2942
  settings_nav_btn: gr.update(variant="secondary"),
@@ -2967,11 +2984,13 @@ No historical data available for **{model}**.
2967
  new_evaluation_screen: gr.update(visible=False),
2968
  documentation_screen: gr.update(visible=False),
2969
  settings_screen: gr.update(visible=False),
 
2970
  dashboard_nav_btn: gr.update(variant="secondary"),
2971
  leaderboard_nav_btn: gr.update(variant="secondary"),
2972
  new_eval_nav_btn: gr.update(variant="secondary"),
2973
  compare_nav_btn: gr.update(variant="primary"),
2974
  chat_nav_btn: gr.update(variant="secondary"),
 
2975
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2976
  docs_nav_btn: gr.update(variant="secondary"),
2977
  settings_nav_btn: gr.update(variant="secondary"),
@@ -2991,11 +3010,13 @@ No historical data available for **{model}**.
2991
  new_evaluation_screen: gr.update(visible=False),
2992
  documentation_screen: gr.update(visible=False),
2993
  settings_screen: gr.update(visible=False),
 
2994
  dashboard_nav_btn: gr.update(variant="secondary"),
2995
  leaderboard_nav_btn: gr.update(variant="secondary"),
2996
  new_eval_nav_btn: gr.update(variant="secondary"),
2997
  compare_nav_btn: gr.update(variant="primary"),
2998
  chat_nav_btn: gr.update(variant="secondary"),
 
2999
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3000
  docs_nav_btn: gr.update(variant="secondary"),
3001
  settings_nav_btn: gr.update(variant="secondary"),
@@ -3014,11 +3035,13 @@ No historical data available for **{model}**.
3014
  new_evaluation_screen: gr.update(visible=False),
3015
  documentation_screen: gr.update(visible=False),
3016
  settings_screen: gr.update(visible=False),
 
3017
  dashboard_nav_btn: gr.update(variant="secondary"),
3018
  leaderboard_nav_btn: gr.update(variant="secondary"),
3019
  new_eval_nav_btn: gr.update(variant="secondary"),
3020
  compare_nav_btn: gr.update(variant="secondary"),
3021
  chat_nav_btn: gr.update(variant="primary"),
 
3022
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3023
  docs_nav_btn: gr.update(variant="secondary"),
3024
  settings_nav_btn: gr.update(variant="secondary"),
@@ -3037,11 +3060,13 @@ No historical data available for **{model}**.
3037
  new_evaluation_screen: gr.update(visible=False),
3038
  documentation_screen: gr.update(visible=False),
3039
  settings_screen: gr.update(visible=False),
 
3040
  dashboard_nav_btn: gr.update(variant="secondary"),
3041
  leaderboard_nav_btn: gr.update(variant="secondary"),
3042
  new_eval_nav_btn: gr.update(variant="secondary"),
3043
  compare_nav_btn: gr.update(variant="secondary"),
3044
  chat_nav_btn: gr.update(variant="secondary"),
 
3045
  synthetic_data_nav_btn: gr.update(variant="primary"),
3046
  docs_nav_btn: gr.update(variant="secondary"),
3047
  settings_nav_btn: gr.update(variant="secondary"),
@@ -3060,11 +3085,13 @@ No historical data available for **{model}**.
3060
  new_evaluation_screen: gr.update(visible=False),
3061
  documentation_screen: gr.update(visible=True),
3062
  settings_screen: gr.update(visible=False),
 
3063
  dashboard_nav_btn: gr.update(variant="secondary"),
3064
  leaderboard_nav_btn: gr.update(variant="secondary"),
3065
  new_eval_nav_btn: gr.update(variant="secondary"),
3066
  compare_nav_btn: gr.update(variant="secondary"),
3067
  chat_nav_btn: gr.update(variant="secondary"),
 
3068
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3069
  docs_nav_btn: gr.update(variant="primary"),
3070
  settings_nav_btn: gr.update(variant="secondary"),
@@ -3083,16 +3110,43 @@ No historical data available for **{model}**.
3083
  new_evaluation_screen: gr.update(visible=False),
3084
  documentation_screen: gr.update(visible=False),
3085
  settings_screen: gr.update(visible=True),
 
3086
  dashboard_nav_btn: gr.update(variant="secondary"),
3087
  leaderboard_nav_btn: gr.update(variant="secondary"),
3088
  new_eval_nav_btn: gr.update(variant="secondary"),
3089
  compare_nav_btn: gr.update(variant="secondary"),
3090
  chat_nav_btn: gr.update(variant="secondary"),
 
3091
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3092
  docs_nav_btn: gr.update(variant="secondary"),
3093
  settings_nav_btn: gr.update(variant="primary"),
3094
  }
3095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3096
  # Synthetic Data Generator Callbacks
3097
  def on_generate_synthetic_data(domain, tools, num_tasks, difficulty, agent_type):
3098
  """Generate synthetic dataset AND prompt template using MCP server"""
@@ -3381,8 +3435,8 @@ Result: {result}
3381
  fn=navigate_to_dashboard,
3382
  outputs=[
3383
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3384
- new_evaluation_screen, documentation_screen, settings_screen,
3385
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3386
  ] + list(dashboard_components.values())
3387
  )
3388
 
@@ -3493,24 +3547,24 @@ Result: {result}
3493
  fn=navigate_to_dashboard,
3494
  outputs=[
3495
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3496
- new_evaluation_screen, documentation_screen, settings_screen,
3497
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3498
  ] + list(dashboard_components.values())
3499
  )
3500
 
3501
  leaderboard_nav_btn.click(
3502
  fn=navigate_to_leaderboard,
3503
  outputs=[
3504
- dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen,
3505
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3506
  ]
3507
  )
3508
 
3509
  new_eval_nav_btn.click(
3510
  fn=navigate_to_new_evaluation,
3511
  outputs=[
3512
- dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen,
3513
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3514
  ]
3515
  )
3516
 
@@ -3518,8 +3572,8 @@ Result: {result}
3518
  fn=navigate_to_compare,
3519
  outputs=[
3520
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3521
- new_evaluation_screen, documentation_screen, settings_screen,
3522
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn,
3523
  compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
3524
  ]
3525
  )
@@ -3528,16 +3582,25 @@ Result: {result}
3528
  fn=navigate_to_chat,
3529
  outputs=[
3530
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3531
- new_evaluation_screen, documentation_screen, settings_screen,
3532
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3533
  ]
3534
  )
3535
  synthetic_data_nav_btn.click(
3536
  fn=navigate_to_synthetic_data,
3537
  outputs=[
3538
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3539
- new_evaluation_screen, documentation_screen, settings_screen,
3540
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
 
 
 
 
 
 
 
 
 
3541
  ]
3542
  )
3543
 
@@ -3545,8 +3608,8 @@ Result: {result}
3545
  fn=navigate_to_documentation,
3546
  outputs=[
3547
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3548
- new_evaluation_screen, documentation_screen, settings_screen,
3549
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3550
  ]
3551
  )
3552
 
@@ -3554,8 +3617,8 @@ Result: {result}
3554
  fn=navigate_to_settings,
3555
  outputs=[
3556
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3557
- new_evaluation_screen, documentation_screen, settings_screen,
3558
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3559
  ]
3560
  )
3561
 
@@ -3576,8 +3639,8 @@ Result: {result}
3576
  back_to_leaderboard_from_eval_btn.click(
3577
  fn=navigate_to_leaderboard,
3578
  outputs=[
3579
- dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen,
3580
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3581
  ]
3582
  )
3583
 
@@ -3691,8 +3754,8 @@ Result: {result}
3691
  compare_components['back_to_leaderboard_btn'].click(
3692
  fn=navigate_to_leaderboard,
3693
  outputs=[
3694
- dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen,
3695
- dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, docs_nav_btn, settings_nav_btn
3696
  ]
3697
  )
3698
 
 
61
  )
62
  from screens.documentation import create_documentation_screen
63
  from screens.settings import create_settings_screen
64
+ from screens.job_monitoring import create_job_monitoring_screen
65
  from screens.mcp_helpers import (
66
  call_analyze_leaderboard_sync,
67
  call_debug_trace_sync,
 
1594
  new_eval_nav_btn = gr.Button("▶️ New Evaluation", variant="secondary", size="lg")
1595
  compare_nav_btn = gr.Button("⚖️ Compare", variant="secondary", size="lg")
1596
  chat_nav_btn = gr.Button("🤖 Agent Chat", variant="secondary", size="lg")
1597
+ job_monitoring_nav_btn = gr.Button("🔍 Job Monitoring", variant="secondary", size="lg")
1598
  synthetic_data_nav_btn = gr.Button("🔬 Synthetic Data", variant="secondary", size="lg")
1599
  docs_nav_btn = gr.Button("📚 Documentation", variant="secondary", size="lg")
1600
  settings_nav_btn = gr.Button("⚙️ Settings", variant="secondary", size="lg")
 
2453
  # ============================================================================
2454
  settings_screen = create_settings_screen()
2455
 
2456
+ # ============================================================================
2457
+ # Screen 11: Job Monitoring
2458
+ # ============================================================================
2459
+ job_monitoring_screen = create_job_monitoring_screen()
2460
+
2461
  # ============================================================================
2462
  # Evaluation Helper Functions
2463
  # ============================================================================
 
2696
 
2697
  # Success - build success message
2698
  job_id = result.get('job_id', 'unknown')
2699
+ hf_job_id = result.get('hf_job_id', job_id) # Get actual HF job ID
2700
  job_platform = result.get('platform', infra_provider)
2701
  job_hardware = result.get('hardware', hardware)
2702
  job_status = result.get('status', 'submitted')
 
2768
  <h2 style="margin-top: 0;">✅ Evaluation Job Configured!</h2>
2769
 
2770
  <div style="background: rgba(255,255,255,0.15); padding: 15px; border-radius: 5px; margin: 15px 0;">
2771
+ <div style="font-size: 0.9em; opacity: 0.9; margin-bottom: 5px;">Run ID (SMOLTRACE)</div>
2772
+ <div style="font-family: monospace; font-size: 0.95em; font-weight: bold;">{job_id}</div>
2773
+ <div style="font-size: 0.9em; opacity: 0.9; margin-top: 10px; margin-bottom: 5px;">HF Job ID</div>
2774
+ <div style="font-family: monospace; font-size: 0.95em; font-weight: bold;">{hf_job_id}</div>
2775
+ <div style="font-size: 0.8em; opacity: 0.8; margin-top: 8px;">Use this ID to monitor: <code style="background: rgba(0,0,0,0.2); padding: 2px 6px; border-radius: 3px;">hf jobs inspect {hf_job_id}</code></div>
2776
  </div>
2777
 
2778
  <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px; margin-top: 15px;">
 
2895
  new_evaluation_screen: gr.update(visible=False),
2896
  documentation_screen: gr.update(visible=False),
2897
  settings_screen: gr.update(visible=False),
2898
+ job_monitoring_screen: gr.update(visible=False),
2899
  dashboard_nav_btn: gr.update(variant="primary"),
2900
  leaderboard_nav_btn: gr.update(variant="secondary"),
2901
  new_eval_nav_btn: gr.update(variant="secondary"),
2902
  compare_nav_btn: gr.update(variant="secondary"),
2903
  chat_nav_btn: gr.update(variant="secondary"),
2904
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
2905
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2906
  docs_nav_btn: gr.update(variant="secondary"),
2907
  settings_nav_btn: gr.update(variant="secondary"),
 
2922
  new_evaluation_screen: gr.update(visible=False),
2923
  documentation_screen: gr.update(visible=False),
2924
  settings_screen: gr.update(visible=False),
2925
+ job_monitoring_screen: gr.update(visible=False),
2926
  dashboard_nav_btn: gr.update(variant="secondary"),
2927
  leaderboard_nav_btn: gr.update(variant="primary"),
2928
  new_eval_nav_btn: gr.update(variant="secondary"),
2929
  compare_nav_btn: gr.update(variant="secondary"),
2930
  chat_nav_btn: gr.update(variant="secondary"),
2931
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
2932
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2933
  docs_nav_btn: gr.update(variant="secondary"),
2934
  settings_nav_btn: gr.update(variant="secondary"),
 
2947
  new_evaluation_screen: gr.update(visible=True),
2948
  documentation_screen: gr.update(visible=False),
2949
  settings_screen: gr.update(visible=False),
2950
+ job_monitoring_screen: gr.update(visible=False),
2951
  dashboard_nav_btn: gr.update(variant="secondary"),
2952
  leaderboard_nav_btn: gr.update(variant="secondary"),
2953
  new_eval_nav_btn: gr.update(variant="primary"),
2954
  compare_nav_btn: gr.update(variant="secondary"),
2955
  chat_nav_btn: gr.update(variant="secondary"),
2956
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
2957
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2958
  docs_nav_btn: gr.update(variant="secondary"),
2959
  settings_nav_btn: gr.update(variant="secondary"),
 
2984
  new_evaluation_screen: gr.update(visible=False),
2985
  documentation_screen: gr.update(visible=False),
2986
  settings_screen: gr.update(visible=False),
2987
+ job_monitoring_screen: gr.update(visible=False),
2988
  dashboard_nav_btn: gr.update(variant="secondary"),
2989
  leaderboard_nav_btn: gr.update(variant="secondary"),
2990
  new_eval_nav_btn: gr.update(variant="secondary"),
2991
  compare_nav_btn: gr.update(variant="primary"),
2992
  chat_nav_btn: gr.update(variant="secondary"),
2993
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
2994
  synthetic_data_nav_btn: gr.update(variant="secondary"),
2995
  docs_nav_btn: gr.update(variant="secondary"),
2996
  settings_nav_btn: gr.update(variant="secondary"),
 
3010
  new_evaluation_screen: gr.update(visible=False),
3011
  documentation_screen: gr.update(visible=False),
3012
  settings_screen: gr.update(visible=False),
3013
+ job_monitoring_screen: gr.update(visible=False),
3014
  dashboard_nav_btn: gr.update(variant="secondary"),
3015
  leaderboard_nav_btn: gr.update(variant="secondary"),
3016
  new_eval_nav_btn: gr.update(variant="secondary"),
3017
  compare_nav_btn: gr.update(variant="primary"),
3018
  chat_nav_btn: gr.update(variant="secondary"),
3019
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
3020
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3021
  docs_nav_btn: gr.update(variant="secondary"),
3022
  settings_nav_btn: gr.update(variant="secondary"),
 
3035
  new_evaluation_screen: gr.update(visible=False),
3036
  documentation_screen: gr.update(visible=False),
3037
  settings_screen: gr.update(visible=False),
3038
+ job_monitoring_screen: gr.update(visible=False),
3039
  dashboard_nav_btn: gr.update(variant="secondary"),
3040
  leaderboard_nav_btn: gr.update(variant="secondary"),
3041
  new_eval_nav_btn: gr.update(variant="secondary"),
3042
  compare_nav_btn: gr.update(variant="secondary"),
3043
  chat_nav_btn: gr.update(variant="primary"),
3044
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
3045
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3046
  docs_nav_btn: gr.update(variant="secondary"),
3047
  settings_nav_btn: gr.update(variant="secondary"),
 
3060
  new_evaluation_screen: gr.update(visible=False),
3061
  documentation_screen: gr.update(visible=False),
3062
  settings_screen: gr.update(visible=False),
3063
+ job_monitoring_screen: gr.update(visible=False),
3064
  dashboard_nav_btn: gr.update(variant="secondary"),
3065
  leaderboard_nav_btn: gr.update(variant="secondary"),
3066
  new_eval_nav_btn: gr.update(variant="secondary"),
3067
  compare_nav_btn: gr.update(variant="secondary"),
3068
  chat_nav_btn: gr.update(variant="secondary"),
3069
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
3070
  synthetic_data_nav_btn: gr.update(variant="primary"),
3071
  docs_nav_btn: gr.update(variant="secondary"),
3072
  settings_nav_btn: gr.update(variant="secondary"),
 
3085
  new_evaluation_screen: gr.update(visible=False),
3086
  documentation_screen: gr.update(visible=True),
3087
  settings_screen: gr.update(visible=False),
3088
+ job_monitoring_screen: gr.update(visible=False),
3089
  dashboard_nav_btn: gr.update(variant="secondary"),
3090
  leaderboard_nav_btn: gr.update(variant="secondary"),
3091
  new_eval_nav_btn: gr.update(variant="secondary"),
3092
  compare_nav_btn: gr.update(variant="secondary"),
3093
  chat_nav_btn: gr.update(variant="secondary"),
3094
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
3095
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3096
  docs_nav_btn: gr.update(variant="primary"),
3097
  settings_nav_btn: gr.update(variant="secondary"),
 
3110
  new_evaluation_screen: gr.update(visible=False),
3111
  documentation_screen: gr.update(visible=False),
3112
  settings_screen: gr.update(visible=True),
3113
+ job_monitoring_screen: gr.update(visible=False),
3114
  dashboard_nav_btn: gr.update(variant="secondary"),
3115
  leaderboard_nav_btn: gr.update(variant="secondary"),
3116
  new_eval_nav_btn: gr.update(variant="secondary"),
3117
  compare_nav_btn: gr.update(variant="secondary"),
3118
  chat_nav_btn: gr.update(variant="secondary"),
3119
+ job_monitoring_nav_btn: gr.update(variant="secondary"),
3120
  synthetic_data_nav_btn: gr.update(variant="secondary"),
3121
  docs_nav_btn: gr.update(variant="secondary"),
3122
  settings_nav_btn: gr.update(variant="primary"),
3123
  }
3124
 
3125
+ def navigate_to_job_monitoring():
3126
+ """Navigate to job monitoring screen"""
3127
+ return {
3128
+ dashboard_screen: gr.update(visible=False),
3129
+ leaderboard_screen: gr.update(visible=False),
3130
+ run_detail_screen: gr.update(visible=False),
3131
+ trace_detail_screen: gr.update(visible=False),
3132
+ compare_screen: gr.update(visible=False),
3133
+ chat_screen: gr.update(visible=False),
3134
+ synthetic_data_screen: gr.update(visible=False),
3135
+ new_evaluation_screen: gr.update(visible=False),
3136
+ documentation_screen: gr.update(visible=False),
3137
+ settings_screen: gr.update(visible=False),
3138
+ job_monitoring_screen: gr.update(visible=True),
3139
+ dashboard_nav_btn: gr.update(variant="secondary"),
3140
+ leaderboard_nav_btn: gr.update(variant="secondary"),
3141
+ new_eval_nav_btn: gr.update(variant="secondary"),
3142
+ compare_nav_btn: gr.update(variant="secondary"),
3143
+ chat_nav_btn: gr.update(variant="secondary"),
3144
+ job_monitoring_nav_btn: gr.update(variant="primary"),
3145
+ synthetic_data_nav_btn: gr.update(variant="secondary"),
3146
+ docs_nav_btn: gr.update(variant="secondary"),
3147
+ settings_nav_btn: gr.update(variant="secondary"),
3148
+ }
3149
+
3150
  # Synthetic Data Generator Callbacks
3151
  def on_generate_synthetic_data(domain, tools, num_tasks, difficulty, agent_type):
3152
  """Generate synthetic dataset AND prompt template using MCP server"""
 
3435
  fn=navigate_to_dashboard,
3436
  outputs=[
3437
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3438
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3439
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3440
  ] + list(dashboard_components.values())
3441
  )
3442
 
 
3547
  fn=navigate_to_dashboard,
3548
  outputs=[
3549
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3550
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3551
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3552
  ] + list(dashboard_components.values())
3553
  )
3554
 
3555
  leaderboard_nav_btn.click(
3556
  fn=navigate_to_leaderboard,
3557
  outputs=[
3558
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3559
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3560
  ]
3561
  )
3562
 
3563
  new_eval_nav_btn.click(
3564
  fn=navigate_to_new_evaluation,
3565
  outputs=[
3566
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3567
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3568
  ]
3569
  )
3570
 
 
3572
  fn=navigate_to_compare,
3573
  outputs=[
3574
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3575
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3576
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn,
3577
  compare_components['compare_run_a_dropdown'], compare_components['compare_run_b_dropdown']
3578
  ]
3579
  )
 
3582
  fn=navigate_to_chat,
3583
  outputs=[
3584
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3585
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3586
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3587
  ]
3588
  )
3589
  synthetic_data_nav_btn.click(
3590
  fn=navigate_to_synthetic_data,
3591
  outputs=[
3592
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3593
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3594
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3595
+ ]
3596
+ )
3597
+
3598
+ job_monitoring_nav_btn.click(
3599
+ fn=navigate_to_job_monitoring,
3600
+ outputs=[
3601
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3602
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3603
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3604
  ]
3605
  )
3606
 
 
3608
  fn=navigate_to_documentation,
3609
  outputs=[
3610
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3611
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3612
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3613
  ]
3614
  )
3615
 
 
3617
  fn=navigate_to_settings,
3618
  outputs=[
3619
  dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen,
3620
+ new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3621
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3622
  ]
3623
  )
3624
 
 
3639
  back_to_leaderboard_from_eval_btn.click(
3640
  fn=navigate_to_leaderboard,
3641
  outputs=[
3642
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3643
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3644
  ]
3645
  )
3646
 
 
3754
  compare_components['back_to_leaderboard_btn'].click(
3755
  fn=navigate_to_leaderboard,
3756
  outputs=[
3757
+ dashboard_screen, leaderboard_screen, run_detail_screen, trace_detail_screen, compare_screen, chat_screen, synthetic_data_screen, new_evaluation_screen, documentation_screen, settings_screen, job_monitoring_screen,
3758
+ dashboard_nav_btn, leaderboard_nav_btn, new_eval_nav_btn, compare_nav_btn, chat_nav_btn, synthetic_data_nav_btn, job_monitoring_nav_btn, docs_nav_btn, settings_nav_btn
3759
  ]
3760
  )
3761
 
screens/job_monitoring.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Job Monitoring Screen for TraceMind-AI
3
+ Allows users to monitor HuggingFace Jobs status and view logs
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ from typing import Optional
9
+
10
+
11
# Emoji shown next to each known job status label (spelling/naming variants
# are listed explicitly so either form reported by the API is decorated).
_STATUS_EMOJI = {
    "QUEUED": "⏳",
    "STARTING": "🔄",
    "RUNNING": "▶️",
    "SUCCEEDED": "✅",
    "COMPLETED": "✅",  # Alternative success status
    "FAILED": "❌",
    "ERROR": "❌",  # Alternative failure status
    "CANCELLED": "🚫",
    "CANCELED": "🚫",  # US spelling variant
    "STOPPED": "⏹️",
    "TIMEOUT": "⏱️",
}

# CSS colour used for the status label in the "Inspect Job" panel.
_STATUS_COLOR = {
    "QUEUED": "#FFA500",
    "STARTING": "#1E90FF",
    "RUNNING": "#00CED1",
    "SUCCEEDED": "#32CD32",
    "COMPLETED": "#32CD32",  # Alternative success status
    "FAILED": "#DC143C",
    "ERROR": "#DC143C",  # Alternative failure status
    "CANCELLED": "#696969",
    "CANCELED": "#696969",  # US spelling variant
    "STOPPED": "#A9A9A9",
    "TIMEOUT": "#FF8C00",
}

# Guidance shown under "Next Steps" for each known status label.
_NEXT_STEPS = {
    "QUEUED": "⏳ Your job is waiting in the queue. It will start soon.",
    "STARTING": "🔄 Your job is being initialized. This usually takes 1-2 minutes.",
    "RUNNING": "▶️ Your job is running! Click 'Load Logs' below to view progress.",
    "SUCCEEDED": "✅ Your job completed successfully! Check the Leaderboard tab for results.",
    "COMPLETED": "✅ Your job completed successfully! Check the Leaderboard tab for results.",
    "FAILED": "❌ Your job failed. Click 'Load Logs' below to see what went wrong.",
    "ERROR": "❌ Your job failed. Click 'Load Logs' below to see what went wrong.",
    "CANCELLED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "CANCELED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "STOPPED": "🚫 Your job was stopped. You can submit a new job from the 'New Evaluation' tab.",
    "TIMEOUT": "⏱️ Your job exceeded the time limit. Consider optimizing your model or increasing the timeout.",
}

_INTRO_MARKDOWN = """
# 🔍 Job Monitoring

Monitor your HuggingFace Jobs in real-time. Check job status, view logs, and track evaluation progress.
"""

_INSPECT_INTRO_MARKDOWN = """
### Inspect a Specific Job

Enter a HuggingFace Job ID to view its status and logs.
"""

_RECENT_INTRO_MARKDOWN = """
### Your Recent Jobs

View a list of your recent HuggingFace Jobs.
"""

_GUIDE_MARKDOWN = """
### Using Job Monitoring

#### How to Get Your Job ID

After submitting an evaluation from the "New Evaluation" tab, you'll receive:
- **Run ID (SMOLTRACE)**: Used for tracking results in datasets (e.g., `job_3a22ceca`)
- **HF Job ID**: Used for monitoring the actual job (e.g., `kshitijthakkar/691eb073748f86bfa7144fcc`)

Use the **HF Job ID** here to monitor your job.

#### Job Status Values

- **QUEUED**: Job is waiting to start
- **STARTING**: Job is being initialized
- **RUNNING**: Job is currently executing
- **SUCCEEDED**: Job completed successfully
- **FAILED**: Job encountered an error
- **CANCELLED**: Job was manually cancelled
- **STOPPED**: Job was stopped by the system

#### CLI Commands Reference

You can also use the HuggingFace CLI to monitor jobs:

```bash
# List your running jobs
hf jobs ps

# Inspect a specific job
hf jobs inspect <job_id>

# View logs from a job
hf jobs logs <job_id>

# Follow logs in real-time
hf jobs logs <job_id> --follow

# Cancel a job
hf jobs cancel <job_id>
```

#### Tips

- 💡 **Bookmark your Job ID** after submission for easy access
- 🔄 **Use auto-refresh** for logs when job is running
- 📊 **Check status regularly** to catch any issues early
- 📝 **Review logs** if your job fails to understand what went wrong
- 🎯 **Results appear in leaderboard** once job succeeds and uploads datasets
"""

_TOKEN_MISSING_MARKDOWN = """
### ⚠️ HuggingFace Token Not Configured

**Action Required**:
1. Go to "⚙️ Settings" in the sidebar
2. Enter your HuggingFace token (must have "Run Jobs" permission)
3. Click "💾 Save API Keys"
4. Return to this tab and try again
"""

# Appended to _TOKEN_MISSING_MARKDOWN on the "Recent Jobs" tab only.
_TOKEN_REQUIREMENTS_MARKDOWN = """
**Note**: Your HF token must:
- Start with `hf_`
- Have **Read**, **Write**, AND **Run Jobs** permissions
- Be from a HuggingFace Pro account ($9/month)

Get your token at: https://huggingface.co/settings/tokens
"""

_STATUS_ERROR_TEMPLATE = """
### ❌ Failed to Fetch Job Status

**Error**: {error_msg}

**Job ID**: `{job_id}`

**Troubleshooting**:
- Verify the Job ID format is correct (format: `username/job_hash`)
- Check that the job exists in your account
- Ensure your HF token has the correct permissions
- Token must have **Run Jobs** permission enabled
"""

_STATUS_TEMPLATE = """
### {status_emoji} Job Status: <span style="color: {status_color};">{status_str}</span>

**Job ID**: `{job_id}`

#### Details

- **Created**: {created_at}
- **Hardware**: {flavor}
- **Job URL**: {job_url_display}

#### Next Steps

{next_steps}

---

💡 **Tip**: Use "📥 Load Logs" button below to view detailed execution logs and check progress.
"""

_TOKEN_TROUBLESHOOTING = """
**Troubleshooting**:
- ⚠️ **Token may be invalid** - Regenerate your token at HuggingFace settings
- ✅ Ensure token has **Run Jobs** permission (not just Read/Write)
- ✅ Verify you have an active **HuggingFace Pro account**
- ✅ Token should start with `hf_`
"""

_GENERIC_TROUBLESHOOTING = """
**Troubleshooting**:
- Refresh this page and try again
- Check your internet connection
- Verify HuggingFace services are operational
"""

_LIST_ERROR_TEMPLATE = """
### ❌ Failed to Fetch Jobs

**Error**: {error_msg}

{troubleshooting}
"""

_NO_JOBS_MARKDOWN = """
### ℹ️ No Jobs Found

You haven't submitted any jobs yet.

**Get Started**:
1. Go to the "New Evaluation" tab
2. Configure your model and settings
3. Submit an evaluation job
4. Come back here to monitor progress!
"""


def _normalize_status(status) -> str:
    """Return *status* as an upper-case label, or "UNKNOWN" when it is empty.

    Enum members stringify as "ClassName.MEMBER", which would never match the
    lookup tables, so a member's ``value`` is preferred when it has one.
    """
    if not status:
        return "UNKNOWN"
    raw = getattr(status, "value", status)
    return str(raw).strip().upper() or "UNKNOWN"


def _hf_token_configured() -> bool:
    """Return True when the HF_TOKEN environment variable holds a non-blank value."""
    token = os.environ.get("HF_TOKEN")
    return bool(token and token.strip())


def _get_next_steps(status: str) -> str:
    """Get next steps based on job status"""
    return _NEXT_STEPS.get(
        _normalize_status(status),
        "❓ Unknown status. Try refreshing or check the HF Jobs dashboard.",
    )


def create_job_monitoring_screen():
    """
    Create the job monitoring screen for HF Jobs

    Builds a hidden Column with three tabs (Inspect Job, Recent Jobs, Guide)
    and wires the buttons to handlers that call the helpers in
    ``utils.hf_jobs_submission``. Must be called inside a ``gr.Blocks`` context.

    Returns:
        gr.Column: Gradio Column component for job monitoring
    """
    with gr.Column(visible=False) as job_monitoring_interface:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Tabs():
            # Tab 1: Single Job Inspection
            with gr.Tab("📋 Inspect Job"):
                gr.Markdown(_INSPECT_INTRO_MARKDOWN)

                with gr.Row():
                    job_id_input = gr.Textbox(
                        label="HF Job ID",
                        placeholder="e.g., kshitijthakkar/691eb073748f86bfa7144fcc",
                        info="Format: username/job_hash"
                    )

                with gr.Row():
                    inspect_btn = gr.Button("🔍 Inspect Job", variant="primary")
                    refresh_btn = gr.Button("🔄 Refresh", variant="secondary")

                # Job Status Section
                with gr.Accordion("📊 Job Status", open=True):
                    job_status_display = gr.Markdown("Enter a Job ID and click 'Inspect Job' to view status")

                # Job Logs Section
                with gr.Accordion("📜 Job Logs", open=True):
                    with gr.Row():
                        show_logs_btn = gr.Button("📥 Load Logs", variant="secondary")
                        # NOTE(review): this checkbox is rendered but no event
                        # reads it, so ticking it currently does nothing.
                        auto_refresh_logs = gr.Checkbox(
                            label="Auto-refresh logs (every 5s)",
                            value=False
                        )

                    job_logs_display = gr.Code(
                        label="Job Logs",
                        language="shell",
                        value="Click 'Load Logs' to view job output",
                        lines=20
                    )

            # Tab 2: Recent Jobs List
            with gr.Tab("📑 Recent Jobs"):
                gr.Markdown(_RECENT_INTRO_MARKDOWN)

                with gr.Row():
                    list_jobs_btn = gr.Button("📋 Load Recent Jobs", variant="primary")
                    jobs_limit = gr.Slider(
                        minimum=5,
                        maximum=50,
                        value=10,
                        step=5,
                        label="Number of jobs to fetch"
                    )

                recent_jobs_display = gr.Markdown("Click 'Load Recent Jobs' to view your jobs")

            # Tab 3: Job Monitoring Guide
            with gr.Tab("📖 Guide"):
                gr.Markdown(_GUIDE_MARKDOWN)

        # Functions for job monitoring
        def inspect_job(job_id: str):
            """Inspect a specific job's status and render it as Markdown."""
            # Strip once so the API call and the rendered ID always agree.
            job_id = (job_id or "").strip()
            if not job_id:
                return gr.update(value="❌ Please enter a Job ID")

            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(value=_TOKEN_MISSING_MARKDOWN)

            from utils.hf_jobs_submission import check_job_status

            result = check_job_status(job_id)

            if not result.get("success"):
                return gr.update(
                    value=_STATUS_ERROR_TEMPLATE.format(
                        error_msg=result.get('error', 'Unknown error'),
                        job_id=job_id,
                    )
                )

            status_str = _normalize_status(result.get("status", "unknown"))
            job_url = result.get("url", None)

            return gr.update(
                value=_STATUS_TEMPLATE.format(
                    status_emoji=_STATUS_EMOJI.get(status_str, "❓"),
                    status_color=_STATUS_COLOR.get(status_str, "#888888"),
                    status_str=status_str,
                    job_id=job_id,
                    # check_job_status always sets these keys (possibly to
                    # None), so a .get() default alone would never apply.
                    created_at=result.get("created_at") or "N/A",
                    flavor=result.get("flavor") or "N/A",
                    # Format job URL as clickable link
                    job_url_display=f"[Open in HuggingFace]({job_url})" if job_url else "N/A",
                    next_steps=_get_next_steps(status_str),
                )
            )

        def load_job_logs(job_id: str):
            """Load logs for a specific job"""
            job_id = (job_id or "").strip()
            if not job_id:
                return gr.update(value="❌ Please enter a Job ID first")

            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(
                    value="⚠️ HuggingFace Token Not Configured\n\nPlease configure your HF token in Settings first."
                )

            from utils.hf_jobs_submission import get_job_logs

            result = get_job_logs(job_id)

            if not result.get("success"):
                return gr.update(
                    value=f"❌ Failed to fetch logs: {result.get('error', 'Unknown error')}\n\nEnsure your HF token has 'Run Jobs' permission."
                )

            logs = result.get("logs", "")
            if not logs or not logs.strip():
                return gr.update(value="ℹ️ No logs available yet. Job may not have started.\n\nTry refreshing after a minute.")

            return gr.update(value=logs)

        def list_recent_jobs(limit: int):
            """List user's recent jobs as a Markdown table."""
            from utils.hf_jobs_submission import list_user_jobs

            # Check if token is configured before making API call
            if not _hf_token_configured():
                return gr.update(value=_TOKEN_MISSING_MARKDOWN + _TOKEN_REQUIREMENTS_MARKDOWN)

            # The slider may deliver a float; the API helper slices with it.
            result = list_user_jobs(limit=int(limit))

            if not result.get("success"):
                error_msg = str(result.get('error', 'Unknown error'))
                lowered = error_msg.lower()

                # Check for common error patterns
                if "invalid" in lowered or "token" in lowered:
                    troubleshooting = _TOKEN_TROUBLESHOOTING
                else:
                    troubleshooting = _GENERIC_TROUBLESHOOTING

                return gr.update(
                    value=_LIST_ERROR_TEMPLATE.format(
                        error_msg=error_msg,
                        troubleshooting=troubleshooting,
                    )
                )

            jobs = result.get("jobs", [])
            if not jobs:
                return gr.update(value=_NO_JOBS_MARKDOWN)

            # Build jobs table
            parts = [
                "### 📋 Your Recent Jobs\n\n",
                "| Job ID | Status | Created At |\n",
                "|--------|--------|------------|\n",
            ]
            for job in jobs:
                listed_id = job.get("job_id") or "N/A"
                created = job.get("created_at") or "N/A"
                status_str = _normalize_status(job.get("status", "unknown"))
                status_emoji = _STATUS_EMOJI.get(status_str, "❓")
                # Show the same normalised label the emoji was looked up with.
                parts.append(f"| `{listed_id}` | {status_emoji} {status_str} | {created} |\n")

            parts.append(f"\n**Total Jobs**: {len(jobs)}\n\n")
            parts.append("💡 **Tip**: Copy a Job ID and paste it in the 'Inspect Job' tab to view details and logs.")

            return gr.update(value="".join(parts))

        # Wire up button events
        inspect_btn.click(
            fn=inspect_job,
            inputs=[job_id_input],
            outputs=[job_status_display]
        )

        # "Refresh" re-runs the same inspection for the ID currently entered.
        refresh_btn.click(
            fn=inspect_job,
            inputs=[job_id_input],
            outputs=[job_status_display]
        )

        show_logs_btn.click(
            fn=load_job_logs,
            inputs=[job_id_input],
            outputs=[job_logs_display]
        )

        list_jobs_btn.click(
            fn=list_recent_jobs,
            inputs=[jobs_limit],
            outputs=[recent_jobs_display]
        )

        # TODO: auto-refresh is NOT implemented yet. The `auto_refresh_logs`
        # checkbox has no event attached; logs only reload on button click.
        # Consider using gr.Timer or similar for automatic refreshes.

    return job_monitoring_interface
434
+
435
+
436
if __name__ == "__main__":
    # Standalone preview: host the screen in its own Blocks app so it can be
    # exercised without the rest of the application.
    with gr.Blocks() as standalone_app:
        monitoring_screen = create_job_monitoring_screen()
        # The screen is built with visible=False; reveal it for this preview.
        monitoring_screen.visible = True
    standalone_app.launch()
utils/hf_jobs_submission.py CHANGED
@@ -245,20 +245,191 @@ def _auto_select_hf_hardware(provider: str, model: str) -> str:
245
  return "t4-small"
246
 
247
 
248
- def check_job_status(job_id: str, hf_token: Optional[str] = None) -> Dict:
249
  """
250
- Check the status of a HuggingFace Job
251
 
252
  Args:
253
- job_id: Job ID to check
254
  hf_token: HuggingFace token (optional, uses env if not provided)
255
 
256
  Returns:
257
  dict: Job status information
258
  """
259
- # Placeholder for when HF Jobs API becomes available
260
- return {
261
- "job_id": job_id,
262
- "status": "unknown",
263
- "message": "HuggingFace Jobs status API not yet available programmatically"
264
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  return "t4-small"
246
 
247
 
248
_HUB_MISSING_ERROR = "huggingface_hub package not installed"
_TOKEN_MISSING_ERROR = "HuggingFace token not configured"
_INVALID_JOB_ID_ERROR = "Invalid job ID: expected 'username/job_hash' or 'job_hash'"


def _resolve_hf_token(hf_token: Optional[str]) -> Optional[str]:
    """Return the explicit token, else HF_TOKEN from the environment; None if both are blank."""
    for candidate in (hf_token, os.environ.get("HF_TOKEN")):
        cleaned = candidate.strip() if candidate else ""
        if cleaned:
            return cleaned
    return None


def _job_call_kwargs(hf_job_id: str) -> Optional[Dict]:
    """Translate "username/job_hash" or "job_hash" into HfApi keyword arguments.

    Returns None when the ID is blank or either side of the slash is empty,
    so callers can reject it without making an API request.
    """
    cleaned = (hf_job_id or "").strip()
    if not cleaned:
        return None
    if "/" not in cleaned:
        return {"job_id": cleaned}
    namespace, job_hash = (part.strip() for part in cleaned.split("/", 1))
    if not namespace or not job_hash:
        return None
    return {"job_id": job_hash, "namespace": namespace}


def _stage_label(job) -> str:
    """Return the job's status stage as a plain string ("unknown" if absent)."""
    status = getattr(job, "status", None)
    if status is None:
        return "unknown"
    if not hasattr(status, "stage"):
        return str(status)
    stage = status.stage
    if stage is None:
        return "unknown"
    # An Enum stage would stringify as "ClassName.MEMBER"; use its value.
    return str(getattr(stage, "value", stage))


def _optional_str(value) -> Optional[str]:
    """Stringify *value*, keeping None as None instead of the text "None"."""
    return None if value is None else str(value)


def check_job_status(hf_job_id: str, hf_token: Optional[str] = None) -> Dict:
    """
    Check the status of a HuggingFace Job using the Jobs API

    Args:
        hf_job_id: HF Job ID (format: username/job_hash or just job_hash)
        hf_token: HuggingFace token (optional, uses env if not provided)

    Returns:
        dict: Always contains "success" and "job_id". On success also
        "status" (plain string), "created_at", "flavor", "url" and "info";
        on failure an "error" message. No exception is propagated.
    """
    call_kwargs = _job_call_kwargs(hf_job_id)
    if call_kwargs is None:
        return {"success": False, "error": _INVALID_JOB_ID_ERROR, "job_id": hf_job_id}

    try:
        from huggingface_hub import HfApi
    except ImportError:
        return {"success": False, "error": _HUB_MISSING_ERROR, "job_id": hf_job_id}

    token = _resolve_hf_token(hf_token)
    if not token:
        return {"success": False, "error": _TOKEN_MISSING_ERROR, "job_id": hf_job_id}

    try:
        job_info = HfApi(token=token).inspect_job(**call_kwargs)
        return {
            "success": True,
            "job_id": hf_job_id,
            "status": _stage_label(job_info),
            "created_at": _optional_str(getattr(job_info, "created_at", None)),
            "flavor": getattr(job_info, "flavor", None),
            "url": getattr(job_info, "url", None),
            "info": str(job_info)
        }
    except Exception as e:
        # Boundary for the UI: every API/network failure becomes an error dict.
        return {
            "success": False,
            "error": f"Failed to fetch job status: {str(e)}",
            "job_id": hf_job_id
        }


def get_job_logs(hf_job_id: str, hf_token: Optional[str] = None) -> Dict:
    """
    Retrieve logs from a HuggingFace Job

    Args:
        hf_job_id: HF Job ID (format: username/job_hash or just job_hash)
        hf_token: HuggingFace token (optional, uses env if not provided)

    Returns:
        dict: "success", "job_id" and, on success, "logs" (lines joined with
        newlines); on failure an "error" message. No exception is propagated.
    """
    call_kwargs = _job_call_kwargs(hf_job_id)
    if call_kwargs is None:
        return {
            "success": False,
            "error": _INVALID_JOB_ID_ERROR,
            "job_id": hf_job_id,
            "logs": ""
        }

    try:
        from huggingface_hub import HfApi
    except ImportError:
        return {"success": False, "error": _HUB_MISSING_ERROR, "job_id": hf_job_id}

    token = _resolve_hf_token(hf_token)
    if not token:
        return {"success": False, "error": _TOKEN_MISSING_ERROR, "job_id": hf_job_id}

    try:
        logs_iterable = HfApi(token=token).fetch_job_logs(**call_kwargs)

        # NOTE(review): join() drains the whole iterable. If fetch_job_logs
        # streams while a job is still running, this may block until the
        # stream ends — confirm against the huggingface_hub documentation.
        logs = "\n".join(logs_iterable)

        return {
            "success": True,
            "job_id": hf_job_id,
            "logs": logs
        }
    except Exception as e:
        # Boundary for the UI: every API/network failure becomes an error dict.
        return {
            "success": False,
            "error": f"Failed to fetch job logs: {str(e)}",
            "job_id": hf_job_id,
            "logs": ""
        }


def list_user_jobs(hf_token: Optional[str] = None, limit: int = 10) -> Dict:
    """
    List recent jobs for the authenticated user

    Args:
        hf_token: HuggingFace token (optional, uses env if not provided)
        limit: Maximum number of jobs to return (applied after fetching);
            zero or a negative value returns every job

    Returns:
        dict: "success" plus, on success, "jobs" (list of dicts with
        "job_id", "status", "created_at") and "count"; on failure "error".
    """
    try:
        from huggingface_hub import HfApi
    except ImportError:
        return {"success": False, "error": _HUB_MISSING_ERROR}

    token = _resolve_hf_token(hf_token)
    if not token:
        return {"success": False, "error": _TOKEN_MISSING_ERROR}

    try:
        # list_jobs() is called without a limit, so fetch everything and slice.
        all_jobs = list(HfApi(token=token).list_jobs())

        max_jobs = int(limit)  # tolerate float values coming from UI sliders
        jobs_to_display = all_jobs[:max_jobs] if max_jobs > 0 else all_jobs

        job_list = []
        for job in jobs_to_display:
            # Build job_id in the format: owner/id (owner omitted if unknown)
            owner_name = getattr(getattr(job, "owner", None), "name", None)
            job_hash = getattr(job, "id", None)
            if job_hash is None:
                job_id = "unknown"
            elif owner_name:
                job_id = f"{owner_name}/{job_hash}"
            else:
                job_id = job_hash

            job_list.append({
                "job_id": job_id,
                "status": _stage_label(job),
                "created_at": _optional_str(getattr(job, "created_at", None))
            })

        return {
            "success": True,
            "jobs": job_list,
            "count": len(job_list)
        }
    except Exception as e:
        # Boundary for the UI: every API/network failure becomes an error dict.
        return {
            "success": False,
            "error": f"Failed to list jobs: {str(e)}",
            "jobs": []
        }