more factorization
data.py CHANGED

@@ -6,7 +6,8 @@ import threading
 import traceback
 import json
 import re
-
+import random
+from typing import List, Tuple, Optional, Dict
 
 # NOTE: if caching is an issue, try adding `use_listings_cache=False`
 fs = HfFileSystem()

@@ -56,7 +57,35 @@ KEYS_TO_KEEP = [
     "job_link_nvidia",
 ]
 
+# HELPER FUNCTIONS
+def generate_fake_dates(num_days: int = 7) -> List[str]:
+    """Generate fake dates for the last N days."""
+    today = datetime.now()
+    return [(today - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]
 
+def parse_json_field(value) -> dict:
+    """Safely parse a JSON field that might be a string or dict."""
+    if isinstance(value, str):
+        try:
+            return json.loads(value)
+        except:
+            return {}
+    return value if isinstance(value, dict) else {}
+
+def extract_date_from_path(path: str, pattern: str) -> Optional[str]:
+    """Extract date from file path using regex pattern."""
+    match = re.search(pattern, path)
+    return match.group(1) if match else None
+
+def get_test_names(tests: list) -> set:
+    """Extract test names from a list of test dictionaries."""
+    return {test.get('line', '') for test in tests}
+
+def safe_extract(row: pd.Series, key: str) -> int:
+    """Safely extract an integer value from a DataFrame row."""
+    return int(row.get(key, 0)) if pd.notna(row.get(key, 0)) else 0
+
+# DATA LOADING FUNCTIONS
 def log_dataframe_link(link: str) -> str:
     """
     Adds the link to the dataset in the logs, modifies it to get a clockable link and then returns the date of the
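
For context, a minimal sketch of how the new helpers behave (illustrative only, not part of the commit; the toy values are assumptions):

    # parse_json_field accepts either a JSON string or an already-decoded dict
    assert parse_json_field('{"single": []}') == {"single": []}
    assert parse_json_field({"multi": []}) == {"multi": []}
    assert parse_json_field("not json") == {}  # malformed input degrades to {}

    # get_test_names pulls the 'line' field out of failure entries
    failures = [{"line": "tests/test_a.py::test_forward"}, {}]
    assert get_test_names(failures) == {"tests/test_a.py::test_forward", ""}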

@@ -109,69 +138,37 @@ def read_one_dataframe(json_path: str, device_label: str) -> tuple[pd.DataFrame, str]:
 def get_available_dates() -> List[str]:
     """Get list of available dates from both AMD and NVIDIA datasets."""
     try:
-        # Get AMD dates - structure is: YYYY-MM-DD/runs/{run_id}/ci_results_run_models_gpu/model_results.json
+        # Get file lists
         amd_src = "hf://datasets/optimum-amd/transformers_daily_ci/**/runs/**/ci_results_run_models_gpu/model_results.json"
-        files_amd = sorted(fs.glob(amd_src, refresh=True), reverse=True)
-        logger.info(f"Found {len(files_amd)} AMD files")
-
-        # Get NVIDIA dates - structure is: YYYY-MM-DD/ci_results_run_models_gpu/model_results.json
         nvidia_src = "hf://datasets/hf-internal-testing/transformers_daily_ci/*/ci_results_run_models_gpu/model_results.json"
+
+        files_amd = sorted(fs.glob(amd_src, refresh=True), reverse=True)
         files_nvidia = sorted(fs.glob(nvidia_src, refresh=True), reverse=True)
-        logger.info(f"Found {len(files_nvidia)} NVIDIA files")
 
-        # Extract dates from file paths
-        amd_dates = set()
-        for file_path in files_amd:
-            # Pattern to match the date in the AMD path: YYYY-MM-DD/runs/{run_id}/ci_results_run_models_gpu/model_results.json
-            pattern = r'transformers_daily_ci/(\d{4}-\d{2}-\d{2})/runs/[^/]+/ci_results_run_models_gpu/model_results\.json'
-            match = re.search(pattern, file_path)
-            if match:
-                amd_dates.add(match.group(1))
-            else:
-                # Log unmatched paths for debugging
-                logger.debug(f"AMD file path didn't match pattern: {file_path}")
-
-        if files_amd:
-            logger.info(f"Example AMD file paths: {files_amd[:3]}")
+        logger.info(f"Found {len(files_amd)} AMD files, {len(files_nvidia)} NVIDIA files")
 
-        nvidia_dates = set()
-        for file_path in files_nvidia:
-            pattern = r'transformers_daily_ci/(\d{4}-\d{2}-\d{2})/ci_results_run_models_gpu/model_results\.json'
-            match = re.search(pattern, file_path)
-            if match:
-                nvidia_dates.add(match.group(1))
+        # Extract dates using patterns
+        amd_pattern = r'transformers_daily_ci/(\d{4}-\d{2}-\d{2})/runs/[^/]+/ci_results_run_models_gpu/model_results\.json'
+        nvidia_pattern = r'transformers_daily_ci/(\d{4}-\d{2}-\d{2})/ci_results_run_models_gpu/model_results\.json'
 
+        amd_dates = {extract_date_from_path(f, amd_pattern) for f in files_amd}
+        amd_dates.discard(None)  # Remove None values
 
+        nvidia_dates = {extract_date_from_path(f, nvidia_pattern) for f in files_nvidia}
+        nvidia_dates.discard(None)
+
+        logger.info(f"AMD dates: {sorted(amd_dates, reverse=True)[:5]}...")
+        logger.info(f"NVIDIA dates: {sorted(nvidia_dates, reverse=True)[:5]}...")
+
+        # Return intersection of both datasets
         common_dates = sorted(amd_dates.intersection(nvidia_dates), reverse=True)
         logger.info(f"Common dates: {len(common_dates)} dates where both AMD and NVIDIA have data")
 
-        if common_dates:
-            return common_dates[:30]  # Limit to last 30 days for performance
-        else:
-            # If no real dates available, generate fake dates for the last 7 days
-            logger.warning("No real dates available, generating fake dates for demo purposes")
-            fake_dates = []
-            today = datetime.now()
-            for i in range(7):
-                date = today - timedelta(days=i)
-                fake_dates.append(date.strftime("%Y-%m-%d"))
-            return fake_dates
+        return common_dates[:30] if common_dates else generate_fake_dates()
 
     except Exception as e:
         logger.error(f"Error getting available dates: {e}")
-
-        logger.info("Generating fake dates due to error")
-        fake_dates = []
-        today = datetime.now()
-        for i in range(7):
-            date = today - timedelta(days=i)
-            fake_dates.append(date.strftime("%Y-%m-%d"))
-        return fake_dates
+        return generate_fake_dates()
 
 
 def get_data_for_date(target_date: str) -> tuple[pd.DataFrame, str]:
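
A quick illustration of the pattern-based date extraction (illustrative only; the path below is made up):

    path = "hf://datasets/hf-internal-testing/transformers_daily_ci/2024-05-01/ci_results_run_models_gpu/model_results.json"
    pattern = r'transformers_daily_ci/(\d{4}-\d{2}-\d{2})/ci_results_run_models_gpu/model_results\.json'
    assert extract_date_from_path(path, pattern) == "2024-05-01"
    assert extract_date_from_path("unrelated/path.json", pattern) is None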

@@ -242,37 +239,30 @@ def get_historical_data(start_date: str, end_date: str, sample_data = False) ->
     """Get historical data for a date range."""
     if sample_data:
         return get_fake_historical_data(start_date, end_date)
+
     try:
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
-
         historical_data = []
-        current_dt = start_dt
 
+        # Load data for each day in range
+        current_dt = start_dt
         while current_dt <= end_dt:
             date_str = current_dt.strftime("%Y-%m-%d")
             try:
                 df, _ = get_data_for_date(date_str)
-                # Only add non-empty dataframes
                 if not df.empty:
                     df['date'] = date_str
                     historical_data.append(df)
                     logger.info(f"Loaded data for {date_str}")
-                else:
-                    logger.warning(f"No data available for {date_str}")
             except Exception as e:
                 logger.warning(f"Could not load data for {date_str}: {e}")
-
             current_dt += timedelta(days=1)
 
-
-        combined_df = pd.concat(historical_data, ignore_index=False)
-        return combined_df
+        return pd.concat(historical_data, ignore_index=False) if historical_data else pd.DataFrame()
 
     except Exception as e:
         logger.error(f"Error getting historical data: {e}")
-        # Fall back to fake data when there's an error
-        logger.info("Falling back to fake historical data due to error")
         return get_fake_historical_data(start_date, end_date)
 
 
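
The new single-expression return also guards against calling `pd.concat` on an empty list, which raises a ValueError. A sketch (illustrative only, not part of the commit):

    import pandas as pd

    frames = []  # e.g. no day in the range had data
    result = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()
    assert result.empty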

@@ -326,49 +316,36 @@ def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
     try:
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
-
-        # Generate fake data for each date in the range
-        historical_data = []
-        current_dt = start_dt
-
-        # Get base sample data to use as template
         sample_df, _ = get_sample_data()
+        historical_data = []
 
+        # Generate data for each date
+        current_dt = start_dt
         while current_dt <= end_dt:
-            date_str = current_dt.strftime("%Y-%m-%d")
-
-            # Create a copy of sample data for this date with some random variations
             date_df = sample_df.copy()
-            date_df['date'] = date_str
+            date_df['date'] = current_dt.strftime("%Y-%m-%d")
 
-            # Add some random variations
-            import random
+            # Add random variations to make it realistic
             for idx in date_df.index:
-                # Vary success/skipped counts
+                # Vary success/skipped counts (±20%)
                 for col in ['success_amd', 'success_nvidia', 'skipped_amd', 'skipped_nvidia']:
-                    if col in date_df.columns:
-                        original_val = date_df.loc[idx, col]
-                        if original_val > 0:
-                            variation = random.uniform(0.8, 1.2)
-                            date_df.loc[idx, col] = max(0, int(original_val * variation))
+                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
+                        val = date_df.loc[idx, col]
+                        if val > 0:
+                            date_df.loc[idx, col] = max(0, int(val * random.uniform(0.8, 1.2)))
 
-                # Vary failure counts more dramatically
+                # Vary failure counts more dramatically (±50-100%)
                 for col in ['failed_multi_no_amd', 'failed_multi_no_nvidia', 'failed_single_no_amd', 'failed_single_no_nvidia']:
-                    if col in date_df.columns:
-                        original_val = date_df.loc[idx, col]
-
-                        # Sometimes have more failures, sometimes fewer
-                        variation = random.uniform(0.5, 2.0)
-                        date_df.loc[idx, col] = max(0, int(original_val * variation))
+                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
+                        val = date_df.loc[idx, col]
+                        date_df.loc[idx, col] = max(0, int(val * random.uniform(0.5, 2.0)))
 
             historical_data.append(date_df)
             current_dt += timedelta(days=1)
 
         if not historical_data:
-            logger.warning("No fake historical data generated")
             return pd.DataFrame()
 
-        # Combine all dataframes
         combined_df = pd.concat(historical_data, ignore_index=False)
         logger.info(f"Generated fake historical data: {len(combined_df)} records from {start_date} to {end_date}")
         return combined_df
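
A worked sketch of the variation arithmetic (illustrative only; the seed is just for reproducibility and is not part of the commit):

    import random

    random.seed(0)
    val = 100
    # success/skipped counts move by at most ±20%
    assert 80 <= max(0, int(val * random.uniform(0.8, 1.2))) <= 120
    # failure counts can roughly halve or double
    assert 50 <= max(0, int(val * random.uniform(0.5, 2.0))) <= 200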

@@ -377,53 +354,23 @@ def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
         logger.error(f"Error generating fake historical data: {e}")
         return pd.DataFrame()
 
-def safe_extract(row: pd.DataFrame, key: str) -> int:
-    return int(row.get(key, 0)) if pd.notna(row.get(key, 0)) else 0
-
-
 def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
-    """
-    Find the first date when a specific test failure appeared in historical data.
-    """
+    """Find the first date when a specific test failure appeared in historical data."""
     if historical_df.empty:
         return None
 
     try:
-
-        model_name_lower = model_name.lower()
-
-        # Filter historical data for this model
-        model_data = historical_df[historical_df.index == model_name_lower].copy()
-
+        model_data = historical_df[historical_df.index == model_name.lower()].copy()
         if model_data.empty:
             return None
 
-        # Sort by date to find the first occurrence
-        model_data = model_data.sort_values('date')
-
-        # Check each date for this failure
-        for idx, row in model_data.iterrows():
-            failures = row.get(f'failures_{device}', None)
-
-            if failures is None or pd.isna(failures):
-                continue
-
-            # Handle case where failures might be a string (JSON)
-            if isinstance(failures, str):
-                try:
-                    import json
-                    failures = json.loads(failures)
-                except:
-                    continue
-
-            # Check if this test appears in the failures for this gpu_type
+        # Check each date (oldest first) for this failure
+        for _, row in model_data.sort_values('date').iterrows():
+            failures = parse_json_field(row.get(f'failures_{device}'))
             if gpu_type in failures:
                 for test in failures[gpu_type]:
-                    if test.get('line', '') == test_name:
-                        # Found the first occurrence
-                        return row.get('date', None)
-
+                    if test.get('line', '') == test_name:
+                        return row.get('date')
         return None
 
     except Exception as e:
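
A toy illustration of the first-seen scan (illustrative only; the column layout and values below are assumptions inferred from the code above):

    import pandas as pd

    hist = pd.DataFrame(
        {
            "date": ["2024-05-01", "2024-05-02"],
            "failures_amd": [
                {"single": []},                                  # no failure yet
                {"single": [{"line": "tests/a.py::test_x"}]},    # first failure
            ],
        },
        index=["bert", "bert"],
    )
    assert find_failure_first_seen(hist, "bert", "tests/a.py::test_x", "amd", "single") == "2024-05-02"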

@@ -431,148 +378,89 @@ def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
         return None
 
 
+def _find_device_regressions(model_name: str, current_failures: dict, yesterday_failures: dict, device: str) -> list[dict]:
+    """Helper to find regressions for a specific device."""
+    regressions = []
+    for gpu_type in ['single', 'multi']:
+        current_tests = get_test_names(current_failures.get(gpu_type, []))
+        yesterday_tests = get_test_names(yesterday_failures.get(gpu_type, []))
+
+        # Find NEW failures: failing NOW but NOT yesterday
+        new_tests = current_tests - yesterday_tests
+        for test_name in new_tests:
+            if test_name:  # Skip empty names
+                regressions.append({
+                    'model': model_name,
+                    'test': test_name.split('::')[-1],  # Short name
+                    'test_full': test_name,  # Full name
+                    'device': device,
+                    'gpu_type': gpu_type
+                })
+    return regressions
+
 def find_new_regressions(current_df: pd.DataFrame, historical_df: pd.DataFrame) -> list[dict]:
-    """
-    Compare CURRENT failures against PREVIOUS day's failures to find NEW regressions.
-
-    A regression is a test that:
-    - Is failing in the CURRENT/LATEST run (current_df)
-    - Was NOT failing in the PREVIOUS run (yesterday in historical_df)
-    """
+    """Compare current failures against previous day's failures to find new regressions."""
     if current_df.empty or historical_df.empty:
         return []
 
-    new_regressions = []
-
-    # Get the most recent date from historical data (this is "yesterday")
+    # Get yesterday's data
     available_dates = sorted(historical_df['date'].unique(), reverse=True)
-    if not available_dates:
-        # No history to compare against
+    if not available_dates:
         return []
 
-    yesterday_date = available_dates[0]
-    yesterday_data = historical_df[historical_df['date'] == yesterday_date]
+    yesterday_data = historical_df[historical_df['date'] == available_dates[0]]
+    new_regressions = []
 
-    # For each model
+    # For each model, compare current vs yesterday
    for model_name in current_df.index:
-        model_name_lower = model_name.lower()
-
-        # Get CURRENT failures from current_df
         current_row = current_df.loc[model_name]
+        yesterday_row = yesterday_data[yesterday_data.index == model_name.lower()]
 
-        # Get YESTERDAY's failures for this model
-        yesterday_row = yesterday_data[yesterday_data.index == model_name_lower]
-        yesterday_failures_amd = {}
-        yesterday_failures_nvidia = {}
+        # Parse current failures
+        current_amd = parse_json_field(current_row.get('failures_amd', {}))
+        current_nvidia = parse_json_field(current_row.get('failures_nvidia', {}))
+
+        # Parse yesterday failures
+        yesterday_amd = {}
+        yesterday_nvidia = {}
         if not yesterday_row.empty:
             yesterday_row = yesterday_row.iloc[0]
-            yesterday_failures_amd = yesterday_row.get('failures_amd', {})
-            yesterday_failures_nvidia = yesterday_row.get('failures_nvidia', {})
-            if isinstance(yesterday_failures_amd, str):
-                try:
-                    yesterday_failures_amd = json.loads(yesterday_failures_amd)
-                except:
-                    yesterday_failures_amd = {}
-            if isinstance(yesterday_failures_nvidia, str):
-                try:
-                    yesterday_failures_nvidia = json.loads(yesterday_failures_nvidia)
-                except:
-                    yesterday_failures_nvidia = {}
+            yesterday_amd = parse_json_field(yesterday_row.get('failures_amd', {}))
+            yesterday_nvidia = parse_json_field(yesterday_row.get('failures_nvidia', {}))
 
-        # Get CURRENT failures
-        current_failures_amd = current_row.get('failures_amd', {})
-        current_failures_nvidia = current_row.get('failures_nvidia', {})
-
-        # Handle string/dict conversion
-        if isinstance(current_failures_amd, str):
-            try:
-                current_failures_amd = json.loads(current_failures_amd)
-            except:
-                current_failures_amd = {}
-        if isinstance(current_failures_nvidia, str):
-            try:
-                current_failures_nvidia = json.loads(current_failures_nvidia)
-            except:
-                current_failures_nvidia = {}
-
-        # Check AMD failures - find tests failing NOW but NOT yesterday
-        for gpu_type in ['single', 'multi']:
-            current_tests = current_failures_amd.get(gpu_type, [])
-            yesterday_tests = yesterday_failures_amd.get(gpu_type, [])
-
-            # Get test names
-            current_test_names = {test.get('line', '') for test in current_tests}
-            yesterday_test_names = {test.get('line', '') for test in yesterday_tests}
-
-            # Find NEW failures: failing NOW but NOT yesterday
-            new_tests = current_test_names - yesterday_test_names
-            for test_name in new_tests:
-                if test_name:  # Skip empty names
-                    new_regressions.append({
-                        'model': model_name,
-                        'test': test_name.split('::')[-1],  # Short name
-                        'test_full': test_name,  # Full name
-                        'device': 'amd',
-                        'gpu_type': gpu_type
-                    })
-
-        # Check NVIDIA failures - find tests failing NOW but NOT yesterday
-        for gpu_type in ['single', 'multi']:
-            current_tests = current_failures_nvidia.get(gpu_type, [])
-            yesterday_tests = yesterday_failures_nvidia.get(gpu_type, [])
-
-            # Get test names
-            current_test_names = {test.get('line', '') for test in current_tests}
-            yesterday_test_names = {test.get('line', '') for test in yesterday_tests}
-
-            # Find NEW failures: failing NOW but NOT yesterday
-            new_tests = current_test_names - yesterday_test_names
-            for test_name in new_tests:
-                if test_name:  # Skip empty names
-                    new_regressions.append({
-                        'model': model_name,
-                        'test': test_name.split('::')[-1],  # Short name
-                        'test_full': test_name,  # Full name
-                        'device': 'nvidia',
-                        'gpu_type': gpu_type
-                    })
+        # Find regressions for both devices
+        new_regressions.extend(_find_device_regressions(model_name, current_amd, yesterday_amd, 'amd'))
+        new_regressions.extend(_find_device_regressions(model_name, current_nvidia, yesterday_nvidia, 'nvidia'))
 
     return new_regressions
 
 
 def extract_model_data(row: pd.Series) -> tuple[dict[str, int], dict[str, int], int, int, int, int]:
     """Extract and process model data from DataFrame row."""
-    # Extract values safely
-    success_amd = safe_extract(row, 'success_amd')
-    success_nvidia = safe_extract(row, 'success_nvidia')
-    skipped_amd = safe_extract(row, 'skipped_amd')
-    skipped_nvidia = safe_extract(row, 'skipped_nvidia')
-    failed_multi_amd = safe_extract(row, 'failed_multi_no_amd')
-    failed_multi_nvidia = safe_extract(row, 'failed_multi_no_nvidia')
-    failed_single_amd = safe_extract(row, 'failed_single_no_amd')
-    failed_single_nvidia = safe_extract(row, 'failed_single_no_nvidia')
+    # Extract all counts
+    counts = {key: safe_extract(row, key) for key in [
+        'success_amd', 'success_nvidia', 'skipped_amd', 'skipped_nvidia',
+        'failed_multi_no_amd', 'failed_multi_no_nvidia',
+        'failed_single_no_amd', 'failed_single_no_nvidia'
+    ]}
 
-    # Calculate total failures
-    total_failed_amd = failed_multi_amd + failed_single_amd
-    total_failed_nvidia = failed_multi_nvidia + failed_single_nvidia
-    # Create stats dictionaries directly from dataframe values
+    # Create stats dictionaries
     amd_stats = {
-        'passed': success_amd,
-        'failed': total_failed_amd,
-        'skipped': skipped_amd,
-        'error': 0
+        'passed': counts['success_amd'],
+        'failed': counts['failed_multi_no_amd'] + counts['failed_single_no_amd'],
+        'skipped': counts['skipped_amd'],
+        'error': 0
     }
     nvidia_stats = {
-        'passed': success_nvidia,
-        'failed': total_failed_nvidia,
-        'skipped': skipped_nvidia,
-        'error': 0
+        'passed': counts['success_nvidia'],
+        'failed': counts['failed_multi_no_nvidia'] + counts['failed_single_no_nvidia'],
+        'skipped': counts['skipped_nvidia'],
+        'error': 0
     }
-    return amd_stats, nvidia_stats, failed_multi_amd, failed_single_amd, failed_multi_nvidia, failed_single_nvidia
+
+    return (amd_stats, nvidia_stats, counts['failed_multi_no_amd'],
+            counts['failed_single_no_amd'], counts['failed_multi_no_nvidia'],
+            counts['failed_single_no_nvidia'])
 
 
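
A minimal sketch of the set difference at the heart of `_find_device_regressions` (illustrative only; the test names are made up):

    current = {"single": [{"line": "tests/a.py::test_new"}, {"line": "tests/a.py::test_old"}]}
    yesterday = {"single": [{"line": "tests/a.py::test_old"}]}

    # Only test_new is failing now but was not failing yesterday
    regs = _find_device_regressions("bert", current, yesterday, "amd")
    assert regs == [{
        "model": "bert",
        "test": "test_new",
        "test_full": "tests/a.py::test_new",
        "device": "amd",
        "gpu_type": "single",
    }]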
@@ -643,19 +531,11 @@ class CIResults:
         """Load all available historical data at startup."""
         try:
             if not self.available_dates:
-
-                fake_dates = []
-                today = datetime.now()
-                for i in range(7):
-                    date = today - timedelta(days=i)
-                    fake_dates.append(date.strftime("%Y-%m-%d"))
-                self.available_dates = fake_dates
+                self.available_dates = generate_fake_dates()
                 logger.info(f"No available dates found, generated {len(self.available_dates)} sample dates.")
-
-            logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
-            start_date = self.available_dates[-1]  # Oldest date
-            end_date = self.available_dates[0]  # Newest date
 
+            logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
+            start_date, end_date = self.available_dates[-1], self.available_dates[0]
             self.all_historical_data = get_historical_data(start_date, end_date, self.sample_data)
             logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
         except Exception as e:
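
Note that `generate_fake_dates()` lists today first, so index 0 is the newest date and index -1 the oldest; hence the unpacking above. A sketch (illustrative only):

    dates = generate_fake_dates(3)
    # e.g. ["2024-05-03", "2024-05-02", "2024-05-01"] if today is 2024-05-03
    assert len(dates) == 3
    assert dates[0] >= dates[-1]  # ISO date strings sort chronologically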
@@ -672,17 +552,15 @@ class CIResults:
             self.historical_df = pd.DataFrame()
             return
 
-        # Filter the pre-loaded data by date
+        # Filter by date range
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
 
-        filtered_data = []
-        for date_str in self.all_historical_data['date'].unique():
-            date_dt = datetime.strptime(date_str, "%Y-%m-%d")
-            if start_dt <= date_dt <= end_dt:
-                date_data = self.all_historical_data[self.all_historical_data['date'] == date_str]
-                filtered_data.append(date_data)
+        filtered_data = [
+            self.all_historical_data[self.all_historical_data['date'] == date_str]
+            for date_str in self.all_historical_data['date'].unique()
+            if start_dt <= datetime.strptime(date_str, "%Y-%m-%d") <= end_dt
+        ]
 
         if filtered_data:
             self.historical_df = pd.concat(filtered_data, ignore_index=False)