|
import glob
import json
import os
import shutil
import sys
import urllib
import urllib.parse
from collections import defaultdict
from datetime import datetime
from statistics import mean

import pandas as pd
import requests

from constants import BASE_WHISPERKIT_BENCHMARK_URL
from text_normalizer import text_normalizer
from utils import compute_average_wer, download_dataset
|
|
|
|
|
def fetch_evaluation_data(url, timeout=30):
    """
    Fetches evaluation data from the given URL.

    :param url: The URL to fetch the evaluation data from.
    :param timeout: Seconds to wait for the server before giving up, so that a
        stalled connection cannot hang the script indefinitely.
    :returns: The evaluation data parsed from the JSON response body.
    :raises SystemExit: If the request cannot be completed or the response
        status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad
        # status code instead of surfacing as an unhandled traceback.
        sys.exit(f"Failed to fetch WhisperKit evals: {error}")

    if response.status_code != 200:
        sys.exit(f"Failed to fetch WhisperKit evals: {response.text}")

    return json.loads(response.text)
|
|
|
|
|
def generate_device_map(base_dir):
    """
    Generates a mapping of device identifiers to their corresponding device models.

    Every JSON file whose name contains "summary" under ``base_dir`` (searched
    recursively) is read. Files that provide both "deviceIdentifier" and
    "deviceModel" contribute one entry. Files that cannot be read or parsed are
    reported on stdout and skipped. The resulting map is also written to
    "dashboard_data/device_map.json".

    :param base_dir: The base directory to search for summary files.
    :returns: A dictionary mapping device identifiers to device models.
    """
    device_map = {}

    # glob returns files in filesystem order; sorting makes the result
    # reproducible when several summaries report the same identifier
    # (the last path in sorted order wins).
    summary_files = sorted(
        glob.glob(f"{base_dir}/**/*summary*.json", recursive=True)
    )

    for file_path in summary_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            if "deviceModel" in data and "deviceIdentifier" in data:
                device_map[data["deviceIdentifier"]] = data["deviceModel"]
        except json.JSONDecodeError:
            print(f"Error reading {file_path}")
        except Exception as e:
            # Best effort: one unreadable summary must not abort the whole run.
            print(f"Error processing {file_path}: {e}")

    output_path = "dashboard_data/device_map.json"

    # Create the output directory on a fresh checkout instead of failing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(device_map, f, indent=4, sort_keys=True)

    return device_map
|
|
|
|
|
def get_device_name(device):
    """
    Resolves a device identifier through the saved device map.

    The map is read from "dashboard_data/device_map.json" on every call.

    :param device: String representing the device identifier or name.
    :returns: The mapped device model when the identifier is present in the map,
        otherwise the input itself; in both cases spaces are replaced with
        underscores.
    """
    with open("dashboard_data/device_map.json", "r") as map_file:
        known_devices = json.load(map_file)

    try:
        name = known_devices[device]
    except KeyError:
        # Unknown identifiers pass through unchanged (apart from the spaces).
        name = device
    return name.replace(" ", "_")
|
|
|
|
|
def process_benchmark_file(file_path, dataset_dfs, results, releases):
    """
    Processes a single benchmark file and updates the results dictionary.

    :param file_path: Path to the benchmark JSON file. The name of its parent
        directory is split on "_" into the commit timestamp and the commit hash.
    :param dataset_dfs: Dictionary of DataFrames containing dataset information,
        keyed by dataset name. Each DataFrame is read through its "file" and
        "wer" columns.
    :param results: Dictionary to store the processed results. Entries are
        accessed by key before being assigned, so it is expected to create
        missing entries on access (e.g. a defaultdict).
    :param releases: Collection of commit hashes to include; benchmark files
        belonging to any other commit are ignored.

    This function reads a benchmark JSON file, extracts relevant information,
    and updates the results dictionary with various metrics including WER,
    speed, tokens per second, and quality of inference (QoI).

    The function returns without touching ``results`` when the file holds no
    test results or when its commit is not listed in ``releases``.
    """
    with open(file_path, "r") as file:
        test_results = json.load(file)

    if len(test_results) == 0:
        return

    # os.path copes with whichever separator os.path.join produced, unlike a
    # hard-coded split on "/".
    commit_hash_timestamp = os.path.basename(os.path.dirname(file_path))
    commit_timestamp, commit_hash = commit_hash_timestamp.split("_")

    if commit_hash not in releases:
        return

    # Metadata shared by the whole file is taken from its first test result.
    first_test_result = test_results[0]
    model = first_test_result["testInfo"]["model"]
    device = first_test_result["testInfo"]["device"]
    dataset_name = first_test_result["testInfo"]["datasetDir"]
    if "iPhone" in device or "iPad" in device:
        version_numbers = first_test_result["staticAttributes"]["osVersion"].split(".")
        # Drop a trailing ".0" patch component, e.g. "17.5.0" -> "17.5".
        if len(version_numbers) == 3 and version_numbers[-1] == "0":
            version_numbers.pop()
        os_name = "iOS" if "iPhone" in device else "iPadOS"
        os_info = f"{os_name}_{'.'.join(version_numbers)}"
    else:
        os_info = f"macOS_{first_test_result['staticAttributes']['osVersion']}"
    timestamp = first_test_result["testInfo"]["date"]

    key = (model, device, os_info, commit_timestamp)

    # Loop-invariant lookups and assignments are done once per file.
    dataset_df = dataset_dfs[dataset_name]
    entry = results[key]
    entry["timestamp"] = timestamp
    entry["commit_hash"] = commit_hash
    entry["commit_timestamp"] = commit_timestamp
    dataset_speed = entry["dataset_speed"][dataset_name]
    dataset_tps = entry["dataset_tokens_per_second"][dataset_name]

    for test_result in test_results:
        test_info = test_result["testInfo"]
        audio_file_name = test_info["audioFile"]

        entry["average_wer"].append(
            {
                "prediction": text_normalizer(test_info["prediction"]),
                "reference": text_normalizer(test_info["reference"]),
            }
        )

        timings = test_info["timings"]
        input_audio_seconds = timings["inputAudioSeconds"]
        full_pipeline = timings["fullPipeline"]
        total_decoding_loops = timings["totalDecodingLoops"]

        dataset_speed["inputAudioSeconds"] += input_audio_seconds
        dataset_speed["fullPipeline"] += full_pipeline
        entry["speed"]["inputAudioSeconds"] += input_audio_seconds
        entry["speed"]["fullPipeline"] += full_pipeline

        dataset_tps["totalDecodingLoops"] += total_decoding_loops
        dataset_tps["fullPipeline"] += full_pipeline
        entry["tokens_per_second"]["totalDecodingLoops"] += total_decoding_loops
        entry["tokens_per_second"]["fullPipeline"] += full_pipeline

        audio = audio_file_name.split(".")[0]
        if dataset_name == "earnings22-10mins":
            # Only the part before the first "-" is used to find the reference row.
            audio = audio.split("-")[0]

        # regex=False: the file stem is a literal substring, not a pattern, so
        # characters such as "+" or "(" in a file name cannot change the match.
        # .iloc[0] raises IndexError when no reference row matches.
        matches = dataset_df["file"].str.contains(audio, regex=False)
        dataset_row = dataset_df.loc[matches].iloc[0]
        reference_wer = dataset_row["wer"]
        prediction_wer = test_info["wer"]

        # QoI sample: 1 when this run's WER is no worse than the reference WER.
        entry["qoi"].append(1 if prediction_wer <= reference_wer else 0)
|
|
|
|
|
def process_summary_file(file_path, results, releases):
    """
    Processes a summary file and updates the results dictionary with device support information.

    :param file_path: Path to the summary JSON file.
    :param results: Dictionary to store the processed results. It must already
        hold the "modelsTested" and "devices" sets; per-device entries are stored
        under (device identifier, OS label) keys.
    :param releases: Collection of commit hashes to include; summaries from any
        other commit are ignored.

    For each device/OS combination only the summary with the most recent commit
    timestamp is kept. A model listed in the summary's failure info is treated
    as failed unless it still produced results for as many datasets as a model
    that did not fail.
    """
    with open(file_path, "r") as summary_fh:
        summary = json.load(summary_fh)

    if summary["commitHash"] not in releases:
        return

    device = summary["deviceIdentifier"]
    os_family = "iPadOS" if "iPad" in device else summary["osType"]
    os_label = f"{os_family} {summary['osVersion']}"
    commit_timestamp = summary["commitTimestamp"]

    key = (device, os_label)
    if key not in results:
        results[key] = {}
    else:
        timestamp_format = "%Y-%m-%dT%H%M%S"
        recorded = datetime.strptime(results[key]["commitTimestamp"], timestamp_format)
        incoming = datetime.strptime(commit_timestamp, timestamp_format)
        # Keep the entry already stored unless this summary is strictly newer.
        if incoming <= recorded:
            return

    supported_models = set(summary["modelsTested"])
    test_results = summary["testResults"]
    failure_info = summary["failureInfo"]

    # Expected number of datasets per model: taken from the first model without
    # a recorded failure, falling back to 2 when every model has one.
    dataset_count = next(
        (len(runs) for name, runs in test_results.items() if name not in failure_info),
        2,
    )

    # A failure entry only counts when the model did not complete every dataset.
    failed_models = {
        name
        for name in failure_info
        if not (name in test_results and len(test_results[name]) == dataset_count)
    }
    supported_models -= failed_models

    results[key].update(
        {
            "supportedModels": supported_models,
            "commitTimestamp": commit_timestamp,
            "failedModels": (failed_models, file_path),
        }
    )
    results["modelsTested"].update(supported_models)
    results["devices"].add(device)
|
|
|
|
|
def calculate_and_save_performance_results(
    performance_results, performance_output_path
):
    """
    Calculates final performance metrics and saves them to a JSON file.

    :param performance_results: Dictionary containing raw performance data,
        keyed by (model, device, os_info, commit timestamp) tuples.
    :param performance_output_path: Path to save the processed performance results.
    :returns: List of (model, device, os_info) tuples whose overall speed factor
        is below 1.0; no entry is written for those combinations.

    The output file contains one JSON object per line, each representing a
    unique combination of model, device, and OS.
    """
    too_slow = []
    with open(performance_output_path, "w") as out_file:
        for (model, device, os_info, _), metrics in performance_results.items():
            overall = metrics["speed"]
            speed = round(overall["inputAudioSeconds"] / overall["fullPipeline"], 2)

            # Combinations slower than real time are reported back to the
            # caller instead of being written out.
            if speed < 1.0:
                too_slow.append((model, device, os_info))
                continue

            entry = {}
            entry["model"] = model.replace("_", "/")
            entry["device"] = get_device_name(device).replace("_", " ")
            entry["os"] = os_info.replace("_", " ")
            entry["timestamp"] = metrics["timestamp"]
            entry["speed"] = speed

            token_totals = metrics["tokens_per_second"]
            entry["tokens_per_second"] = round(
                token_totals["totalDecodingLoops"] / token_totals["fullPipeline"], 2
            )

            per_dataset_speed = {}
            for dataset, totals in metrics["dataset_speed"].items():
                per_dataset_speed[dataset] = round(
                    totals["inputAudioSeconds"] / totals["fullPipeline"], 2
                )
            entry["dataset_speed"] = per_dataset_speed

            per_dataset_tps = {}
            for dataset, totals in metrics["dataset_tokens_per_second"].items():
                per_dataset_tps[dataset] = round(
                    totals["totalDecodingLoops"] / totals["fullPipeline"], 2
                )
            entry["dataset_tokens_per_second"] = per_dataset_tps

            entry["average_wer"] = compute_average_wer(metrics["average_wer"])
            entry["qoi"] = round(mean(metrics["qoi"]), 2)
            entry["commit_hash"] = metrics["commit_hash"]
            entry["commit_timestamp"] = metrics["commit_timestamp"]

            json.dump(entry, out_file)
            out_file.write("\n")
    return too_slow
|
|
|
|
|
def calculate_and_save_support_results(
    support_results, not_supported, support_output_path
):
    """
    Calculates device support results and saves them to a CSV file.

    :param support_results: Dictionary containing device support information.
        The "modelsTested" and "devices" entries hold the table's rows and
        columns; every other key is a (device, OS label) tuple describing one
        summary file.
    :param not_supported: Iterable of (model, device, os_info) tuples that is
        forwarded to remove_unsupported_cells.
    :param support_output_path: Path to save the processed support results.

    This function processes the device support data and creates a CSV file
    showing which models are supported on different devices and OS versions,
    using checkmarks, warning signs, question marks or Not Supported to
    indicate support status. Warning entries link to the summary file that
    recorded the failure. When a device has several OS entries, the additional
    ones are appended to the cell wrapped in <p> tags.
    """
    all_models = sorted(support_results["modelsTested"])
    all_devices = sorted(set(support_results["devices"]))

    df = pd.DataFrame(index=all_models, columns=["Model"] + all_devices)

    # Everything derived from a summary entry is independent of the model, so
    # it is computed once here rather than once per model.
    summaries = []
    for key, data in support_results.items():
        if key in ("modelsTested", "devices"):
            continue
        device, os_label = key
        failed_models, file_path = data["failedModels"]
        # os.path copes with whichever separator the path was built with.
        commit_dir = os.path.basename(os.path.dirname(file_path))
        summary_file = os.path.basename(file_path)
        url = (
            f"{BASE_WHISPERKIT_BENCHMARK_URL}/{commit_dir}/"
            f"{urllib.parse.quote(summary_file)}"
        )
        summaries.append(
            (device, os_label, data["supportedModels"], failed_models, url)
        )

    for model in all_models:
        row = {"Model": model}
        row.update(dict.fromkeys(all_devices, ""))

        for device, os_label, supported_models, failed_models, url in summaries:
            if model in supported_models:
                label = f"✅ {os_label}"
            elif model in failed_models:
                label = (
                    "⚠️ <a style='color: #3B82F6; text-decoration: underline; "
                    f"text-decoration-style: dotted;' href={url}>{os_label}</a>"
                )
            else:
                # The summary neither lists the model as supported nor as failed.
                label = f"? {os_label}"

            current_value = row[device]
            row[device] = (
                label if current_value == "" else f"{current_value}<p>{label}</p>"
            )

        df.loc[model] = row

    remove_unsupported_cells(df, not_supported)

    # Swap device identifiers in the header for readable device names.
    df.columns = ["Model"] + [
        get_device_name(col).replace("_", " ") for col in df.columns if col != "Model"
    ]

    df.to_csv(support_output_path, index=True)
|
|
|
|
|
def remove_unsupported_cells(df, not_supported):
    """
    Updates the DataFrame in place to mark unsupported model-device combinations.

    The "device_support" list in "dashboard_data/config.json" pairs device
    identifiers with the models supported on them. Every cell whose column name
    contains one of an entry's identifiers, and whose model is not in that
    entry's supported list, is set to "Not Supported". The combinations passed
    in ``not_supported`` are then marked the same way.

    :param df: A Pandas DataFrame where the index represents models and columns represent devices.
    :param not_supported: Iterable of (model, device, os_info) tuples; only the
        model and device are used to locate the cell.
    """
    marker = "Not Supported"

    with open("dashboard_data/config.json", "r") as config_file:
        device_support = json.load(config_file)["device_support"]

    for entry in device_support:
        device_ids = set(entry["identifiers"])
        allowed_models = set(entry["models"]["supported"])

        for model in df.index:
            if model in allowed_models:
                continue
            for column in df.columns:
                # Substring match: a config identifier may be a prefix of the
                # full device identifier used as the column name.
                if any(device_id in column for device_id in device_ids):
                    df.at[model, column] = marker

    for model, device, _ in not_supported:
        df.at[model, device] = marker
|
|
|
|
|
def main():
    """
    Main function to orchestrate the performance data generation process.

    This function performs the following steps:
    1. Downloads benchmark data if the first command line argument is "download".
    2. Fetches evaluation data for various datasets.
    3. Processes benchmark files and summary files.
    4. Calculates and saves performance and support results.
    """
    source_xcresult_repo = "argmaxinc/whisperkit-evals-dataset"
    source_xcresult_subfolder = "benchmark_data/"
    source_xcresult_directory = f"{source_xcresult_repo}/{source_xcresult_subfolder}"
    if len(sys.argv) > 1 and sys.argv[1] == "download":
        try:
            shutil.rmtree(source_xcresult_repo)
        except FileNotFoundError:
            print("Nothing to remove.")
        except OSError as error:
            # Still best effort, but the real reason is reported and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Could not remove {source_xcresult_repo}: {error}")
        download_dataset(
            source_xcresult_repo, source_xcresult_repo, source_xcresult_subfolder
        )

    # Every dataset name maps to one of two reference evaluation files.
    earnings22_url = "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json"
    librispeech_url = "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true"
    datasets = {
        "Earnings-22": earnings22_url,
        "LibriSpeech": librispeech_url,
        "earnings22-10mins": earnings22_url,
        "librispeech-10mins": librispeech_url,
        "earnings22-12hours": earnings22_url,
        "librispeech": librispeech_url,
    }

    # Download each distinct URL once; every dataset name still gets its own
    # DataFrame.
    evals_by_url = {}
    dataset_dfs = {}
    for dataset_name, url in datasets.items():
        if url not in evals_by_url:
            evals_by_url[url] = fetch_evaluation_data(url)
        dataset_dfs[dataset_name] = pd.json_normalize(evals_by_url[url]["results"])

    # Accumulator created on first access for each
    # (model, device, os_info, commit timestamp) key.
    performance_results = defaultdict(
        lambda: {
            "average_wer": [],
            "qoi": [],
            "speed": {"inputAudioSeconds": 0, "fullPipeline": 0},
            "tokens_per_second": {"totalDecodingLoops": 0, "fullPipeline": 0},
            "dataset_speed": defaultdict(
                lambda: {"inputAudioSeconds": 0, "fullPipeline": 0}
            ),
            "dataset_tokens_per_second": defaultdict(
                lambda: {"totalDecodingLoops": 0, "fullPipeline": 0}
            ),
            "timestamp": None,
            "commit_hash": None,
            "commit_timestamp": None,
        }
    )

    support_results = {"modelsTested": set(), "devices": set()}

    # Writes dashboard_data/device_map.json, which get_device_name reads later.
    generate_device_map(source_xcresult_directory)

    with open("dashboard_data/version.json", "r") as f:
        version = json.load(f)
    releases = set(version["releases"])

    for subdir, _, files in os.walk(source_xcresult_directory):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(subdir, filename)
            if "summary" in filename:
                process_summary_file(file_path, support_results, releases)
            else:
                process_benchmark_file(
                    file_path, dataset_dfs, performance_results, releases
                )

    not_supported = calculate_and_save_performance_results(
        performance_results, "dashboard_data/performance_data.json"
    )
    calculate_and_save_support_results(
        support_results, not_supported, "dashboard_data/support_data.csv"
    )
|
|
|
|
|
# Run the generation pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|