import json import pandas as pd from statistics import mean from huggingface_hub import HfApi, create_repo from datasets import load_dataset, Dataset from datasets.data_files import EmptyDatasetError import re from constants import ( REPO_ID, HF_TOKEN, DATASETS, SHORT_DATASET_NAMES, DATASET_DESCRIPTIONS, ) api = HfApi(token=HF_TOKEN) def init_repo(): try: api.repo_info(REPO_ID, repo_type="dataset") except: create_repo(REPO_ID, repo_type="dataset", private=True, token=HF_TOKEN) def load_data(): columns = ( ["model_name", "link", "license", "overall_wer", "overall_cer"] + [f"wer_{ds}" for ds in DATASETS] + [f"cer_{ds}" for ds in DATASETS] ) try: dataset = load_dataset(REPO_ID, token=HF_TOKEN) df = dataset["train"].to_pandas() except EmptyDatasetError: df = pd.DataFrame(columns=columns) if not df.empty: df = df.sort_values("overall_wer").reset_index(drop=True) df.insert(0, "rank", df.index + 1) for col in ( ["overall_wer", "overall_cer"] + [f"wer_{ds}" for ds in DATASETS] + [f"cer_{ds}" for ds in DATASETS] ): df[col] = (df[col] * 100).round(2) best_values = {ds: df[f"wer_{ds}"].min() for ds in DATASETS} for short_ds, ds in zip(SHORT_DATASET_NAMES, DATASETS): df[short_ds] = df.apply( lambda row: f'' f"{row[f'wer_{ds}']:.2f}%", axis=1, ) df = df.drop(columns=[f"wer_{ds}", f"cer_{ds}"]) df["model_name"] = df.apply( lambda row: f'{row["model_name"]}', axis=1, ) df = df.drop(columns=["link"]) df["license"] = df["license"].apply( lambda x: "Открытая" if any( term in x.lower() for term in ["mit", "apache", "bsd", "gpl", "open"] ) else "Закрытая" ) df["rank"] = df["rank"].apply( lambda r: "🥇" if r == 1 else "🥈" if r == 2 else "🥉" if r == 3 else str(r) ) df.rename( columns={ "overall_wer": "Средний WER ⬇️", "overall_cer": "Средний CER ⬇️", "license": "Тип модели", "model_name": "Модель", "rank": "Ранг", }, inplace=True, ) table_html = df.to_html( escape=False, index=False, classes="display cell-border compact stripe" ) return f'
| Ранг | Модель | Тип модели | Средний WER ⬇️ | Средний CER ⬇️ | ' + "".join(f"{short} | " for short in SHORT_DATASET_NAMES) + "
|---|
{info["description"]}
📊 {info["num_rows"]} записей
Перед расчётом приводим текст к нижнему регистру и удаляем пунктуацию.
Сортировка по среднему WER по всем датасетам. Метрики отображаются в процентах.
Укажите WER и CER для всех датасетов в формате JSON. Значения — от 0 до 1.
{
"Russian_LibriSpeech": { "wer": 0.1234, "cer": 0.0567 },
"Common_Voice_Corpus_22.0": { "wer": 0.2345, "cer": 0.0789 },
"Tone_Webinars": { "wer": 0.3456, "cer": 0.0987 },
"Tone_Books": { "wer": 0.4567, "cer": 0.1098 },
"Tone_Speak": { "wer": 0.5678, "cer": 0.1209 },
"Sova_RuDevices": { "wer": 0.6789, "cer": 0.1310 }
}