"""Detect new WhisperKit benchmark data on the Hugging Face Hub.

Compares the dataset's current revision against a local cache and emits a
has_updates flag to GITHUB_OUTPUT for downstream workflow steps.
"""

import json
import os
from datetime import datetime, timedelta

from github import Github
from huggingface_hub import HfApi, login
|
def check_dataset_updates(dataset_id): |
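    """Check the Hugging Face dataset for new WhisperKit benchmark commits.

    Compares the dataset's current revision against the local
    dashboard_data/version.json cache, records any new commit hashes and
    their release versions, and writes a has_updates flag to GITHUB_OUTPUT.
    """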
|
    api = HfApi()
    github = Github(os.environ["GH_TOKEN"])

    repo = github.get_repo("argmaxinc/whisperkit")

    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha

    # Benchmark runs are stored as top-level folders under benchmark_data/.
    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )
|
    # Only keep benchmark runs from the last six weeks.
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)

    # Each entry under benchmark_data/ is named "<timestamp>_<commit-hash>".
    commit_dates_hashes = [item.path.split("/")[-1] for item in repo_tree]
    new_commit_hashes = []

    for commit_date_hash in commit_dates_hashes:
        commit_date, commit_hash = commit_date_hash.split("_")
        commit_date = datetime.strptime(commit_date, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        if commit_date < cutoff_date:
            continue
        new_commit_hashes.append(commit_hash)
|
    # Resolve each commit to its date and the WhisperKit release it shipped in.
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            commit_date = commit.commit.author.date
            version = get_commit_version(repo, commit_hash)
            if version:
                commit_info.append((commit_hash, commit_date, version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {e}")
            continue

    # Sort chronologically so releases and versions stay in commit-date order.
    commit_info.sort(key=lambda x: x[1])

    new_releases = [info[0] for info in commit_info]
    new_versions = [info[2] for info in commit_info]
|
    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")

    # Load the cached state once; fall back to an empty dict on a first run
    # so a missing cache file does not crash the job.
    cached_data = {}
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)

    # If the dataset revision has not changed, there is nothing to do.
    if cached_data.get("sha") == current_sha:
        with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
            print("has_updates=false", file=fh)
        return

    releases = cached_data.get("releases", [])
    versions = cached_data.get("versions", [])

    # Record only the commits not already present in the cache.
    updated_releases = []
    updated_versions = []
    for release, version in zip(new_releases, new_versions):
        if release not in releases:
            updated_releases.append(release)
            updated_versions.append(version)
|
    # Persist the updated cache and signal the workflow that new data exists.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": releases + updated_releases,
                "versions": versions + updated_versions,
            },
            f,
        )

    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)
|
|
def get_commit_version(repo, commit_hash): |
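    """Map commit_hash to a WhisperKit release version (tag without the
    leading "v"): the first release created at or after the commit, falling
    back to the latest release. Returns None if resolution fails."""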
|
    try:
        releases = list(repo.get_releases())
        releases.sort(key=lambda x: x.created_at)

        commit = repo.get_commit(commit_hash)
        commit_date = commit.commit.author.date

        # The first release created at or after the commit is the one
        # that first shipped it.
        for release in releases:
            if commit_date <= release.created_at:
                return release.tag_name.lstrip("v")

        # Commit is newer than every release: attribute it to the latest.
        return releases[-1].tag_name.lstrip("v")
    except Exception as e:
        print(f"Error processing commit {commit_hash}: {e}")
        return None
|
|
if __name__ == "__main__":
    # Requires HF_TOKEN, GH_TOKEN, and GITHUB_OUTPUT in the environment
    # (the latter is provided automatically by GitHub Actions).
    login(token=os.environ["HF_TOKEN"])
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")