#!/usr/bin/env python3
"""Streamlit app tabulating languages, licenses and tags of Hub models.

The script lists models through ``HfApi``, downloads each model's
``README.md``, reads its metadata block, counts the declared languages,
licenses and tags, and renders each count as a one-row table with a CSV
download button.
"""
from collections import Counter

import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.repocard import metadata_load

# Module-level accumulators: filled by retrieve_data(), read by main().
ALL_LANGUAGES = []
ALL_LICENSES = []


def get_model_ids_and_tags():
    """Return two parallel lists: model ids and the tag list of each model."""
    api = HfApi()
    # Materialise the listing once. It is iterated twice below, and if
    # list_models() hands back a one-shot iterator the second pass would
    # silently see nothing.
    models = list(api.list_models(full=True))
    model_ids = [x.modelId for x in models]
    # A model without tags contributes an empty list rather than None so the
    # tag count in main() can iterate every entry.
    tags = [x.tags or [] for x in models]
    return model_ids, tags


def get_metadatas(model_ids):
    """Download each model's README.md and load its metadata.

    Args:
        model_ids: iterable of model ids to fetch.

    Returns:
        Dict mapping model id to whatever ``metadata_load`` returned for its
        README. Models whose README could not be fetched or parsed are left
        out.
    """
    metadatas = {}
    for model_id in model_ids:
        try:
            readme_path = hf_hub_download(model_id, filename="README.md")
            metadatas[model_id] = metadata_load(readme_path)
        except Exception as err:
            # Best effort: one failing model must not abort the whole scan.
            # Catching Exception (not a bare except) keeps Ctrl-C working, and
            # the real error is reported because a failure here is not
            # necessarily a missing README.
            print(f"{model_id}: could not load README.md metadata ({err!r})")
    return metadatas


def retrieve_data(metadatas):
    """Append every "language" / "license" metadata value to the globals.

    Args:
        metadatas: dict of model id to metadata dict (or None, which is
            skipped).
    """
    for metadata in metadatas.values():
        if metadata is None:
            continue
        if "language" in metadata:
            ALL_LANGUAGES.append(metadata["language"])
        if "license" in metadata:
            ALL_LICENSES.append(metadata["license"])


def _clean_lists(list_like):
    """Flatten a mix of strings and lists of strings into one list of strings.

    Items that are neither a string nor a list made only of strings are
    dropped.
    """
    clean_list = []
    for item in list_like:
        if isinstance(item, str):
            clean_list.append(item)
        elif isinstance(item, list) and all(isinstance(x, str) for x in item):
            # extend() grows the list in place instead of copying it on
            # every merge.
            clean_list.extend(item)
    return clean_list


def _counter_to_frame(counter):
    """Turn a Counter into a one-row DataFrame with label-sorted columns."""
    frame = pd.DataFrame.from_dict(counter, orient="index")
    return frame.sort_index().transpose()


@st.cache(persist=True)
def main():
    """Collect Hub metadata and return the three count tables.

    Returns:
        Tuple ``(lang_data_frame, license_data_frame, tags_data_frame)``;
        each frame has one column per distinct value and a single row
        holding its count.
    """
    # 0. Get model ids
    model_ids, tags = get_model_ids_and_tags()

    # 1. Retrieve metadatas
    metadatas = get_metadatas(model_ids)

    # 2. Parse to results
    retrieve_data(metadatas)

    # 3. count data
    lang_counter = Counter(_clean_lists(ALL_LANGUAGES))
    license_counter = Counter(_clean_lists(ALL_LICENSES))

    # 4. count tags (flattened lazily; sum(tags, []) re-copies the
    # accumulated list for every model)
    tags_counter = Counter(tag for model_tags in tags for tag in model_tags)

    # 5. change to frame
    lang_data_frame = _counter_to_frame(lang_counter)
    license_data_frame = _counter_to_frame(license_counter)
    tags_data_frame = _counter_to_frame(tags_counter)

    return lang_data_frame, license_data_frame, tags_data_frame


@st.cache
def convert_df(df):
    """Serialise ``df`` to UTF-8 encoded CSV bytes."""
    # IMPORTANT: Cache the conversion to prevent computation on every rerun
    return df.to_csv().encode('utf-8')


def _render_table(title, frame, download_label, file_name, count_label):
    """Render one section: title, table, CSV download button, column count."""
    st.title(title)
    st.dataframe(frame, width=600, height=1200)
    st.download_button(
        label=download_label,
        data=convert_df(frame),
        file_name=file_name,
        mime='text/csv',
    )
    # Frames are one row wide, so the last shape entry is the number of
    # distinct values.
    st.write(count_label, frame.shape[-1])


lang_data_frame, license_data_frame, tags_data_frame = main()

_render_table(
    "All Languages",
    lang_data_frame,
    "Download Languages Table as CSV",
    'all_languages.csv',
    "Total num of languages",
)
_render_table(
    "All Licenses",
    license_data_frame,
    "Download Licenses Table as CSV",
    'all_licenses.csv',
    "Total num of licenses",
)
_render_table(
    "All Tags",
    tags_data_frame,
    "Download Tags Table as CSV",
    'all_tags.csv',
    "Total num of different tags",
)