#!/usr/bin/env python3
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.repocard import metadata_load
from collections import Counter
import numpy as np

import pandas as pd
import streamlit as st


# Module-level accumulators: retrieve_data() appends to them and main() counts
# their contents. Entries are stored exactly as found in the model metadata.
ALL_LANGUAGES = []  # every "language" value found in a model's metadata
ALL_LICENSES = []  # every "license" value found in a model's metadata


def get_model_ids_and_tags():
    """List every model on the Hugging Face Hub with its tags.

    Returns:
        A ``(model_ids, tags)`` pair of equal-length lists: ``model_ids[i]``
        is the ``modelId`` of the i-th listed model and ``tags[i]`` is that
        model's list of tags (empty when the model reports none).
    """
    api = HfApi()
    # Materialize the listing once. Depending on the huggingface_hub version,
    # ``list_models`` may return a one-shot iterator; iterating it twice would
    # then silently produce an empty ``tags`` list.
    models = list(api.list_models(full=True))
    model_ids = [x.modelId for x in models]
    # Normalize a missing tags value to an empty list so callers can flatten
    # the result without special-casing None.
    tags = [x.tags or [] for x in models]
    return model_ids, tags


def get_metadatas(model_ids):
    """Download each model's README.md and parse its metadata.

    Args:
        model_ids: Iterable of Hub model ids to look up.

    Returns:
        A dict mapping each model id to the value ``metadata_load`` returned
        for its README.md. Models whose README could not be downloaded or
        parsed are reported on stdout and left out of the dict.
    """
    metadatas = {}
    for model_id in model_ids:
        try:
            readme_path = hf_hub_download(model_id, filename="README.md")
            metadatas[model_id] = metadata_load(readme_path)
        except Exception as err:
            # Best effort: one broken model must not abort the whole scan.
            # Catch ``Exception`` rather than using a bare ``except`` so that
            # Ctrl-C / SystemExit still stop the loop, and report the actual
            # error since a missing README is only one possible cause.
            print(f"{model_id}: could not load README.md metadata ({err!r})")
    return metadatas


def retrieve_data(metadatas):
    """Collect the language and license values from parsed model metadata.

    Every non-None value of ``metadatas`` is inspected: its "language" entry,
    when present, is appended to ``ALL_LANGUAGES`` and its "license" entry,
    when present, to ``ALL_LICENSES``. Values are appended unchanged.

    Args:
        metadatas: Mapping whose values are metadata mappings or None.

    Returns:
        None; the module-level lists are updated in place.
    """
    # Models without parsed metadata are stored as None; skip them up front.
    parsed_cards = (card for card in metadatas.values() if card is not None)

    for card in parsed_cards:
        if "language" in card:
            ALL_LANGUAGES.append(card["language"])
        if "license" in card:
            ALL_LICENSES.append(card["license"])


@st.cache(persist=True)
def main():
    """Build the language, license and tag frequency tables.

    Lists all models, loads their README metadata, gathers the declared
    languages and licenses, and counts how often each value occurs. Tags are
    counted from the model listing itself.

    Returns:
        A ``(lang_data_frame, license_data_frame, tags_data_frame)`` tuple of
        pandas DataFrames. Each frame has one column per distinct value
        (columns sorted by label) holding that value's count.
    """
    # 0. Get model ids
    model_ids, tags = get_model_ids_and_tags()

    # 1. Retrieve metadatas
    metadatas = get_metadatas(model_ids)

    # 2. Parse to results
    retrieve_data(metadatas)

    def clean_lists(list_like):
        """Flatten metadata values into a single list of strings.

        Plain strings are kept, lists consisting only of strings are spliced
        in, and anything else is dropped.
        """
        clean_list = []
        for item in list_like:
            if isinstance(item, str):
                clean_list.append(item)
            elif isinstance(item, list) and all(isinstance(x, str) for x in item):
                # Extend in place: ``clean_list + item`` copies the whole
                # accumulated list on every iteration (quadratic overall).
                clean_list.extend(item)
        return clean_list

    def counter_to_frame(counter):
        """Turn a Counter into a frame with one sorted column per key."""
        frame = pd.DataFrame.from_dict(counter, orient="index")
        return frame.sort_index().transpose()

    # 3. count data
    lang_counter = Counter(clean_lists(ALL_LANGUAGES))
    license_counter = Counter(clean_lists(ALL_LICENSES))

    # 4. count tags
    # Stream the tags straight into the Counter instead of ``sum(tags, [])``,
    # which rebuilds the flattened list once per model. ``or ()`` tolerates a
    # model whose tags entry is None.
    tags_counter = Counter(
        tag for model_tags in tags for tag in model_tags or ()
    )

    # 5. change to frame
    lang_data_frame = counter_to_frame(lang_counter)
    license_data_frame = counter_to_frame(license_counter)
    tags_data_frame = counter_to_frame(tags_counter)

    return lang_data_frame, license_data_frame, tags_data_frame
    
    
@st.cache
def convert_df(df):
    """Serialize ``df`` to CSV and return it as UTF-8 encoded bytes.

    The result is cached so the conversion is not recomputed on every rerun
    of the script.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')


def _render_table_section(title, data_frame, button_label, file_name, total_label):
    """Render one table section: title, table, CSV download and column count.

    Args:
        title: Heading shown above the table.
        data_frame: Frame to display and offer for download.
        button_label: Text on the download button.
        file_name: File name suggested for the downloaded CSV.
        total_label: Text written next to the number of columns.
    """
    st.title(title)
    st.dataframe(data_frame, width=600, height=1200)
    st.download_button(
        label=button_label,
        data=convert_df(data_frame),
        file_name=file_name,
        mime='text/csv',
    )
    # Each distinct value is one column, so the last shape entry is the total.
    st.write(total_label, data_frame.shape[-1])


lang_data_frame, license_data_frame, tags_data_frame = main()

_render_table_section(
    "All Languages",
    lang_data_frame,
    "Download Languages Table as CSV",
    'all_languages.csv',
    "Total num of languages",
)

_render_table_section(
    "All Licenses",
    license_data_frame,
    "Download Licenses Table as CSV",
    'all_licenses.csv',
    "Total num of licenses",
)

_render_table_section(
    "All Tags",
    tags_data_frame,
    "Download Tags Table as CSV",
    'all_tags.csv',
    "Total num of different tags",
)