Spaces:
Build error
Build error
| from datetime import datetime | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from datasets import Dataset | |
| from load_dataframe import get_data | |
def aggregated_data(df, aggregation_level="week"):
    """Render the share of papers with artifacts over time and its average growth.

    Writes a caption, a metric with the mean period-over-period growth rate,
    and a line chart to the Streamlit page.

    Args:
        df: Frame with ``num_models``, ``num_datasets`` and ``num_spaces``
            columns. It is resampled, so its index must be datetime-like.
        aggregation_level: ``"week"`` resamples weekly; any other value
            resamples by month end.
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Kept as a standalone Series so the caller's frame is not mutated.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # 'W' buckets by week, 'ME' by month end.
    freq = 'W' if aggregation_level == "week" else 'ME'
    total_papers = has_artifact.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Periods without any paper yield 0 / 0 = NaN and show up as gaps in the plot.
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Period-over-period change of the percentage. A change from 0% is
    # infinite, so those entries are discarded before averaging.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

    # With fewer than two usable periods there is nothing to average.
    if growth_rate.empty:
        growth_label = "N/A"
    else:
        growth_label = f"{growth_rate.mean():.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_label)

    # Draw on an explicit figure instead of the global pyplot state, and close
    # it afterwards so figures do not pile up across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    st.pyplot(fig)
    plt.close(fig)
def show_data_editor(df: pd.DataFrame, key: str) -> pd.DataFrame:
    """Render *df* in an editable Streamlit table and return the edited frame.

    Args:
        df: Frame to display. Only the columns named in ``column_order`` are
            shown, in that order.
        key: Streamlit widget key for this editor.

    Returns:
        The value returned by ``st.data_editor``. Callers that only need the
        table rendered can ignore it.
    """
    edited_df = st.data_editor(
        df,
        hide_index=True,
        column_order=(
            "reached_out",
            "reached_out_link",
            "paper_page",
            "title",
            "github",
            "num_models",
            "num_datasets",
            "num_spaces",
        ),
        column_config={
            "github": st.column_config.LinkColumn(),
            "paper_page": st.column_config.LinkColumn(),
            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
        },
        width=2000,
        key=key,
    )

    # TODO: edits are not persisted yet. Comparing edited_df with df and saving
    # the edited frame would be wrong, because df is usually a filtered view.
    # The edited rows should probably be merge-joined back into the full
    # dataframe (overwriting the edited rows) before calling save_data.
    return edited_df
def save_data(df: pd.DataFrame, repo_id: str = "nielsr/daily-papers-enriched") -> None:
    """Convert *df* to a Hugging Face ``Dataset`` and push it to the Hub.

    Args:
        df: Frame to upload.
        repo_id: Hub repository that receives the dataset. Defaults to the
            repository this dashboard has always written to.
    """
    dataset = Dataset.from_pandas(df)
    dataset.push_to_hub(repo_id)
def display_data(df: pd.DataFrame):
    """Render summary statistics and three editable paper tables for *df*.

    The tables list papers with at least one artifact, papers without
    artifacts, and papers whose ``hf_mention`` is 1 but that have no artifact.

    Args:
        df: Frame with ``num_models``, ``num_datasets``, ``num_spaces``,
            ``github`` and ``hf_mention`` columns. It is not modified.
    """
    # Work on a private copy: callers may pass a filtered slice of a larger
    # frame, and adding columns to such a slice triggers pandas'
    # SettingWithCopyWarning and leaks helper columns into the caller's object.
    df = df.copy()

    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    num_papers = df.shape[0]
    num_artifacts = df['has_artifact'].sum()
    # Guard against an empty selection to avoid dividing by zero.
    share_with_artifact = num_artifacts / num_papers if num_papers > 0 else 0
    percentage_of_at_least_one_artifact = round(share_with_artifact * 100, 2)

    # Outreach-tracking columns shown first in the editor; scalar assignment
    # broadcasts to every row and keeps a proper bool / str dtype.
    df['reached_out'] = False
    df["reached_out_link"] = ""

    summary = "\n".join([
        f"## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact",
        "",
        f"* Number of papers: {num_papers}",
        f"* Number of papers with a Github link: {df['github'].notnull().sum()}",
        f"* Number of papers with at least one HF artifact: {num_artifacts}",
    ])
    st.markdown(summary)

    st.write("Papers with at least one artifact")
    show_data_editor(df[df['has_artifact']], key="papers_with_artifacts")

    st.write("Papers without artifacts")
    show_data_editor(df[~df['has_artifact']], key="papers_without_artifacts")

    st.write("Papers with a HF mention in README but no artifacts")
    show_data_editor(
        df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
        key="papers_with_hf_mention_no_artifacts",
    )
# Month names in calendar order; a name's position + 1 is its month number.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)


def _year_choices(years, fallback):
    """Return the distinct non-missing *years* as sorted ints, or ``[fallback]`` if none."""
    distinct = sorted({int(year) for year in years if pd.notna(year)})
    return distinct or [fallback]


def _default_position(options, preferred):
    """Return the position of *preferred* in *options*, or the last position if absent."""
    return options.index(preferred) if preferred in options else len(options) - 1


def _show_day_view(df):
    """Let the user pick a day (default: today) and display that day's papers."""
    day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))
    filtered_df = df[df.index.date == day.date()]

    st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
    display_data(df=filtered_df)


def _show_week_view(df):
    """Let the user pick an ISO week and year and display that week's papers."""
    today_iso = datetime.today().isocalendar()
    iso = df.index.isocalendar()

    # ISO years can have 53 weeks, so the picker must allow 53; otherwise the
    # default (the current week) could exceed max_value.
    week_number = st.number_input("Select week", value=today_iso[1], min_value=1, max_value=53)
    years = _year_choices(iso["year"], today_iso[0])
    year = st.selectbox("Select year", options=years, index=_default_position(years, today_iso[0]))

    # Match on ISO year as well as week so the same week number from different
    # years is not mixed together. The mask is computed on the side so no
    # helper column is added to the loaded frame.
    in_week = (iso["week"] == week_number) & (iso["year"] == year)
    filtered_df = df[in_week.fillna(False).to_numpy(dtype=bool)]

    st.write(f"Showing data for week {week_number} of {year}")
    display_data(df=filtered_df)


def _show_month_view(df):
    """Let the user pick a month and year (default: current) and display those papers."""
    today = datetime.today()

    month_str = st.selectbox("Select month", options=_MONTH_NAMES, index=today.month - 1)
    # Offer the years actually present in the data rather than a fixed list.
    years = _year_choices(df.index.year, today.year)
    year = st.selectbox("Select year", options=years, index=_default_position(years, today.year))

    month = _MONTH_NAMES.index(month_str) + 1
    filtered_df = df[(df.index.month == month) & (df.index.year == year)]

    st.write(f"Showing data for {month_str} {year}")
    display_data(df=filtered_df)


# Maps the "view level" choice to the function that renders it.
_VIEWS = {
    "day": _show_day_view,
    "week": _show_week_view,
    "month": _show_month_view,
}


def main():
    """Build the dashboard: a sidebar page selector plus the selected page."""
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Two pages: per-day/week/month browsing, and aggregated trends.
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    if selection == "Daily/weekly/monthly data":
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
        # get the latest dataframe
        df = get_data()
        _VIEWS[view_level](df)

    elif selection == "Aggregated data":
        # get the latest dataframe
        df = get_data()

        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")


if __name__ == "__main__":
    main()