Spaces:

nielsr
/

community-science-progress

Build error

App Files Files Community

community-science-progress / app.py

nielsr HF Staff

More improvements

2adbdb9 over 1 year ago

raw

history blame

7.2 kB

	from datetime import datetime

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	from datasets import Dataset
	from load_dataframe import get_data


	def aggregated_data(df, aggregation_level="week"):
	    """Render the share of papers with artifacts, aggregated over time.

	    Writes a heading, a metric with the average period-over-period growth
	    rate, and a line chart of the percentage of papers that have at least
	    one model, dataset or space.

	    Args:
	        df: Dataframe with the columns ``num_models``, ``num_datasets`` and
	            ``num_spaces``. It is resampled, so its index must be
	            datetime-like. The dataframe is not modified.
	        aggregation_level: ``"week"`` aggregates per week; any other value
	            aggregates per month end.
	    """
	    st.write(f"Aggregated data by {aggregation_level}")

	    # Flag papers that have any artifact. Kept as a local Series so the
	    # caller's dataframe is left untouched.
	    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

	    # Resample by week or by month end
	    freq = 'W' if aggregation_level == "week" else 'ME'
	    resampled = has_artifact.resample(freq)
	    total_papers = resampled.size()
	    papers_with_artifacts = resampled.sum()

	    # Percentage of papers with artifacts (NaN for periods without papers)
	    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

	    # Period-over-period growth rate. A jump from 0% gives +/-inf, which
	    # would dominate the mean, so those periods are dropped.
	    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
	    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

	    # Display the average growth rate as a big number
	    average_growth_rate = growth_rate.mean()
	    if pd.notna(average_growth_rate):
	        growth_label = f"{average_growth_rate:.2f}%"
	    else:
	        # Not enough data points to compute a growth rate
	        growth_label = "n/a"
	    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_label)

	    # Draw on an explicit figure instead of pyplot's global state
	    fig, ax = plt.subplots(figsize=(12, 6))
	    ax.plot(
	        percentage_papers_with_artifacts.index,
	        percentage_papers_with_artifacts,
	        marker='o',
	        linestyle='-',
	        color='b',
	        label='Percentage of Papers with at least 1 Artifact',
	    )

	    # Percentages always live on a 0-100 scale
	    ax.set_ylim(0, 100)

	    ax.set_xlabel(aggregation_level)
	    ax.set_ylabel('Percentage')
	    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
	    ax.legend()
	    ax.grid(True)

	    # Use Streamlit to display the plot, then close the figure so that
	    # figures do not accumulate across script reruns.
	    st.pyplot(fig)
	    plt.close(fig)


	def show_data_editor(df: pd.DataFrame, key: str):
	    """Show *df* in an editable Streamlit table.

	    Args:
	        df: Dataframe to display. Only the columns listed in
	            ``column_order`` are shown.
	        key: Streamlit widget key for the editor.
	    """
	    edited_df = st.data_editor(
	        df,
	        hide_index=True,
	        column_order=("reached_out", "reached_out_link", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
	        column_config={
	            "github": st.column_config.LinkColumn(),
	            "paper_page": st.column_config.LinkColumn(),
	            # Display only the text captured after the literal "|".
	            # NOTE(review): assumes values look like "<url>|<title>" - confirm.
	            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
	        },
	        width=2000,
	        key=key,
	    )

	    # Check if the dataframe has been edited
	    # TODO this is wrong
	    # rather we should probably do a merge-join (overwriting the edited rows) and then save the new dataframe
	    # if not edited_df.equals(df):
	    # save_data(edited_df)
	    # st.success("Changes saved successfully!")


	def save_data(df: pd.DataFrame, repo_id: str = "nielsr/daily-papers-enriched"):
	    """Push *df* to the Hugging Face Hub as a dataset.

	    Args:
	        df: Dataframe to upload.
	        repo_id: Hub repository to push to. Defaults to the repository
	            this app has always written to.
	    """
	    # load as HF dataset
	    dataset = Dataset.from_pandas(df)

	    dataset.push_to_hub(repo_id)


	def display_data(df: pd.DataFrame):
	    """Show summary statistics and editable tables for a set of papers.

	    Renders a markdown summary followed by three tables: papers with at
	    least one artifact, papers without artifacts, and papers whose
	    ``hf_mention`` is 1 but that have no artifacts.

	    Args:
	        df: Dataframe with the columns ``num_models``, ``num_datasets``,
	            ``num_spaces``, ``github`` and ``hf_mention``. It is not
	            modified.
	    """
	    # Callers pass boolean-filtered slices; work on a copy so the helper
	    # columns added below never write into (a view of) the caller's data.
	    df = df.copy()

	    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
	    num_papers = df.shape[0]
	    num_artifacts = df['has_artifact'].sum()
	    # Guard against division by zero when the selection is empty
	    share_with_artifact = num_artifacts / num_papers if num_papers > 0 else 0
	    percentage_of_at_least_one_artifact = round(share_with_artifact * 100, 2)

	    # add reached out and reached out link columns
	    df['reached_out'] = False
	    df['reached_out_link'] = ""

	    num_with_github = df['github'].notnull().sum()
	    st.markdown(
	        f"## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact\n"
	        "\n"
	        f"* Number of papers: {num_papers}\n"
	        f"* Number of papers with a Github link: {num_with_github}\n"
	        f"* Number of papers with at least one HF artifact: {num_artifacts}\n"
	    )

	    st.write("Papers with at least one artifact")
	    show_data_editor(df[df['has_artifact']], key="papers_with_artifacts")

	    st.write("Papers without artifacts")
	    show_data_editor(df[~df['has_artifact']], key="papers_without_artifacts")

	    st.write("Papers with a HF mention in README but no artifacts")
	    show_data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])], key="papers_with_hf_mention_no_artifacts")


	def _year_options(years, fallback):
	    """Return the distinct years in *years*, newest first, or ``[fallback]`` if there are none."""
	    distinct = {int(year) for year in pd.Series(years).dropna().unique()}
	    return sorted(distinct, reverse=True) or [fallback]


	def main():
	    """Entry point of the Streamlit dashboard.

	    The sidebar switches between a per-day/week/month view of the papers
	    and the aggregated weekly and monthly charts.
	    """
	    st.title("Hugging Face Artifacts KPI Dashboard")

	    # 2 pages: one for per-period data, one for aggregated data
	    st.sidebar.title("Navigation")
	    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

	    if selection == "Daily/weekly/monthly data":
	        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

	        # get the latest dataframe (needed by every view)
	        df = get_data()
	        today = datetime.today()

	        if view_level == "day":
	            # date picker, defaulting to today
	            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
	            # convert to a Pandas Timestamp
	            day = pd.Timestamp(day)

	            filtered_df = df[df.index.date == day.date()]

	            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
	            display_data(df=filtered_df)

	        elif view_level == "week":
	            iso_index = df.index.isocalendar()
	            iso_today = today.isocalendar()

	            # ISO years can have 53 weeks, so the picker must allow 53;
	            # otherwise the default value exceeds the maximum in week 53.
	            week_number = st.number_input("Select week", value=iso_today[1], min_value=1, max_value=53)
	            # Week numbers repeat every year, so the ISO year is needed
	            # to identify a single week.
	            year = st.selectbox("Select year", options=_year_options(iso_index["year"], iso_today[0]))

	            # Plain boolean array: avoids index alignment and leaves the
	            # loaded dataframe without an extra helper column.
	            in_week = (iso_index["week"] == week_number) & (iso_index["year"] == year)
	            filtered_df = df[in_week.to_numpy(dtype=bool, na_value=False)]

	            st.write(f"Showing data for week {week_number} of {year}")
	            display_data(df=filtered_df)

	        elif view_level == "month":
	            month_names = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

	            # month picker, defaulting to the current month
	            month_str = st.selectbox("Select month", options=month_names, index=today.month - 1)
	            year = st.selectbox("Select year", options=_year_options(df.index.year, today.year))

	            # Convert month string to number (list position is 0-based)
	            month = month_names.index(month_str) + 1
	            filtered_df = df[(df.index.month == month) & (df.index.year == year)]

	            st.write(f"Showing data for {month_str} {year}")
	            display_data(df=filtered_df)

	    elif selection == "Aggregated data":
	        # get the latest dataframe
	        df = get_data()

	        aggregated_data(df)
	        aggregated_data(df, aggregation_level="month")

	    else:
	        st.write("Error: selection not recognized")


	# Run the dashboard only when this file is executed as a script.
	if __name__ == "__main__":
	    main()