# Page header captured with the file (not program code): Santosh · auto push · aa595c3 · raw · history blame · 25.2 kB
# import gradio as gr
# import polars as pl
# # Path for the combined Parquet file
# COMBINED_PARQUET_PATH = "datasetcards.parquet"
# ROWS_PER_PAGE = 50
# # Lazy load dataset
# lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)
# lazy_df = lazy_df.sort(
# by=["downloads", "last_modified"],
# descending=[True, True]
# )
# # Helper function to fetch a page
# def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
# filtered_df = lazy_df
# if column and query:
# query_lower = query.lower().strip()
# filtered_df = filtered_df.with_columns([
# pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
# ]).filter(pl.col(column).str.contains(query_lower, literal=False))
# start = page * ROWS_PER_PAGE
# page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
# # Replace NaN/None with empty string for display
# page_df = page_df.fillna("")
# total_rows = filtered_df.collect().height
# total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
# return page_df, total_pages
# # Initialize first page
# initial_df, total_pages = get_page(lazy_df, 0)
# columns = list(initial_df.columns)
# with gr.Blocks() as demo:
# gr.Markdown("## Dataset Insight Portal")
# gr.Markdown("This space allows you to explore the dataset of DatasetCards.<br>"
# "You can navigate pages, search within columns, and inspect the dataset easily.<br>"
# )
# # Pagination controls
# with gr.Row():
# prev_btn = gr.Button("Previous", elem_id="small-btn")
# next_btn = gr.Button("Next", elem_id="small-btn")
# page_number = gr.Number(value=0, label="Page", precision=0)
# total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
# # Data table
# data_table = gr.Dataframe(
# value=initial_df, headers=columns, datatype="str",
# interactive=False, row_count=ROWS_PER_PAGE
# )
# # Column search
# with gr.Row():
# col_dropdown = gr.Dropdown(choices=columns, label="Column")
# search_text = gr.Textbox(label="Search")
# search_btn = gr.Button("Search", elem_id="small-btn")
# reset_btn = gr.Button("Reset", elem_id="small-btn")
# # --- Functions ---
# current_lazy_df = lazy_df # single dataset
# def next_page_func(page, column, query):
# page += 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# if page >= total_pages:
# page = total_pages - 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def prev_page_func(page, column, query):
# page -= 1
# page = max(0, page)
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def search_func(column, query):
# page_df, total_pages = get_page(current_lazy_df, 0, column, query)
# return page_df, f"Total Pages: {total_pages}", 0
# def reset_func():
# page_df, total_pages = get_page(current_lazy_df, 0)
# return page_df, f"Total Pages: {total_pages}", 0
# # --- Event Listeners ---
# next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
# reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])
# demo.launch()
# import gradio as gr
# import polars as pl
# COMBINED_PARQUET_PATH = "datasetcards.parquet"
# ROWS_PER_PAGE = 50
# # Load dataset
# df = pl.read_parquet(COMBINED_PARQUET_PATH) # eager DataFrame
# # Columns with dropdown instead of text search
# DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword"]
# # Get unique values for the dropdown columns
# unique_values = {
# col: sorted(df[col].drop_nulls().unique().to_list()) for col in DROPDOWN_COLUMNS
# }
# # Get page helper
# def get_page(df, page, column, query):
# filtered_df = df
# if column and query:
# if column in DROPDOWN_COLUMNS:
# # Exact match from dropdown
# filtered_df = filtered_df.filter(pl.col(column) == query)
# else:
# # Text search
# q = query.lower().strip()
# filtered_df = (
# filtered_df.with_columns([
# pl.col(column).str.to_lowercase().alias(column)
# ])
# .filter(pl.col(column).str.contains(q, literal=False))
# )
# start = page * ROWS_PER_PAGE
# page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
# total_rows = filtered_df.height
# total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
# return page_df, total_pages
# # Initial page
# initial_df, total_pages = get_page(df, 0, None, "")
# columns = list(initial_df.columns)
# # Build Gradio app
# with gr.Blocks() as demo:
# gr.Markdown("## Dataset Insight Portal")
# gr.Markdown(
# "This space allows you to explore the dataset of DatasetCards.<br>"
# "You can navigate pages, search within columns, and inspect the dataset easily.<br>"
# )
# with gr.Row():
# prev_btn = gr.Button("Previous")
# next_btn = gr.Button("Next")
# page_number = gr.Number(value=0, label="Page", precision=0)
# total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
# data_table = gr.Dataframe(
# value=initial_df,
# headers=columns,
# datatype="str",
# interactive=False,
# row_count=ROWS_PER_PAGE,
# )
# with gr.Row():
# col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
# search_text = gr.Textbox(label="Search Text")
# search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
# search_btn = gr.Button("Search")
# reset_btn = gr.Button("Reset")
# # Show dropdown only for certain columns
# def update_search_input(column):
# if column in DROPDOWN_COLUMNS:
# return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False)
# else:
# return gr.update(visible=False), gr.update(visible=True)
# col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])
# # Search function
# def search_func(page, column, txt, ddl):
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", 0
# def next_page(page, column, txt, ddl):
# page += 1
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# if page >= total_pages:
# page = total_pages - 1
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def prev_page(page, column, txt, ddl):
# page = max(0, page - 1)
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def reset_func():
# page_df, total_pages = get_page(df, 0, None, "")
# return page_df, f"Total Pages: {total_pages}", 0, "", ""
# # Wire events
# inputs = [page_number, col_dropdown, search_text, search_dropdown]
# outputs = [data_table, total_pages_display, page_number]
# search_btn.click(search_func, inputs, outputs)
# next_btn.click(next_page, inputs, outputs)
# prev_btn.click(prev_page, inputs, outputs)
# reset_btn.click(reset_func, [], outputs + [search_text, search_dropdown])
# demo.launch()
# import gradio as gr
# import polars as pl
# from huggingface_hub import HfApi
# import re
# # --- Hugging Face Org ---
# org_name = "hugging-science"
# api = HfApi()
# def fetch_members():
# members = api.list_organization_members(org_name)
# return [member.username for member in members]
# member_list = fetch_members()
# # --- Dataset ---
# COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
# UPDATED_PARQUET_PATH = "datasetcards_new.parquet"
# ROWS_PER_PAGE = 50
# # df = pl.read_parquet(COMBINED_PARQUET_PATH)
# df = pl.read_parquet(COMBINED_PARQUET_PATH)
# df = df.with_columns([
# pl.lit("todo").alias("status"),
# pl.lit("").alias("assigned_to")
# ]).sort(by=["downloads", "last_modified", "usedStorage"], descending=[True, True, True])
# if "reason" in df.columns:
# df = df.with_columns([
# pl.Series(
# "reason",
# ["short description" if x and "short description" in x.lower() else (x if x is not None else "") for x in df["reason"]]
# )
# ])
# # Add editable columns if missing
# for col in ["assigned_to", "status"]:
# if col not in df.columns:
# default_val = "" if col == "assigned_to" else "todo"
# df = df.with_columns(pl.lit(default_val).alias(col))
# else:
# # Fill nulls with default
# default_val = "" if col == "assigned_to" else "todo"
# df = df.with_columns(pl.col(col).fill_null(default_val))
# # --- Columns ---
# DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]
# STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]
# # Prepare unique values for dropdown search
# unique_values = {col: sorted(df[col].drop_nulls().unique().to_list()) for col in DROPDOWN_COLUMNS}
# unique_values['assigned_to'] = sorted(member_list)
# unique_values['status'] = STATUS_OPTIONS
# # --- Helper to get page ---
# def get_page(df, page, column=None, query=None):
# filtered_df = df
# if column and query:
# if column in DROPDOWN_COLUMNS:
# filtered_df = filtered_df.filter(pl.col(column) == query)
# else:
# q = query.lower().strip()
# filtered_df = (
# filtered_df.with_columns([pl.col(column).str.to_lowercase().alias(column)])
# .filter(pl.col(column).str.contains(q, literal=False))
# )
# start = page * ROWS_PER_PAGE
# page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
# total_rows = filtered_df.height
# total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
# return page_df, total_pages
# initial_df, total_pages = get_page(df, 0)
# columns = list(initial_df.columns)
# with gr.Blocks() as demo:
# gr.Markdown("""
# # Dataset Insight Portal
# Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.
# ## What is this space for?
# This space provides a table of datasets along with metadata. You can:
# - Browse datasets with pagination.
# - Search datasets by various fields.
# - Assign responsibility for reviewing datasets (`assigned_to`).
# - Track progress using `status`.
# ## Why the table?
# The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.
# ## What does the table contain?
# Each row represents a dataset. Columns include:
# - **dataset_id**: Unique identifier of the dataset.
# - **dataset_url**: Link to the dataset page on Hugging Face.
# - **downloads**: Number of downloads.
# - **author**: Dataset author.
# - **license**: License type.
# - **tags**: Tags describing the dataset. Obtained from the dataset card.
# - **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
# - **last_modified**: Date of last update.
# - **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
# - **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
# - **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
# - **usedStorage**: Storage used by the dataset (bytes).
# - **assigned_to**: Person responsible for the dataset (editable).
# - **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.
# ## How to use search
# - Select a **column** from the dropdown.
# - If the column is textual, type your query in the text box.
# - If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
# - Click **Search** to filter the table.
# ## How to add or update `assigned_to` and `status`
# 1. Search for the **dataset_id** initially.
# 2. Then, select the **dataset_id** from the dropdown below the table.
# 3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
# 4. Select the current status in **Status**.
# 5. Click **Save Changes** to update the table and persist the changes.
# 6. Use **Refresh All** to reload the table and the latest members list.
# This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
# """)
# # --- Pagination controls ---
# with gr.Row():
# prev_btn = gr.Button("Previous")
# next_btn = gr.Button("Next")
# page_number = gr.Number(value=0, label="Page", precision=0)
# total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
# # --- Data table ---
# data_table = gr.Dataframe(
# value=initial_df,
# headers=columns,
# datatype="str",
# interactive=False,
# row_count=ROWS_PER_PAGE
# )
# # --- Search controls ---
# with gr.Row():
# col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
# search_text = gr.Textbox(label="Search Text")
# search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
# search_btn = gr.Button("Search")
# reset_btn = gr.Button("Reset")
# # --- Dataset selection & editable fields ---
# selected_dataset_id = gr.Dropdown(label="Select dataset_id", choices=initial_df['dataset_id'].tolist())
# assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")
# # status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status")
# status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")
# save_btn = gr.Button("Save Changes")
# refresh_btn = gr.Button("Refresh All")
# save_message = gr.Textbox(label="Save Status", interactive=False)
# # --- Update search input depending on column ---
# def update_search_input(column):
# if column in DROPDOWN_COLUMNS:
# return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False)
# else:
# return gr.update(visible=False), gr.update(visible=True)
# col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])
# # --- Prefill editable fields ---
# def prefill_fields(dataset_id):
# if not dataset_id:
# return "", "todo"
# dataset_id = str(dataset_id)
# filtered = [row for row in df.to_dicts() if str(row.get("dataset_id")) == dataset_id]
# if not filtered:
# return "", "todo"
# row = filtered[0]
# return row.get("assigned_to", ""), row.get("status", "todo")
# selected_dataset_id.change(prefill_fields, selected_dataset_id, [assigned_to_input, status_input])
# # --- Search function ---
# def search_func(page, column, txt, ddl):
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
# # --- Pagination functions ---
# def next_page(page, column, txt, ddl):
# page += 1
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# if page >= total_pages:
# page = total_pages - 1
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
# def prev_page(page, column, txt, ddl):
# page = max(0, page - 1)
# query = ddl if column in DROPDOWN_COLUMNS else txt
# page_df, total_pages = get_page(df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
# def reset_func():
# page_df, total_pages = get_page(df, 0)
# return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
# # --- Save changes & refresh ---
# def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl):
# global df
# if not dataset_id:
# return gr.update(value="Please select a row first."), None, None, None
# df = df.with_columns([
# pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(assigned_to_val)).otherwise(pl.col("assigned_to")).alias("assigned_to"),
# pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(status_val)).otherwise(pl.col("status")).alias("status")
# ])
# df.write_parquet(UPDATED_PARQUET_PATH)
# page_df, total_pages = get_page(df, page_val, col, txt if col not in DROPDOWN_COLUMNS else ddl)
# return (
# gr.update(value=f"Saved changes for dataset_id: {dataset_id}"),
# page_df,
# gr.update(choices=page_df['dataset_id'].tolist()),
# f"Total Pages: {total_pages}"
# )
# # --- Refresh All: table + members ---
# def refresh_all(page, column, txt, ddl):
# global df, member_list, unique_values
# # Refresh members
# member_list = fetch_members()
# unique_values['assigned_to'] = sorted(member_list)
# # Refresh table
# try:
# df = pl.read_parquet(UPDATED_PARQUET_PATH)
# except FileNotFoundError:
# pass
# page_df, total_pages = get_page(df, page, column, txt if column not in DROPDOWN_COLUMNS else ddl)
# return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()), gr.update(choices=member_list)
# # --- Wire buttons ---
# inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
# outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]
# search_btn.click(search_func, inputs_search, outputs_search)
# next_btn.click(next_page, inputs_search, outputs_search)
# prev_btn.click(prev_page, inputs_search, outputs_search)
# reset_btn.click(reset_func, [], outputs_search)
# save_btn.click(
# save_changes,
# [selected_dataset_id, assigned_to_input, status_input, page_number, col_dropdown, search_text, search_dropdown],
# [save_message, data_table, selected_dataset_id, total_pages_display]
# )
# refresh_btn.click(
# refresh_all,
# inputs=[page_number, col_dropdown, search_text, search_dropdown],
# outputs=[data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input]
# )
# demo.launch()
import gradio as gr
import polars as pl
import os
import subprocess
import threading
import time
# --- Config ---
COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
UPDATED_PARQUET_PATH = "datasetcards_new.parquet"  # overwrite same file
ROWS_PER_PAGE = 50
ORG_NAME = "hugging-science"  # replace with your org
SPACE_NAME = "dataset-insight-portal"  # replace with your space

# --- Load dataset ---
# Default value for each editable tracking column.
_EDITABLE_COLUMN_DEFAULTS = {"assigned_to": "", "status": "todo"}

df = pl.read_parquet(COMBINED_PARQUET_PATH)
# Edits are written back to this same file (UPDATED_PARQUET_PATH equals
# COMBINED_PARQUET_PATH), so values already stored in it must survive a
# restart: create a tracking column only when it is absent, and otherwise
# just replace its nulls with the default.
df = df.with_columns([
    pl.col(name).cast(pl.Utf8).fill_null(default)
    if name in df.columns
    else pl.lit(default).alias(name)
    for name, default in _EDITABLE_COLUMN_DEFAULTS.items()
])
columns = df.columns
# Ceiling division: number of pages needed to show every row.
total_pages = (len(df) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE
# --- Git push helpers ---
# Serializes git work: the background loop and save_changes() both call
# save_and_push(), and concurrent git commands contend for the index lock.
_PUSH_LOCK = threading.Lock()


def _git(*args):
    """Run one git command; raise CalledProcessError on a non-zero exit."""
    subprocess.run(["git", *args], check=True)


def save_and_push():
    """Commit and push parquet file changes to the repo.

    Sets the global git identity, points ``origin`` at the Space repository
    (URL built from ORG_NAME, SPACE_NAME and the ``HF_TOKEN`` environment
    variable), stages UPDATED_PARQUET_PATH and, only when the staged content
    differs from HEAD, commits and pushes to ``main``.

    This is best-effort: a missing token, a failing git command, or a missing
    git executable is reported with ``print`` and never raised, so callers
    (including the background loop) keep running.
    """
    try:
        hf_token = os.environ["HF_TOKEN"]
    except KeyError:
        print("⚠️ Push failed: HF_TOKEN environment variable is not set")
        return
    # NOTE(review): the token ends up stored in the repo's git config via the
    # remote URL below — confirm this is acceptable for the deployment.
    repo_url = f"https://user:{hf_token}@huggingface.co/spaces/{ORG_NAME}/{SPACE_NAME}"

    with _PUSH_LOCK:
        try:
            # NOTE(review): the e-mail literal looks like a redaction
            # placeholder — confirm the intended address.
            _git("config", "--global", "user.email", "[email protected]")
            _git("config", "--global", "user.name", "Santosh Sanjeev")
            _git("remote", "set-url", "origin", repo_url)

            # Commit only if parquet changed
            _git("add", UPDATED_PARQUET_PATH)
            # `git diff --quiet` exits 0 for "no differences" and 1 for
            # "differences found", so it is run without check=True.
            staged = subprocess.run(["git", "diff", "--cached", "--quiet"])
            if staged.returncode == 0:
                print("ℹ️ No parquet changes to push")
                return
            if staged.returncode != 1:
                print(f"⚠️ Push failed: git diff exited with status {staged.returncode}")
                return

            _git("commit", "-m", "Auto-update parquet file")
            _git("push", "origin", "main")
            print("✅ Pushed parquet to repo")
        except subprocess.CalledProcessError as err:
            # Report only the subcommand: the full command line of
            # `remote set-url` contains the access token.
            print(f"⚠️ Push failed: git {err.cmd[1]} exited with status {err.returncode}")
        except (OSError, subprocess.SubprocessError) as err:
            # e.g. the git executable is not installed.
            print("⚠️ Push failed:", err)
def auto_push_loop(interval=300):
    """Call save_and_push forever, pausing `interval` seconds between calls.

    The first push attempt happens immediately, before any sleep. The loop
    has no exit condition; it is meant to run in a background thread.
    """
    while True:
        save_and_push()
        # Wait out the interval (300 s = 5 min by default) before the next attempt.
        time.sleep(interval)
# --- Gradio app functions ---
def get_page(page_num, col, search_text, search_dropdown):
    """Return one page of the (optionally filtered) dataset for the UI.

    Filtering applies only when `col` names an existing column:
    - if `col` is listed in DROPDOWN_COLUMNS and `search_dropdown` is set,
      rows are kept where the column equals `search_dropdown`;
    - otherwise, if `search_text` is set, the column is cast to string and
      rows are kept where it matches `search_text` as a regular expression
      (``literal=False``).

    `page_num` is 1-based. Missing, non-numeric, or out-of-range values are
    clamped into ``[1, max(total_pages, 1)]`` so the slice never starts at a
    negative offset or past the last page.

    Returns a 6-tuple: the page as a pandas DataFrame, the ``"of N"`` page
    count label, the (clamped) page number, and three empty strings.
    """
    filtered = df
    if col and col in df.columns:
        if col in DROPDOWN_COLUMNS and search_dropdown:
            filtered = filtered.filter(pl.col(col) == search_dropdown)
        elif search_text:
            filtered = filtered.filter(
                pl.col(col).cast(str).str.contains(search_text, literal=False)
            )

    # Ceiling division; 0 when the filter matches nothing.
    total_pages = (len(filtered) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE

    # A numeric input can be empty (None) or hold any integer; normalise it.
    try:
        page_num = int(page_num)
    except (TypeError, ValueError, OverflowError):
        page_num = 1
    page_num = min(max(page_num, 1), max(total_pages, 1))

    start = (page_num - 1) * ROWS_PER_PAGE
    page_df = filtered.slice(start, ROWS_PER_PAGE)
    return page_df.to_pandas(), f"of {total_pages}", page_num, "", "", ""
def save_changes(dataset_id, assigned_to, status):
    """Store `assigned_to` and `status` for one dataset, persist, and push.

    Rows whose ``dataset_id`` equals `dataset_id` get the new values; all
    other rows are left as they are. When at least one row matches, the
    module-level `df` is replaced, written to UPDATED_PARQUET_PATH, and
    save_and_push() is called.

    An empty `assigned_to` clears the assignment. An empty `status` leaves
    the existing status untouched rather than writing a null.

    Returns a message for the UI: a prompt when no id was given, a
    "not found" note when no row matches, otherwise a confirmation.
    """
    global df
    if not dataset_id:
        return "Please select a dataset first."

    mask = df["dataset_id"] == dataset_id
    if not mask.any():
        return f"No dataset found with id {dataset_id}"

    # Values must be wrapped in pl.lit(): a bare string passed to .then()
    # is interpreted by Polars as a column name, not as a literal.
    updates = [
        pl.when(mask)
        .then(pl.lit(assigned_to or ""))
        .otherwise(pl.col("assigned_to"))
        .alias("assigned_to")
    ]
    if status:
        updates.append(
            pl.when(mask)
            .then(pl.lit(status))
            .otherwise(pl.col("status"))
            .alias("status")
        )

    df = df.with_columns(updates)
    df.write_parquet(UPDATED_PARQUET_PATH)
    save_and_push()  # push immediately after change
    return f"Saved for {dataset_id} ✅"
def refresh_all(page_num, col, search_text, search_dropdown):
    """Rebuild the table view for the given page and filter settings.

    Delegates to get_page with the same four arguments and returns its
    result unchanged.
    """
    refreshed_view = get_page(page_num, col, search_text, search_dropdown)
    return refreshed_view
# --- UI ---
# Columns that get_page filters by exact match on the dropdown value
# instead of a text search.
DROPDOWN_COLUMNS = ["status", "assigned_to"]
STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]


def _fill_edit_fields(table, evt: gr.SelectData):
    """Copy the clicked row's id, assignee, and status into the edit fields.

    `table` is the currently displayed page as a pandas DataFrame and
    `evt.index` identifies the clicked cell; only its row part is used.
    """
    clicked = evt.index
    row_idx = clicked[0] if isinstance(clicked, (list, tuple)) else clicked
    row = table.iloc[row_idx]
    return str(row["dataset_id"]), row["assigned_to"], row["status"]


# NOTE(review): the nesting of components inside the rows below is a
# reconstruction — confirm it matches the intended layout.
with gr.Blocks() as demo:
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Search Column")
        search_text = gr.Textbox(label="Search Text")
        search_dropdown = gr.Dropdown(choices=STATUS_OPTIONS, label="Status")
    with gr.Row():
        page_number = gr.Number(value=1, precision=0, label="Page #")
        total_pages_display = gr.Textbox(value=f"of {total_pages}", interactive=False)

    # Pinned read-only: the table is also used as an event input below, and
    # it must not become editable because of that.
    data_table = gr.Dataframe(
        headers=columns,
        datatype=["str"] * len(columns),
        row_count=ROWS_PER_PAGE,
        interactive=False,
    )

    selected_dataset_id = gr.Textbox(label="Selected Dataset ID", interactive=False)
    assigned_to_input = gr.Textbox(label="Assigned To")
    status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status")
    save_btn = gr.Button("Save Changes")
    refresh_btn = gr.Button("Refresh")
    output_msg = gr.Textbox(label="Message", interactive=False)

    # Inputs/outputs shared by every event that re-renders the table.
    view_inputs = [page_number, col_dropdown, search_text, search_dropdown]
    view_outputs = [
        data_table, total_pages_display, page_number,
        selected_dataset_id, assigned_to_input, status_input,
    ]

    # Show the first page as soon as the app opens instead of an empty table.
    demo.load(get_page, inputs=view_inputs, outputs=view_outputs)
    page_number.change(get_page, inputs=view_inputs, outputs=view_outputs)
    refresh_btn.click(refresh_all, inputs=view_inputs, outputs=view_outputs)

    # Clicking a row is the only way to choose which dataset Save applies to:
    # the id textbox itself is read-only.
    data_table.select(
        _fill_edit_fields,
        inputs=[data_table],
        outputs=[selected_dataset_id, assigned_to_input, status_input],
    )
    save_btn.click(
        save_changes,
        inputs=[selected_dataset_id, assigned_to_input, status_input],
        outputs=[output_msg],
    )

# 🔄 Start auto-push loop
threading.Thread(target=auto_push_loop, args=(300,), daemon=True).start()
demo.launch()