Santosh committed on
Commit
aa595c3
·
1 Parent(s): c7a5270
Files changed (3) hide show
  1. app.py +367 -245
  2. datasetcards.parquet +2 -2
  3. datasetcards_new.parquet +2 -2
app.py CHANGED
@@ -217,265 +217,387 @@
217
 
218
  # demo.launch()
219
 
220
- import gradio as gr
221
- import polars as pl
222
- from huggingface_hub import HfApi
223
- import re
224
- # --- Hugging Face Org ---
225
- org_name = "hugging-science"
226
- api = HfApi()
227
 
228
- def fetch_members():
229
- members = api.list_organization_members(org_name)
230
- return [member.username for member in members]
231
 
232
- member_list = fetch_members()
233
 
234
- # --- Dataset ---
235
- COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
236
- UPDATED_PARQUET_PATH = "datasetcards_new.parquet"
237
- ROWS_PER_PAGE = 50
238
 
 
239
  # df = pl.read_parquet(COMBINED_PARQUET_PATH)
240
- df = pl.read_parquet(COMBINED_PARQUET_PATH)
241
- df = df.with_columns([
242
- pl.lit("todo").alias("status"),
243
- pl.lit("").alias("assigned_to")
244
- ]).sort(by=["downloads", "last_modified", "usedStorage"], descending=[True, True, True])
245
-
246
- if "reason" in df.columns:
247
- df = df.with_columns([
248
- pl.Series(
249
- "reason",
250
- ["short description" if x and "short description" in x.lower() else (x if x is not None else "") for x in df["reason"]]
251
- )
252
- ])
253
-
254
-
255
-
256
-
257
- # Add editable columns if missing
258
- for col in ["assigned_to", "status"]:
259
- if col not in df.columns:
260
- default_val = "" if col == "assigned_to" else "todo"
261
- df = df.with_columns(pl.lit(default_val).alias(col))
262
- else:
263
- # Fill nulls with default
264
- default_val = "" if col == "assigned_to" else "todo"
265
- df = df.with_columns(pl.col(col).fill_null(default_val))
266
-
267
- # --- Columns ---
268
- DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]
269
- STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]
270
-
271
- # Prepare unique values for dropdown search
272
- unique_values = {col: sorted(df[col].drop_nulls().unique().to_list()) for col in DROPDOWN_COLUMNS}
273
- unique_values['assigned_to'] = sorted(member_list)
274
- unique_values['status'] = STATUS_OPTIONS
275
-
276
- # --- Helper to get page ---
277
- def get_page(df, page, column=None, query=None):
278
- filtered_df = df
279
- if column and query:
280
- if column in DROPDOWN_COLUMNS:
281
- filtered_df = filtered_df.filter(pl.col(column) == query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  else:
283
- q = query.lower().strip()
284
- filtered_df = (
285
- filtered_df.with_columns([pl.col(column).str.to_lowercase().alias(column)])
286
- .filter(pl.col(column).str.contains(q, literal=False))
287
- )
288
- start = page * ROWS_PER_PAGE
289
- page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
290
- total_rows = filtered_df.height
291
- total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
292
- return page_df, total_pages
293
-
294
- initial_df, total_pages = get_page(df, 0)
295
- columns = list(initial_df.columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  with gr.Blocks() as demo:
298
- gr.Markdown("""
299
- # Dataset Insight Portal
300
-
301
- Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.
302
-
303
- ## What is this space for?
304
- This space provides a table of datasets along with metadata. You can:
305
- - Browse datasets with pagination.
306
- - Search datasets by various fields.
307
- - Assign responsibility for reviewing datasets (`assigned_to`).
308
- - Track progress using `status`.
309
-
310
- ## Why the table?
311
- The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.
312
-
313
- ## What does the table contain?
314
- Each row represents a dataset. Columns include:
315
- - **dataset_id**: Unique identifier of the dataset.
316
- - **dataset_url**: Link to the dataset page on Hugging Face.
317
- - **downloads**: Number of downloads.
318
- - **author**: Dataset author.
319
- - **license**: License type.
320
- - **tags**: Tags describing the dataset. Obtained from the dataset card.
321
- - **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
322
- - **last_modified**: Date of last update.
323
- - **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
324
- - **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
325
- - **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
326
- - **usedStorage**: Storage used by the dataset (bytes).
327
- - **assigned_to**: Person responsible for the dataset (editable).
328
- - **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.
329
-
330
- ## How to use search
331
- - Select a **column** from the dropdown.
332
- - If the column is textual, type your query in the text box.
333
- - If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
334
- - Click **Search** to filter the table.
335
-
336
- ## How to add or update `assigned_to` and `status`
337
- 1. Search for the **dataset_id** initially.
338
- 2. Then, select the **dataset_id** from the dropdown below the table.
339
- 3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
340
- 4. Select the current status in **Status**.
341
- 5. Click **Save Changes** to update the table and persist the changes.
342
- 6. Use **Refresh All** to reload the table and the latest members list.
343
-
344
- This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
345
- """)
346
-
347
- # --- Pagination controls ---
348
  with gr.Row():
349
- prev_btn = gr.Button("Previous")
350
- next_btn = gr.Button("Next")
351
- page_number = gr.Number(value=0, label="Page", precision=0)
352
- total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
353
-
354
- # --- Data table ---
355
- data_table = gr.Dataframe(
356
- value=initial_df,
357
- headers=columns,
358
- datatype="str",
359
- interactive=False,
360
- row_count=ROWS_PER_PAGE
361
- )
362
-
363
- # --- Search controls ---
364
- with gr.Row():
365
- col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
366
  search_text = gr.Textbox(label="Search Text")
367
- search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
368
- search_btn = gr.Button("Search")
369
- reset_btn = gr.Button("Reset")
 
 
370
 
371
- # --- Dataset selection & editable fields ---
372
- selected_dataset_id = gr.Dropdown(label="Select dataset_id", choices=initial_df['dataset_id'].tolist())
373
- assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")
374
- # status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status")
375
- status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")
376
 
 
 
 
377
 
378
  save_btn = gr.Button("Save Changes")
379
- refresh_btn = gr.Button("Refresh All")
380
- save_message = gr.Textbox(label="Save Status", interactive=False)
381
 
382
- # --- Update search input depending on column ---
383
- def update_search_input(column):
384
- if column in DROPDOWN_COLUMNS:
385
- return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False)
386
- else:
387
- return gr.update(visible=False), gr.update(visible=True)
388
-
389
- col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])
390
-
391
- # --- Prefill editable fields ---
392
- def prefill_fields(dataset_id):
393
- if not dataset_id:
394
- return "", "todo"
395
- dataset_id = str(dataset_id)
396
- filtered = [row for row in df.to_dicts() if str(row.get("dataset_id")) == dataset_id]
397
- if not filtered:
398
- return "", "todo"
399
- row = filtered[0]
400
- return row.get("assigned_to", ""), row.get("status", "todo")
401
-
402
- selected_dataset_id.change(prefill_fields, selected_dataset_id, [assigned_to_input, status_input])
403
-
404
- # --- Search function ---
405
- def search_func(page, column, txt, ddl):
406
- query = ddl if column in DROPDOWN_COLUMNS else txt
407
- page_df, total_pages = get_page(df, page, column, query)
408
- return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
409
-
410
- # --- Pagination functions ---
411
- def next_page(page, column, txt, ddl):
412
- page += 1
413
- query = ddl if column in DROPDOWN_COLUMNS else txt
414
- page_df, total_pages = get_page(df, page, column, query)
415
- if page >= total_pages:
416
- page = total_pages - 1
417
- page_df, total_pages = get_page(df, page, column, query)
418
- return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
419
-
420
- def prev_page(page, column, txt, ddl):
421
- page = max(0, page - 1)
422
- query = ddl if column in DROPDOWN_COLUMNS else txt
423
- page_df, total_pages = get_page(df, page, column, query)
424
- return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
425
-
426
- def reset_func():
427
- page_df, total_pages = get_page(df, 0)
428
- return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
429
-
430
- # --- Save changes & refresh ---
431
- def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl):
432
- global df
433
- if not dataset_id:
434
- return gr.update(value="Please select a row first."), None, None, None
435
- df = df.with_columns([
436
- pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(assigned_to_val)).otherwise(pl.col("assigned_to")).alias("assigned_to"),
437
- pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(status_val)).otherwise(pl.col("status")).alias("status")
438
- ])
439
- df.write_parquet(UPDATED_PARQUET_PATH)
440
- page_df, total_pages = get_page(df, page_val, col, txt if col not in DROPDOWN_COLUMNS else ddl)
441
- return (
442
- gr.update(value=f"Saved changes for dataset_id: {dataset_id}"),
443
- page_df,
444
- gr.update(choices=page_df['dataset_id'].tolist()),
445
- f"Total Pages: {total_pages}"
446
- )
447
-
448
- # --- Refresh All: table + members ---
449
- def refresh_all(page, column, txt, ddl):
450
- global df, member_list, unique_values
451
- # Refresh members
452
- member_list = fetch_members()
453
- unique_values['assigned_to'] = sorted(member_list)
454
- # Refresh table
455
- try:
456
- df = pl.read_parquet(UPDATED_PARQUET_PATH)
457
- except FileNotFoundError:
458
- pass
459
- page_df, total_pages = get_page(df, page, column, txt if column not in DROPDOWN_COLUMNS else ddl)
460
- return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()), gr.update(choices=member_list)
461
-
462
- # --- Wire buttons ---
463
- inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
464
- outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]
465
-
466
- search_btn.click(search_func, inputs_search, outputs_search)
467
- next_btn.click(next_page, inputs_search, outputs_search)
468
- prev_btn.click(prev_page, inputs_search, outputs_search)
469
- reset_btn.click(reset_func, [], outputs_search)
470
- save_btn.click(
471
- save_changes,
472
- [selected_dataset_id, assigned_to_input, status_input, page_number, col_dropdown, search_text, search_dropdown],
473
- [save_message, data_table, selected_dataset_id, total_pages_display]
474
- )
475
- refresh_btn.click(
476
- refresh_all,
477
- inputs=[page_number, col_dropdown, search_text, search_dropdown],
478
- outputs=[data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input]
479
- )
480
 
481
  demo.launch()
 
217
 
218
  # demo.launch()
219
 
220
+ # import gradio as gr
221
+ # import polars as pl
222
+ # from huggingface_hub import HfApi
223
+ # import re
224
+ # # --- Hugging Face Org ---
225
+ # org_name = "hugging-science"
226
+ # api = HfApi()
227
 
228
+ # def fetch_members():
229
+ # members = api.list_organization_members(org_name)
230
+ # return [member.username for member in members]
231
 
232
+ # member_list = fetch_members()
233
 
234
+ # # --- Dataset ---
235
+ # COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
236
+ # UPDATED_PARQUET_PATH = "datasetcards_new.parquet"
237
+ # ROWS_PER_PAGE = 50
238
 
239
+ # # df = pl.read_parquet(COMBINED_PARQUET_PATH)
240
  # df = pl.read_parquet(COMBINED_PARQUET_PATH)
241
+ # df = df.with_columns([
242
+ # pl.lit("todo").alias("status"),
243
+ # pl.lit("").alias("assigned_to")
244
+ # ]).sort(by=["downloads", "last_modified", "usedStorage"], descending=[True, True, True])
245
+
246
+ # if "reason" in df.columns:
247
+ # df = df.with_columns([
248
+ # pl.Series(
249
+ # "reason",
250
+ # ["short description" if x and "short description" in x.lower() else (x if x is not None else "") for x in df["reason"]]
251
+ # )
252
+ # ])
253
+
254
+
255
+
256
+
257
+ # # Add editable columns if missing
258
+ # for col in ["assigned_to", "status"]:
259
+ # if col not in df.columns:
260
+ # default_val = "" if col == "assigned_to" else "todo"
261
+ # df = df.with_columns(pl.lit(default_val).alias(col))
262
+ # else:
263
+ # # Fill nulls with default
264
+ # default_val = "" if col == "assigned_to" else "todo"
265
+ # df = df.with_columns(pl.col(col).fill_null(default_val))
266
+
267
+ # # --- Columns ---
268
+ # DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]
269
+ # STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]
270
+
271
+ # # Prepare unique values for dropdown search
272
+ # unique_values = {col: sorted(df[col].drop_nulls().unique().to_list()) for col in DROPDOWN_COLUMNS}
273
+ # unique_values['assigned_to'] = sorted(member_list)
274
+ # unique_values['status'] = STATUS_OPTIONS
275
+
276
+ # # --- Helper to get page ---
277
+ # def get_page(df, page, column=None, query=None):
278
+ # filtered_df = df
279
+ # if column and query:
280
+ # if column in DROPDOWN_COLUMNS:
281
+ # filtered_df = filtered_df.filter(pl.col(column) == query)
282
+ # else:
283
+ # q = query.lower().strip()
284
+ # filtered_df = (
285
+ # filtered_df.with_columns([pl.col(column).str.to_lowercase().alias(column)])
286
+ # .filter(pl.col(column).str.contains(q, literal=False))
287
+ # )
288
+ # start = page * ROWS_PER_PAGE
289
+ # page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
290
+ # total_rows = filtered_df.height
291
+ # total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
292
+ # return page_df, total_pages
293
+
294
+ # initial_df, total_pages = get_page(df, 0)
295
+ # columns = list(initial_df.columns)
296
+
297
+ # with gr.Blocks() as demo:
298
+ # gr.Markdown("""
299
+ # # Dataset Insight Portal
300
+
301
+ # Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.
302
+
303
+ # ## What is this space for?
304
+ # This space provides a table of datasets along with metadata. You can:
305
+ # - Browse datasets with pagination.
306
+ # - Search datasets by various fields.
307
+ # - Assign responsibility for reviewing datasets (`assigned_to`).
308
+ # - Track progress using `status`.
309
+
310
+ # ## Why the table?
311
+ # The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.
312
+
313
+ # ## What does the table contain?
314
+ # Each row represents a dataset. Columns include:
315
+ # - **dataset_id**: Unique identifier of the dataset.
316
+ # - **dataset_url**: Link to the dataset page on Hugging Face.
317
+ # - **downloads**: Number of downloads.
318
+ # - **author**: Dataset author.
319
+ # - **license**: License type.
320
+ # - **tags**: Tags describing the dataset. Obtained from the dataset card.
321
+ # - **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
322
+ # - **last_modified**: Date of last update.
323
+ # - **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
324
+ # - **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
325
+ # - **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
326
+ # - **usedStorage**: Storage used by the dataset (bytes).
327
+ # - **assigned_to**: Person responsible for the dataset (editable).
328
+ # - **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.
329
+
330
+ # ## How to use search
331
+ # - Select a **column** from the dropdown.
332
+ # - If the column is textual, type your query in the text box.
333
+ # - If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
334
+ # - Click **Search** to filter the table.
335
+
336
+ # ## How to add or update `assigned_to` and `status`
337
+ # 1. Search for the **dataset_id** initially.
338
+ # 2. Then, select the **dataset_id** from the dropdown below the table.
339
+ # 3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
340
+ # 4. Select the current status in **Status**.
341
+ # 5. Click **Save Changes** to update the table and persist the changes.
342
+ # 6. Use **Refresh All** to reload the table and the latest members list.
343
+
344
+ # This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
345
+ # """)
346
+
347
+ # # --- Pagination controls ---
348
+ # with gr.Row():
349
+ # prev_btn = gr.Button("Previous")
350
+ # next_btn = gr.Button("Next")
351
+ # page_number = gr.Number(value=0, label="Page", precision=0)
352
+ # total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
353
+
354
+ # # --- Data table ---
355
+ # data_table = gr.Dataframe(
356
+ # value=initial_df,
357
+ # headers=columns,
358
+ # datatype="str",
359
+ # interactive=False,
360
+ # row_count=ROWS_PER_PAGE
361
+ # )
362
+
363
+ # # --- Search controls ---
364
+ # with gr.Row():
365
+ # col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
366
+ # search_text = gr.Textbox(label="Search Text")
367
+ # search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
368
+ # search_btn = gr.Button("Search")
369
+ # reset_btn = gr.Button("Reset")
370
+
371
+ # # --- Dataset selection & editable fields ---
372
+ # selected_dataset_id = gr.Dropdown(label="Select dataset_id", choices=initial_df['dataset_id'].tolist())
373
+ # assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")
374
+ # # status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status")
375
+ # status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")
376
+
377
+
378
+ # save_btn = gr.Button("Save Changes")
379
+ # refresh_btn = gr.Button("Refresh All")
380
+ # save_message = gr.Textbox(label="Save Status", interactive=False)
381
+
382
+ # # --- Update search input depending on column ---
383
+ # def update_search_input(column):
384
+ # if column in DROPDOWN_COLUMNS:
385
+ # return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False)
386
+ # else:
387
+ # return gr.update(visible=False), gr.update(visible=True)
388
+
389
+ # col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])
390
+
391
+ # # --- Prefill editable fields ---
392
+ # def prefill_fields(dataset_id):
393
+ # if not dataset_id:
394
+ # return "", "todo"
395
+ # dataset_id = str(dataset_id)
396
+ # filtered = [row for row in df.to_dicts() if str(row.get("dataset_id")) == dataset_id]
397
+ # if not filtered:
398
+ # return "", "todo"
399
+ # row = filtered[0]
400
+ # return row.get("assigned_to", ""), row.get("status", "todo")
401
+
402
+ # selected_dataset_id.change(prefill_fields, selected_dataset_id, [assigned_to_input, status_input])
403
+
404
+ # # --- Search function ---
405
+ # def search_func(page, column, txt, ddl):
406
+ # query = ddl if column in DROPDOWN_COLUMNS else txt
407
+ # page_df, total_pages = get_page(df, page, column, query)
408
+ # return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
409
+
410
+ # # --- Pagination functions ---
411
+ # def next_page(page, column, txt, ddl):
412
+ # page += 1
413
+ # query = ddl if column in DROPDOWN_COLUMNS else txt
414
+ # page_df, total_pages = get_page(df, page, column, query)
415
+ # if page >= total_pages:
416
+ # page = total_pages - 1
417
+ # page_df, total_pages = get_page(df, page, column, query)
418
+ # return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
419
+
420
+ # def prev_page(page, column, txt, ddl):
421
+ # page = max(0, page - 1)
422
+ # query = ddl if column in DROPDOWN_COLUMNS else txt
423
+ # page_df, total_pages = get_page(df, page, column, query)
424
+ # return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())
425
+
426
+ # def reset_func():
427
+ # page_df, total_pages = get_page(df, 0)
428
+ # return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())
429
+
430
+ # # --- Save changes & refresh ---
431
+ # def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl):
432
+ # global df
433
+ # if not dataset_id:
434
+ # return gr.update(value="Please select a row first."), None, None, None
435
+ # df = df.with_columns([
436
+ # pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(assigned_to_val)).otherwise(pl.col("assigned_to")).alias("assigned_to"),
437
+ # pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(status_val)).otherwise(pl.col("status")).alias("status")
438
+ # ])
439
+ # df.write_parquet(UPDATED_PARQUET_PATH)
440
+ # page_df, total_pages = get_page(df, page_val, col, txt if col not in DROPDOWN_COLUMNS else ddl)
441
+ # return (
442
+ # gr.update(value=f"Saved changes for dataset_id: {dataset_id}"),
443
+ # page_df,
444
+ # gr.update(choices=page_df['dataset_id'].tolist()),
445
+ # f"Total Pages: {total_pages}"
446
+ # )
447
+
448
+ # # --- Refresh All: table + members ---
449
+ # def refresh_all(page, column, txt, ddl):
450
+ # global df, member_list, unique_values
451
+ # # Refresh members
452
+ # member_list = fetch_members()
453
+ # unique_values['assigned_to'] = sorted(member_list)
454
+ # # Refresh table
455
+ # try:
456
+ # df = pl.read_parquet(UPDATED_PARQUET_PATH)
457
+ # except FileNotFoundError:
458
+ # pass
459
+ # page_df, total_pages = get_page(df, page, column, txt if column not in DROPDOWN_COLUMNS else ddl)
460
+ # return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()), gr.update(choices=member_list)
461
+
462
+ # # --- Wire buttons ---
463
+ # inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
464
+ # outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]
465
+
466
+ # search_btn.click(search_func, inputs_search, outputs_search)
467
+ # next_btn.click(next_page, inputs_search, outputs_search)
468
+ # prev_btn.click(prev_page, inputs_search, outputs_search)
469
+ # reset_btn.click(reset_func, [], outputs_search)
470
+ # save_btn.click(
471
+ # save_changes,
472
+ # [selected_dataset_id, assigned_to_input, status_input, page_number, col_dropdown, search_text, search_dropdown],
473
+ # [save_message, data_table, selected_dataset_id, total_pages_display]
474
+ # )
475
+ # refresh_btn.click(
476
+ # refresh_all,
477
+ # inputs=[page_number, col_dropdown, search_text, search_dropdown],
478
+ # outputs=[data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input]
479
+ # )
480
+
481
+ # demo.launch()
482
+
483
+
484
+
485
+
486
import gradio as gr
import polars as pl
import os
import subprocess
import threading
import time

# --- Config ---
COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
UPDATED_PARQUET_PATH = "datasetcards_new.parquet"  # overwrite same file
ROWS_PER_PAGE = 50
ORG_NAME = "hugging-science"  # replace with your org
SPACE_NAME = "dataset-insight-portal"  # replace with your space

# --- Load dataset ---
# Add the editable columns only when they are missing, and fill nulls
# otherwise. Unconditionally overwriting them with pl.lit(...) (as before)
# wiped every saved assignment on each restart, defeating the point of
# persisting the parquet back to the repo.
df = pl.read_parquet(COMBINED_PARQUET_PATH)
for _col, _default in (("assigned_to", ""), ("status", "todo")):
    if _col not in df.columns:
        df = df.with_columns(pl.lit(_default).alias(_col))
    else:
        df = df.with_columns(pl.col(_col).fill_null(_default))

columns = df.columns
# Ceiling division: number of ROWS_PER_PAGE-sized pages.
total_pages = (len(df) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE
507
+
508
# --- Git push helpers ---
def save_and_push():
    """Commit and push parquet file changes to the repo.

    Best-effort: any failure (missing HF_TOKEN, git error) is caught and
    logged so the UI keeps working even when persistence is unavailable.
    subprocess calls use check=True so a failed git step is not silently
    ignored (previously all return codes were discarded).
    """
    try:
        subprocess.run(["git", "config", "--global", "user.email", "[email protected]"], check=True)
        subprocess.run(["git", "config", "--global", "user.name", "Santosh Sanjeev"], check=True)

        hf_token = os.environ["HF_TOKEN"]  # KeyError if unset -> logged below
        repo_url = f"https://user:{hf_token}@huggingface.co/spaces/{ORG_NAME}/{SPACE_NAME}"
        subprocess.run(["git", "remote", "set-url", "origin", repo_url], check=True)

        # Stage the parquet; commit/push only if it actually changed.
        # `git diff --cached --quiet` exits non-zero when something is staged.
        subprocess.run(["git", "add", UPDATED_PARQUET_PATH], check=True)
        result = subprocess.run(["git", "diff", "--cached", "--quiet"])
        if result.returncode != 0:
            subprocess.run(["git", "commit", "-m", "Auto-update parquet file"], check=True)
            subprocess.run(["git", "push", "origin", "main"], check=True)
            print("✅ Pushed parquet to repo")
        else:
            print("ℹ️ No parquet changes to push")
    except Exception as e:
        # Redact the token: CalledProcessError stringifies the command line,
        # which for `remote set-url` embeds HF_TOKEN in the URL.
        token = os.environ.get("HF_TOKEN")
        msg = str(e).replace(token, "***") if token else str(e)
        print("⚠️ Push failed:", msg)
530
+
531
def auto_push_loop(interval=300):
    """Periodically commit-and-push the parquet file, forever.

    Args:
        interval: Seconds to sleep between push attempts (default 300 = 5 min).

    Intended to run on a daemon thread; never returns.
    """
    delay = interval
    while True:
        save_and_push()
        time.sleep(delay)
536
+
537
# --- Gradio app functions ---
def get_page(page_num, col, search_text, search_dropdown):
    """Return one page of the (optionally filtered) dataset table.

    Args:
        page_num: 1-based page index from the UI Number box.
        col: column to filter on (ignored if falsy or not a real column).
        search_text: substring/regex query for free-text columns.
        search_dropdown: exact-match value for DROPDOWN_COLUMNS.

    Returns:
        (pandas page, "of N" label, clamped page number, and three empty
        strings that clear the selection/edit widgets).
    """
    global df
    filtered = df

    if col and col in df.columns:
        if col in DROPDOWN_COLUMNS and search_dropdown:
            filtered = filtered.filter(pl.col(col) == search_dropdown)
        elif search_text:
            # cast(str) so numeric columns are searchable too.
            filtered = filtered.filter(pl.col(col).cast(str).str.contains(search_text, literal=False))

    # Ceiling division, floored at 1 so an empty filter still shows "of 1".
    total_pages = max(1, (len(filtered) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE)
    # Clamp: a typed 0/negative page previously produced a negative slice
    # start (wrong rows), and an overshoot produced an empty table.
    page_num = min(max(int(page_num or 1), 1), total_pages)
    start, end = (page_num - 1) * ROWS_PER_PAGE, page_num * ROWS_PER_PAGE
    page_df = filtered[start:end]
    return page_df.to_pandas(), f"of {total_pages}", page_num, "", "", ""
552
+
553
def save_changes(dataset_id, assigned_to, status):
    """Persist assigned_to/status for one dataset row, then git-push.

    Returns a human-readable status message for the output textbox.
    """
    global df
    if not dataset_id:
        return "Please select a dataset first."
    mask = df["dataset_id"] == dataset_id
    if not mask.any():
        # Previously we still wrote and pushed the unchanged parquet here.
        return f"dataset_id not found: {dataset_id}"
    df = df.with_columns([
        # pl.lit(...) is required: a bare string passed to then()/otherwise()
        # is interpreted by polars as a *column name*, not a literal value,
        # so then(assigned_to) raised/misbehaved for any real username.
        pl.when(mask).then(pl.lit(assigned_to)).otherwise(pl.col("assigned_to")).alias("assigned_to"),
        pl.when(mask).then(pl.lit(status)).otherwise(pl.col("status")).alias("status"),
    ])
    df.write_parquet(UPDATED_PARQUET_PATH)
    save_and_push()  # push immediately after change
    return f"Saved for {dataset_id} ✅"
564
+
565
def refresh_all(page_num, col, search_text, search_dropdown):
    """Re-fetch the current page with the current filters.

    Thin delegate to get_page; exists so the Refresh button has its own
    named handler.
    """
    args = (page_num, col, search_text, search_dropdown)
    return get_page(*args)
567
+
568
# --- UI ---
# Columns whose search uses exact-match via the dropdown instead of text.
DROPDOWN_COLUMNS = ["status", "assigned_to"]

with gr.Blocks() as demo:
    # Search controls: pick a column, then either free text or (for
    # DROPDOWN_COLUMNS) an exact status value.
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Search Column")
        search_text = gr.Textbox(label="Search Text")
        search_dropdown = gr.Dropdown(choices=["todo", "inprogress", "PR submitted", "PR merged"], label="Status")

    # Pagination: 1-based page number plus a read-only "of N" label.
    with gr.Row():
        page_number = gr.Number(value=1, precision=0, label="Page #")
        total_pages_display = gr.Textbox(value=f"of {total_pages}", interactive=False)

    # Read-only table showing one ROWS_PER_PAGE slice of the dataframe.
    # NOTE(review): no initial value is set, so the table is empty until
    # page_number changes or Refresh is clicked — confirm this is intended.
    data_table = gr.Dataframe(headers=columns, datatype=["str"] * len(columns), row_count=ROWS_PER_PAGE)

    # Edit widgets for a single row.
    # NOTE(review): selected_dataset_id is interactive=False and no callback
    # ever populates it, so save_changes always receives an empty id —
    # verify how a row is meant to be selected.
    selected_dataset_id = gr.Textbox(label="Selected Dataset ID", interactive=False)
    assigned_to_input = gr.Textbox(label="Assigned To")
    status_input = gr.Dropdown(choices=["todo", "inprogress", "PR submitted", "PR merged"], label="Status")

    save_btn = gr.Button("Save Changes")
    refresh_btn = gr.Button("Refresh")

    # Feedback line for save results.
    output_msg = gr.Textbox(label="Message", interactive=False)

    # Changing the page number re-renders the table and clears the edit
    # widgets (get_page returns "" for the last three outputs).
    # NOTE(review): page_number is both an input and an output of its own
    # .change handler — confirm this does not re-trigger in your gradio version.
    page_number.change(get_page, inputs=[page_number, col_dropdown, search_text, search_dropdown],
                       outputs=[data_table, total_pages_display, page_number,
                                selected_dataset_id, assigned_to_input, status_input])
    save_btn.click(save_changes, inputs=[selected_dataset_id, assigned_to_input, status_input], outputs=[output_msg])
    refresh_btn.click(refresh_all, inputs=[page_number, col_dropdown, search_text, search_dropdown],
                      outputs=[data_table, total_pages_display, page_number,
                               selected_dataset_id, assigned_to_input, status_input])

    # 🔄 Start auto-push loop (daemon thread: commits the parquet every 5 min).
    threading.Thread(target=auto_push_loop, args=(300,), daemon=True).start()

demo.launch()
datasetcards.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c248074b63bc77b236e8096e3423779f3a5bf4cbe24a2683ea63da31a1c4c154
3
- size 35038132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0d3770a3024eaf459d5c12d2c4a9d0d5a5043660d0a15c062a387595602eacf
3
+ size 38347730
datasetcards_new.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c248074b63bc77b236e8096e3423779f3a5bf4cbe24a2683ea63da31a1c4c154
3
- size 35038132
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0d3770a3024eaf459d5c12d2c4a9d0d5a5043660d0a15c062a387595602eacf
3
+ size 38347730