"""Dataset Insight Portal: a paginated, searchable Gradio viewer for DatasetCards.

The combined Parquet file is scanned lazily with Polars. Every UI action
collects only the requested slice of rows plus a row count for the pager.
"""

from typing import Optional

import gradio as gr
import polars as pl

# Path for the combined Parquet file
COMBINED_PARQUET_PATH = "datasetcards.parquet"
ROWS_PER_PAGE = 50

# Lazy load dataset
lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)
current_lazy_df = lazy_df  # single dataset


def _apply_search(frame: pl.LazyFrame, column: Optional[str], query: Optional[str]) -> pl.LazyFrame:
    """Return *frame* restricted to rows whose *column* contains *query*.

    The match is a case-insensitive, literal substring test. The frame is
    returned unchanged when no column is chosen or the query is blank.
    """
    # Normalize before testing for emptiness so a whitespace-only query does
    # not turn into contains(""), which would silently drop null rows.
    needle = (query or "").strip().lower()
    if not column or not needle:
        return frame
    # Lower-case only inside the predicate: the displayed values keep their
    # original casing. literal=True keeps characters such as "(" or "[" from
    # being interpreted as (possibly invalid) regex syntax.
    haystack = pl.col(column).cast(pl.Utf8).str.to_lowercase()
    return frame.filter(haystack.str.contains(needle, literal=True))


def _count_pages(frame: pl.LazyFrame) -> int:
    """Return how many pages of ROWS_PER_PAGE rows *frame* spans (0 if empty)."""
    # Count rows in the query itself instead of collecting every row.
    total_rows = frame.select(pl.len()).collect().item()
    return (total_rows - 1) // ROWS_PER_PAGE + 1


def _fetch_rows(frame: pl.LazyFrame, page: int):
    """Collect the rows of zero-based *page* from *frame* as a pandas DataFrame."""
    # A negative offset would make slice() count from the end of the frame.
    start = max(page, 0) * ROWS_PER_PAGE
    return frame.slice(start, ROWS_PER_PAGE).collect().to_pandas()


# Helper function to fetch a page
def get_page(lazy_df: pl.LazyFrame, page: int, column: Optional[str] = None, query: str = ""):
    """Fetch one page of rows, optionally filtered by a column search.

    Args:
        lazy_df: Lazy frame to read from.
        page: Zero-based page index; negative values are treated as 0.
        column: Column to search in, or None for no filtering.
        query: Case-insensitive literal substring to look for in ``column``.

    Returns:
        A ``(page_df, total_pages)`` tuple: the page as a pandas DataFrame and
        the number of pages in the (filtered) data, which is 0 when no rows
        match.
    """
    filtered_df = _apply_search(lazy_df, column, query)
    return _fetch_rows(filtered_df, page), _count_pages(filtered_df)


def _show_page(page, column=None, query=""):
    """Build the (table, total-pages label, page number) outputs for the UI.

    The requested page is clamped to the valid range so the pager can never
    show a negative page or run past the last one. A cleared page field
    (None) is treated as page 0.
    """
    filtered_df = _apply_search(current_lazy_df, column, query)
    total_pages = _count_pages(filtered_df)
    last_page = max(total_pages - 1, 0)
    page = min(max(int(page or 0), 0), last_page)
    return _fetch_rows(filtered_df, page), f"Total Pages: {total_pages}", page


# --- Functions ---
def next_page_func(page, column, query):
    """Advance one page, stopping at the last page of the current search."""
    return _show_page(int(page or 0) + 1, column, query)


def prev_page_func(page, column, query):
    """Go back one page, stopping at page 0."""
    return _show_page(int(page or 0) - 1, column, query)


def search_func(column, query):
    """Apply the column search and jump to the first page of the results."""
    return _show_page(0, column, query)


def reset_func():
    """Show the first page of the unfiltered dataset."""
    return _show_page(0)


def _clear_search_text():
    """Empty the search box so later paging does not re-apply a stale filter."""
    return ""


# Initialize first page
initial_df, total_pages = get_page(lazy_df, 0)
columns = list(initial_df.columns)

with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")
    gr.Markdown("This space allows you to explore the combined dataset of DatasetCards. "
                "You can navigate pages, search within columns, and inspect the dataset easily.")

    # Pagination controls
    with gr.Row():
        prev_btn = gr.Button("Previous", elem_id="small-btn")
        next_btn = gr.Button("Next", elem_id="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df, headers=columns, datatype="str",
        interactive=False, row_count=ROWS_PER_PAGE
    )

    # Column search
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column")
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_id="small-btn")
        reset_btn = gr.Button("Reset", elem_id="small-btn")

    # --- Event Listeners ---
    pager_outputs = [data_table, total_pages_display, page_number]
    search_inputs = [col_dropdown, search_text]

    next_btn.click(next_page_func, [page_number, *search_inputs], pager_outputs)
    prev_btn.click(prev_page_func, [page_number, *search_inputs], pager_outputs)
    search_btn.click(search_func, search_inputs, pager_outputs)
    reset_btn.click(reset_func, [], pager_outputs)
    # Reset must also clear the query text; otherwise the next Previous/Next
    # click would read the old text and filter again.
    reset_btn.click(_clear_search_text, [], search_text)

demo.launch()