Santosh committed on
Commit
bff727e
·
1 Parent(s): ebb0ec5

Fresh push: Dataset Insight Portal with Parquet files via LFS

Browse files
all_minimal_dataset_cards.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5adb59f94fb6f08f5c0859e21e55ed56ec40f40d9cde349427bf24065e775d60
3
+ size 17318878
all_rich_dataset_cards.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ac600eb5100aa7acaeeec3d05becbee7ac11eba9595a0f9e38286879285349
3
+ size 5475858
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import polars as pl
3
+
4
# Parquet sources for the two card collections (local paths or HF Hub URLs).
RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"

# Number of table rows shown per page.
ROWS_PER_PAGE = 50

# Open both files as lazy frames; rows are only read when a query is collected.
lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)

# Dataset currently being browsed; the "missing information" cards are the default.
current_lazy_df = lazy_missing
15
+
16
+ # Helper function to fetch a page
17
def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
    """Return one page of ``lazy_df`` together with the total page count.

    Args:
        lazy_df: Lazy frame to page through.
        page: Zero-based page index. Negative values are treated as 0.
        column: Name of the column to search, or ``None`` for no filtering.
        query: Text to look for in ``column``. The match is a
            case-insensitive *literal* substring match; a blank or
            whitespace-only query disables filtering.

    Returns:
        A ``(page_df, total_pages)`` tuple: ``page_df`` is a pandas DataFrame
        with at most ``ROWS_PER_PAGE`` rows, ``total_pages`` is the number of
        pages in the (filtered) data, which is 0 when nothing matches.
    """
    filtered_df = lazy_df
    needle = query.strip().lower() if query else ""
    if column and needle:
        # Lower-case only inside the predicate so the displayed column keeps
        # its original values and dtype. The needle is matched literally:
        # it has been lower-cased, which would corrupt regex escapes, and
        # ordinary input such as "c++" is not a valid pattern anyway.
        filtered_df = filtered_df.filter(
            pl.col(column)
            .cast(pl.Utf8)
            .str.to_lowercase()
            .str.contains(needle, literal=True)
        )

    # A negative offset would make slice() count from the end of the frame.
    start = max(int(page), 0) * ROWS_PER_PAGE
    page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()

    # Count rows without materialising every column of the filtered frame.
    total_rows = filtered_df.select(pl.len()).collect().item()
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
    return page_df, total_pages
30
+
31
# Render the first page up front so the table and the column picker are
# populated as soon as the UI is built.
initial_df, total_pages = get_page(current_lazy_df, 0)
columns = list(initial_df.columns)

with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")

    # Dataset selection
    dataset_select = gr.Dropdown(
        choices=["DatasetCards rich in information", "DatasetCards missing information"],
        value="DatasetCards missing information",
        label="Select Dataset",
    )

    # Pagination controls. The buttons share a CSS class rather than an
    # element id, because ids must be unique within the page.
    with gr.Row():
        prev_btn = gr.Button("Previous", elem_classes="small-btn")
        next_btn = gr.Button("Next", elem_classes="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df,
        headers=columns,
        datatype="str",
        interactive=False,
        row_count=ROWS_PER_PAGE,
    )

    # Column search
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column")
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_classes="small-btn")
        reset_btn = gr.Button("Reset", elem_classes="small-btn")

    # --- Functions ---
    def load_dataset(dataset_choice):
        """Switch the active dataset and show its first, unfiltered page.

        Returns updates for the table, the page-count label, the page number,
        the column picker and the search box. The search box is cleared so a
        query typed for the previous dataset is not applied by later paging.
        """
        # NOTE(review): current_lazy_df is module-level state, so the selected
        # dataset is shared by every session this process serves; consider
        # per-session state (e.g. gr.State) if that matters.
        global current_lazy_df
        if dataset_choice == "DatasetCards rich in information":
            current_lazy_df = lazy_rich
        else:
            current_lazy_df = lazy_missing
        first_page, page_count = get_page(current_lazy_df, 0)
        new_columns = list(first_page.columns)
        return (
            gr.update(value=first_page, headers=new_columns),
            f"Total Pages: {page_count}",
            0,
            gr.update(choices=new_columns, value=new_columns[0] if new_columns else None),
            "",
        )

    def next_page_func(page, column, query):
        """Advance one page, stopping at the last page of the current results."""
        # The number box yields None when it has been cleared.
        page = int(page or 0) + 1
        page_df, page_count = get_page(current_lazy_df, page, column, query)
        if page >= page_count:
            # Overshot the end: fall back to the last page, or to page 0 when
            # the search matched nothing (page_count == 0).
            page = max(page_count - 1, 0)
            page_df, page_count = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {page_count}", page

    def prev_page_func(page, column, query):
        """Go back one page, never below page 0."""
        page = max(int(page or 0) - 1, 0)
        page_df, page_count = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {page_count}", page

    def search_func(column, query):
        """Apply the search and show the first page of matches."""
        page_df, page_count = get_page(current_lazy_df, 0, column, query)
        return page_df, f"Total Pages: {page_count}", 0

    def reset_func():
        """Drop the search filter and return to the first page.

        Also clears the search box; otherwise the next "Next"/"Previous"
        click would silently re-apply the old query.
        """
        page_df, page_count = get_page(current_lazy_df, 0)
        return page_df, f"Total Pages: {page_count}", 0, ""

    # --- Event Listeners ---
    pager_inputs = [page_number, col_dropdown, search_text]
    pager_outputs = [data_table, total_pages_display, page_number]

    dataset_select.change(
        load_dataset,
        dataset_select,
        [data_table, total_pages_display, page_number, col_dropdown, search_text],
    )
    next_btn.click(next_page_func, pager_inputs, pager_outputs)
    prev_btn.click(prev_page_func, pager_inputs, pager_outputs)
    search_btn.click(search_func, [col_dropdown, search_text], pager_outputs)
    reset_btn.click(reset_func, [], pager_outputs + [search_text])

demo.launch()