lhoestq HF staff committed on
Commit
d7d7fc8
·
1 Parent(s): ec912e5

update app

Browse files
Files changed (1) hide show
  1. app.py +58 -31
app.py CHANGED
@@ -1,42 +1,69 @@
 
 
1
  import gradio as gr
 
2
  import pyarrow.parquet as pq
3
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
4
  from huggingface_hub import HfFileSystem
5
 
6
- fs = HfFileSystem()
7
-
8
- with gr.Blocks() as demo:
9
- with gr.Column():
10
- dataset_search = HuggingfaceHubSearch(
11
- label="Hub Dataset ID",
12
- placeholder="Search for dataset id on Huggingface",
13
- search_type="dataset",
14
- )
15
- revision_textbox = gr.Textbox("main")
16
- parquet_file_dropdown = gr.Dropdown()
17
- with gr.Column():
18
- output_dataframe = gr.DataFrame()
19
 
20
- def _show_input_preview(dataset, revision, parquet_file):
21
- yield {revision_textbox: revision}
22
- if isinstance(parquet_file, int):
23
- parquet_files = fs.glob(f"datasets/{dataset}@{revision}/**/*.parquet")
24
- parquet_file = parquet_files[parquet_file]
25
- yield {parquet_file_dropdown: gr.Dropdown(choices=parquet_files, value=parquet_file)}
26
- else:
27
- yield {parquet_file_dropdown: gr.Dropdown(value=parquet_file)}
28
- yield {output_dataframe: pq.ParquetFile(parquet_file, filesystem=fs).read_row_group(0).to_pandas()}
 
 
 
 
 
 
 
 
 
 
29
 
30
- @dataset_search.change(inputs=[dataset_search], outputs=[revision_textbox, parquet_file_dropdown, output_dataframe])
31
- def show_input_from_dataset_search(dataset):
32
- yield from _show_input_preview(dataset, revision="main", parquet_file=0)
 
 
 
 
 
 
 
 
33
 
34
- @revision_textbox.change(inputs=[dataset_search, revision_textbox], outputs=[revision_textbox, parquet_file_dropdown, output_dataframe])
35
- def show_input_from_revision(dataset, revision):
36
- yield from _show_input_preview(dataset, revision=revision, parquet_file=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- @revision_textbox.change(inputs=[dataset_search, revision_textbox, parquet_file_dropdown], outputs=[revision_textbox, parquet_file_dropdown, output_dataframe])
39
- def show_input_from_parquet_file(dataset, revision, parquet_file):
40
- yield from _show_input_preview(dataset, revision=revision, parquet_file=parquet_file)
41
 
42
  demo.launch()
 
1
+ from typing import Optional
2
+
3
  import gradio as gr
4
+ import pandas as pd
5
  import pyarrow.parquet as pq
6
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
7
  from huggingface_hub import HfFileSystem
8
 
9
# Custom CSS: make the Settings accordion blend into the page background and
# render its toggle-button label in the subdued body-text color, so the panel
# reads as secondary UI next to the main viewer column.
css = """
.settings {
background: transparent;
}
.settings button span {
color: var(--body-text-color-subdued);
}
"""
 
 
 
 
 
17
 
18
# Page layout. Left column (scale=10): dataset search, revision and
# parquet-file pickers, and the content preview table. Right column
# (scale=4): a collapsible "Settings" accordion holding the Hugging Face
# login button, used to access private/gated repos.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# πŸ‘€ Parquet Viewer πŸ“š")
    gr.Markdown("View the content of Parquet files inside a dataset repository or pull request.")
    with gr.Row():
        with gr.Column(scale=10):
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
            with gr.Row():
                # allow_custom_value lets the user paste an arbitrary
                # revision / file path instead of picking from the choices
                # populated by the change handlers below.
                revision_dropdown = gr.Dropdown("main", label="Revision", allow_custom_value=True)
                parquet_file_dropdown = gr.Dropdown(label="Parquet file", allow_custom_value=True)
            gr.Markdown("Parquet content:")
            output_dataframe = gr.DataFrame()
        # NOTE(review): gr.Column's min_width is documented as an int of
        # pixels — "200px" (a string) may be rejected or silently coerced;
        # confirm against the installed gradio version.
        with gr.Column(scale=4, min_width="200px"):
            with gr.Accordion("Settings", open=False, elem_classes="settings"):
                gr.Markdown("Access private/gated repos")
                gr.LoginButton()
 
38
@dataset_search.change(inputs=[dataset_search], outputs=[revision_dropdown, parquet_file_dropdown, output_dataframe])
def dataset_update(dataset, oauth_token: Optional[gr.OAuthToken] = None):
    """Populate the revision dropdown for the selected dataset.

    Lists "{dataset}@main" plus one "{dataset}@refs/pr/N" entry per open
    pull request. Only `revision_dropdown` is updated here; the parquet-file
    dropdown and the dataframe refresh through the chained
    `revision_dropdown.change` event.

    Args:
        dataset: Hub dataset id typed/selected in the search box.
        oauth_token: injected by Gradio when the user is logged in; None
            for anonymous access.
    """
    # gr.OAuthToken is a wrapper object — HfFileSystem expects the raw
    # token string (or None), so unwrap it with `.token`.
    fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
    if "/" not in dataset:
        # Incomplete repo id (user still typing): clear the dropdown.
        return {revision_dropdown: gr.Dropdown(choices=[], value="", info="")}
    try:
        prs = [
            f"{dataset}@refs/pr/{pr.num}"
            for pr in fs._api.get_repo_discussions(dataset, repo_type="dataset", discussion_type="pull_request")
        ]
        revision = f"{dataset}@main"
        info = f"{len(prs)} pull request{'s' if len(prs) > 1 else ''} available" if prs else None
        return {revision_dropdown: gr.Dropdown(choices=[revision] + prs, value=revision, info=info)}
    except Exception:
        # Best-effort UI: repo may not exist or be private without access.
        return {revision_dropdown: gr.Dropdown(choices=[], value="", info="no revisions available")}
49
 
50
@revision_dropdown.change(inputs=[revision_dropdown], outputs=[parquet_file_dropdown, output_dataframe])
def revision_update(dataset_and_revision, oauth_token: Optional[gr.OAuthToken] = None):
    """Populate the parquet-file dropdown for the selected revision.

    Globs "datasets/{dataset}@{revision}/**/*.parquet" on the Hub filesystem
    and pre-selects the first match. The dataframe itself refreshes through
    the chained `parquet_file_dropdown.change` event.

    Args:
        dataset_and_revision: "{dataset}@{revision}" string from the
            revision dropdown.
        oauth_token: injected by Gradio when the user is logged in.
    """
    # Unwrap gr.OAuthToken — HfFileSystem needs the raw string (or None).
    fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
    try:
        parquet_files = ["hf://" + path for path in fs.glob(f"datasets/{dataset_and_revision}/**/*.parquet")]
        parquet_file = parquet_files[0] if parquet_files else None
        # "!= 1" so that 0 files reads "0 parquet files available".
        info = f"{len(parquet_files)} parquet file{'s' if len(parquet_files) != 1 else ''} available"
        return {parquet_file_dropdown: gr.Dropdown(choices=parquet_files, value=parquet_file, info=info)}
    except Exception:
        # Invalid/inaccessible revision: clear the dropdown quietly.
        return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
59
+
60
@parquet_file_dropdown.change(inputs=[parquet_file_dropdown], outputs=[output_dataframe])
def parquet_file_update(parquet_file, oauth_token: Optional[gr.OAuthToken] = None):
    """Preview the first row group of the selected Parquet file.

    Every cell is stringified and truncated to 1000 characters so huge
    values don't blow up the rendered table.

    Args:
        parquet_file: "hf://datasets/..." path from the dropdown, or a
            falsy value when nothing is selected.
        oauth_token: injected by Gradio when the user is logged in.
    """
    # Unwrap gr.OAuthToken — HfFileSystem needs the raw string (or None).
    fs = HfFileSystem(token=oauth_token.token if oauth_token else None)
    if not parquet_file:
        return {output_dataframe: pd.DataFrame()}
    try:
        first_row_group = pq.ParquetFile(parquet_file, filesystem=fs).read_row_group(0)
        rows = [{k: str(v)[:1000] for k, v in row.items()} for row in first_row_group.to_pylist()]
        return {output_dataframe: pd.DataFrame(rows)}
    except Exception:
        # Consistent with the success paths: return an (empty) DataFrame
        # rather than a bare list.
        return {output_dataframe: pd.DataFrame()}
67
 
 
 
 
68
 
69
# Start the Gradio server (blocking call).
demo.launch()