Spaces:
Running
Running
rename + parse json
Browse files
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.6.0
|
| 8 |
app_file: app.py
|
|
|
|
| 1 |
---
|
| 2 |
+
title: DuckDB Spreadsheets
|
| 3 |
+
emoji: π₯π
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.6.0
|
| 8 |
app_file: app.py
|
app.py
CHANGED
|
@@ -2,14 +2,17 @@ from functools import partial, lru_cache
|
|
| 2 |
|
| 3 |
import duckdb
|
| 4 |
import gradio as gr
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
import pyarrow as pa
|
|
|
|
| 7 |
import requests
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
|
| 10 |
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
| 11 |
EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
|
| 12 |
EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
|
|
|
|
| 13 |
MAX_NUM_COLUMNS = 20
|
| 14 |
NUM_TRENDING_DATASETS = 10
|
| 15 |
NUM_USER_DATASETS = 10
|
|
@@ -102,6 +105,14 @@ def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
|
|
| 102 |
prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
|
| 103 |
elif pa.types.is_string(field.type):
|
| 104 |
prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
elif pa.types.is_date(field.type):
|
| 106 |
prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
|
| 107 |
elif pa.types.is_list(field.type):
|
|
@@ -122,7 +133,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 122 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
| 123 |
input_table_state = gr.State()
|
| 124 |
run_button = gr.Button(visible=False, elem_id="run_button")
|
| 125 |
-
gr.Markdown("#
|
| 126 |
with gr.Group():
|
| 127 |
with gr.Row():
|
| 128 |
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
|
|
@@ -133,7 +144,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 133 |
transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
|
| 134 |
transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
|
| 135 |
dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
|
| 136 |
-
with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
|
| 137 |
code_markdown = gr.Markdown()
|
| 138 |
|
| 139 |
def show_subset_dropdown(dataset: str):
|
|
@@ -153,7 +164,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 153 |
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
|
| 154 |
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
|
| 155 |
if dataset and subset and split and pattern:
|
| 156 |
-
table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").arrow()
|
| 157 |
else:
|
| 158 |
table = EMPTY_TABLE
|
| 159 |
prepared_functions = get_prepared_functions_from_table(table)
|
|
@@ -181,7 +192,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 181 |
code_markdown: (
|
| 182 |
"```sql\n"
|
| 183 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 184 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
| 185 |
+ "\n```"
|
| 186 |
) if pattern else "",
|
| 187 |
}
|
|
@@ -213,7 +224,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 213 |
code_markdown: (
|
| 214 |
"```sql\n"
|
| 215 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 216 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
| 217 |
+ "\n```"
|
| 218 |
) if pattern else "",
|
| 219 |
}
|
|
@@ -234,7 +245,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 234 |
code_markdown: (
|
| 235 |
"```sql\n"
|
| 236 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 237 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
| 238 |
+ "\n```"
|
| 239 |
) if pattern else "",
|
| 240 |
}
|
|
@@ -252,7 +263,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 252 |
code_markdown: (
|
| 253 |
"```sql\n"
|
| 254 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 255 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
| 256 |
+ "\n```"
|
| 257 |
) if pattern else "",
|
| 258 |
}
|
|
@@ -268,7 +279,7 @@ with gr.Blocks(css=css, js=js) as demo:
|
|
| 268 |
code_markdown: (
|
| 269 |
"```sql\n"
|
| 270 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 271 |
-
+ f"FROM 'hf://datasets/{dataset}/{pattern}';"
|
| 272 |
+ "\n```"
|
| 273 |
) if pattern else "",
|
| 274 |
}
|
|
|
|
| 2 |
|
| 3 |
import duckdb
|
| 4 |
import gradio as gr
|
| 5 |
+
import json
|
| 6 |
import pandas as pd
|
| 7 |
import pyarrow as pa
|
| 8 |
+
import pyarrow.compute as pc
|
| 9 |
import requests
|
| 10 |
from huggingface_hub import HfApi
|
| 11 |
|
| 12 |
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
|
| 13 |
EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
|
| 14 |
EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
|
| 15 |
+
NUM_ROWS = 10
|
| 16 |
MAX_NUM_COLUMNS = 20
|
| 17 |
NUM_TRENDING_DATASETS = 10
|
| 18 |
NUM_USER_DATASETS = 10
|
|
|
|
| 105 |
prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
|
| 106 |
elif pa.types.is_string(field.type):
|
| 107 |
prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
|
| 108 |
+
# try parsing json
|
| 109 |
+
if pc.all(pc.starts_with(table[field.name], "{")).as_py() or pc.all(pc.starts_with(table[field.name], "[")).as_py():
|
| 110 |
+
try:
|
| 111 |
+
json_parsed_table = pa.Table.from_pylist([{field.name: json.loads(row)} for row in table[field.name].to_pylist()])
|
| 112 |
+
parsed_type = str(duckdb.from_arrow(json_parsed_table).dtypes[0])
|
| 113 |
+
prepared_functions[field.name] = [f"CAST({field.name} as {parsed_type})"] + prepared_functions[field.name]
|
| 114 |
+
except Exception:
|
| 115 |
+
pass
|
| 116 |
elif pa.types.is_date(field.type):
|
| 117 |
prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
|
| 118 |
elif pa.types.is_list(field.type):
|
|
|
|
| 133 |
dataset_subset_split_textbox = gr.Textbox(visible=False)
|
| 134 |
input_table_state = gr.State()
|
| 135 |
run_button = gr.Button(visible=False, elem_id="run_button")
|
| 136 |
+
gr.Markdown("# DuckDB Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
|
| 137 |
with gr.Group():
|
| 138 |
with gr.Row():
|
| 139 |
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
|
|
|
|
| 144 |
transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
|
| 145 |
transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
|
| 146 |
dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
|
| 147 |
+
with gr.Accordion("Show DuckDB SQL command", open=False, elem_classes="transparent-accordion"):
|
| 148 |
code_markdown = gr.Markdown()
|
| 149 |
|
| 150 |
def show_subset_dropdown(dataset: str):
|
|
|
|
| 164 |
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
|
| 165 |
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
|
| 166 |
if dataset and subset and split and pattern:
|
| 167 |
+
table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS}").arrow()
|
| 168 |
else:
|
| 169 |
table = EMPTY_TABLE
|
| 170 |
prepared_functions = get_prepared_functions_from_table(table)
|
|
|
|
| 192 |
code_markdown: (
|
| 193 |
"```sql\n"
|
| 194 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 195 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
| 196 |
+ "\n```"
|
| 197 |
) if pattern else "",
|
| 198 |
}
|
|
|
|
| 224 |
code_markdown: (
|
| 225 |
"```sql\n"
|
| 226 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 227 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
| 228 |
+ "\n```"
|
| 229 |
) if pattern else "",
|
| 230 |
}
|
|
|
|
| 245 |
code_markdown: (
|
| 246 |
"```sql\n"
|
| 247 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 248 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
| 249 |
+ "\n```"
|
| 250 |
) if pattern else "",
|
| 251 |
}
|
|
|
|
| 263 |
code_markdown: (
|
| 264 |
"```sql\n"
|
| 265 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 266 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
| 267 |
+ "\n```"
|
| 268 |
) if pattern else "",
|
| 269 |
}
|
|
|
|
| 279 |
code_markdown: (
|
| 280 |
"```sql\n"
|
| 281 |
+ f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
|
| 282 |
+
+ f"FROM 'hf://datasets/{dataset}/{pattern}' LIMIT {NUM_ROWS};"
|
| 283 |
+ "\n```"
|
| 284 |
) if pattern else "",
|
| 285 |
}
|