Spaces:
Sleeping
Sleeping
add proper structured generation
Browse files
app.py
CHANGED
@@ -27,8 +27,8 @@ NUM_ROWS_PREVIEW = 3
|
|
27 |
REWRITE_DATASET = (
|
28 |
"A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
|
29 |
"They want you to rewrite the dataset and apply this transformation: {prompt}."
|
30 |
-
"The first rows of the dataset are below in JSON format
|
31 |
-
"Rewrite those rows from the '{dataset}' dataset using the same
|
32 |
"Try to keep some of the text or meaning intact, and apply the requested transformation '{prompt}'."
|
33 |
)
|
34 |
|
@@ -49,15 +49,15 @@ with gr.Blocks() as demo:
|
|
49 |
split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
|
50 |
|
51 |
gr.Markdown("### Input")
|
52 |
-
|
53 |
-
pretty_input_preview = gr.DataFrame(interactive=False, wrap=True)
|
54 |
|
55 |
gr.Markdown("### ReWrite")
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
59 |
rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
|
60 |
-
|
61 |
save_button = gr.Button("ReWrite Full Dataset", interactive=False)
|
62 |
|
63 |
|
@@ -193,11 +193,11 @@ with gr.Blocks() as demo:
|
|
193 |
subset = default_subset if default_subset in subsets else subsets[0]
|
194 |
splits: list[str] = info_resp["dataset_info"][subset]["splits"]
|
195 |
split = default_split if default_split in splits else splits[0]
|
196 |
-
|
197 |
return subset, split, {
|
198 |
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
199 |
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
200 |
-
|
201 |
}
|
202 |
|
203 |
|
@@ -205,33 +205,36 @@ with gr.Blocks() as demo:
|
|
205 |
subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
|
206 |
if subset is None or split is None:
|
207 |
return output
|
|
|
208 |
rows = list(islice((stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW)), NUM_ROWS_PREVIEW))
|
209 |
return {
|
210 |
-
|
211 |
-
pretty_input_preview: pd.DataFrame([{k: str(v) for k, v in row.items()} for row in rows]),
|
212 |
**output
|
213 |
}
|
214 |
|
215 |
|
216 |
-
@dataset_search.change(inputs=[dataset_search], outputs=[
|
217 |
def show_input_from_dataset_search(dataset: str) -> dict:
|
218 |
return _show_input_preview(dataset, default_subset="default", default_split="train")
|
219 |
|
220 |
-
@subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[
|
221 |
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
|
222 |
return _show_input_preview(dataset, default_subset=subset, default_split="train")
|
223 |
|
224 |
-
@split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[
|
225 |
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
|
226 |
return _show_input_preview(dataset, default_subset=subset, default_split=split)
|
227 |
|
228 |
|
229 |
-
@rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown,
|
230 |
-
def rewrite(dataset: str, subset: str, split: str,
|
231 |
-
rows =
|
|
|
|
|
232 |
output_rows = []
|
233 |
-
|
234 |
-
|
|
|
235 |
yield pd.DataFrame(output_rows)
|
236 |
|
237 |
|
|
|
27 |
# Prompt template asking a model to rewrite the preview rows of a dataset.
# Placeholders present in the template: {dataset}, {prompt} and {rows}
# (presumably filled in with str.format by the caller -- confirm at the call site).
REWRITE_DATASET = (
    "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
    # The trailing space matters: adjacent literals are concatenated verbatim,
    # and without it the prompt reads "...{prompt}.The first rows...".
    "They want you to rewrite the dataset and apply this transformation: {prompt}. "
    "The first rows of the dataset are below in JSON format:\n\n{rows}\n\n"
    "Rewrite those rows from the '{dataset}' dataset using the same JSON format. "
    "Try to keep some of the text or meaning intact, and apply the requested transformation '{prompt}'."
)
|
34 |
|
|
|
49 |
split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
|
50 |
|
51 |
gr.Markdown("### Input")
|
52 |
+
pretty_input_preview = gr.DataFrame(interactive=False)
|
|
|
53 |
|
54 |
gr.Markdown("### ReWrite")
|
55 |
+
with gr.Group():
|
56 |
+
input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
|
57 |
+
with gr.Accordion("(Advanced) Edit columns", open=False):
|
58 |
+
output_format_dataframe = gr.DataFrame(col_count=(2, "fixed"), headers=["column", "type"])
|
59 |
rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
|
60 |
+
pretty_output_preview = gr.DataFrame(interactive=False)
|
61 |
save_button = gr.Button("ReWrite Full Dataset", interactive=False)
|
62 |
|
63 |
|
|
|
193 |
subset = default_subset if default_subset in subsets else subsets[0]
|
194 |
splits: list[str] = info_resp["dataset_info"][subset]["splits"]
|
195 |
split = default_split if default_split in splits else splits[0]
|
196 |
+
dict_format = features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"]))
|
197 |
return subset, split, {
|
198 |
subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
|
199 |
split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
|
200 |
+
output_format_dataframe: pd.DataFrame([{"column": col, "type": json.dumps(format_type)} for col, format_type in dict_format["properties"].items()])
|
201 |
}
|
202 |
|
203 |
|
|
|
205 |
subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
|
206 |
if subset is None or split is None:
|
207 |
return output
|
208 |
+
print(f"Showing {dataset}")
|
209 |
rows = list(islice((stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW)), NUM_ROWS_PREVIEW))
|
210 |
return {
|
211 |
+
pretty_input_preview: gr.DataFrame(pd.DataFrame([{k: json.dumps(v, ensure_ascii=False) for k, v in row.items()} for row in rows])),
|
|
|
212 |
**output
|
213 |
}
|
214 |
|
215 |
|
216 |
+
@dataset_search.change(
    inputs=[dataset_search],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_dataset_search(dataset: str) -> dict:
    """Handle a change of the dataset search component.

    Delegates to ``_show_input_preview`` with "default" as the preferred
    subset and "train" as the preferred split, and returns its result
    unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset="default", default_split="train")
    return preview_updates
|
219 |
|
220 |
+
@subset_dropdown.change(
    inputs=[dataset_search, subset_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
    """Handle a change of the subset dropdown.

    Delegates to ``_show_input_preview`` with the chosen subset as the
    preferred subset and "train" as the preferred split, and returns its
    result unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split="train")
    return preview_updates
|
223 |
|
224 |
+
@split_dropdown.change(
    inputs=[dataset_search, subset_dropdown, split_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
    """Handle a change of the split dropdown.

    Delegates to ``_show_input_preview`` with the currently selected subset
    and split as the preferred values, and returns its result unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split=split)
    return preview_updates
|
227 |
|
228 |
|
229 |
+
@rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, pretty_input_preview, input_prompt, output_format_dataframe], outputs=[pretty_output_preview])
def rewrite(dataset: str, subset: str, split: str, pretty_input_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Rewrite the previewed rows and stream the growing output preview.

    Args:
        dataset: Dataset name, forwarded to the rewriting helper and logged.
        subset: Selected subset (accepted from the UI, not used here).
        split: Selected split (accepted from the UI, not used here).
        pretty_input_preview_df: Input preview whose cells are JSON-encoded
            strings; each cell is decoded with ``json.loads``.
        prompt: Transformation instructions entered by the user.
        output_format_df: Table with a "column" and a "type" field per row;
            "type" is a JSON-encoded description of the column's format.

    Yields:
        A DataFrame containing every rewritten row received so far, with each
        cell JSON-encoded again for display.
    """
    rows = [
        {k: json.loads(v) for k, v in row.items()}
        for row in pretty_input_preview_df.to_dict(orient="records")
    ]
    # Build the output format from the (possibly user-edited) column table.
    # Named `output_format` so the builtin `format` is not shadowed.
    column_specs = output_format_df.to_dict(orient="records")
    output_columns = [spec["column"] for spec in column_specs]
    output_format = {
        "properties": {spec["column"]: json.loads(spec["type"]) for spec in column_specs},
        "required": list(output_columns),
    }
    output_rows = []
    print(f"ReWriting {dataset} with instructions '{prompt}'")
    for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=output_format):
        # Key the output by the requested output columns rather than the input
        # preview's columns: the generated rows follow `output_format`, so a
        # column renamed or removed in the editor would otherwise raise
        # KeyError, and an added column would be dropped.
        output_rows.append({k: json.dumps(row[k], ensure_ascii=False) for k in output_columns})
        yield pd.DataFrame(output_rows)
|
239 |
|
240 |
|