lhoestq HF staff committed on
Commit
91400d0
·
1 Parent(s): 65c21a7

add proper structured generation

Browse files
Files changed (1) hide show
  1. app.py +23 -20
app.py CHANGED
@@ -27,8 +27,8 @@ NUM_ROWS_PREVIEW = 3
27
  REWRITE_DATASET = (
28
  "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
29
  "They want you to rewrite the dataset and apply this transformation: {prompt}."
30
- "The first rows of the dataset are below in JSON format (one JSON object per line):\n\n{rows}\n\n"
31
- "Rewrite those rows from the '{dataset}' dataset using the same format (one JSON object per line). "
32
  "Try to keep some of the text or meaning intact, and apply the requested transformation '{prompt}'."
33
  )
34
 
@@ -49,15 +49,15 @@ with gr.Blocks() as demo:
49
  split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
50
 
51
  gr.Markdown("### Input")
52
- input_preview = gr.DataFrame(visible=False)
53
- pretty_input_preview = gr.DataFrame(interactive=False, wrap=True)
54
 
55
  gr.Markdown("### ReWrite")
56
- input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
57
- with gr.Accordion("Modify Format", open=False):
58
- output_format = gr.Textbox(interactive=True, show_label=False, container=False)
 
59
  rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
60
- output_preview = gr.DataFrame(interactive=False, wrap=True)
61
  save_button = gr.Button("ReWrite Full Dataset", interactive=False)
62
 
63
 
@@ -193,11 +193,11 @@ with gr.Blocks() as demo:
193
  subset = default_subset if default_subset in subsets else subsets[0]
194
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
195
  split = default_split if default_split in splits else splits[0]
196
- json_format = json.dumps(features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"])), indent=2)
197
  return subset, split, {
198
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
199
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
200
- output_format: gr.Textbox(json_format, lines=json_format.count("\n") + 1)
201
  }
202
 
203
 
@@ -205,33 +205,36 @@ with gr.Blocks() as demo:
205
  subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
206
  if subset is None or split is None:
207
  return output
 
208
  rows = list(islice((stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW)), NUM_ROWS_PREVIEW))
209
  return {
210
- input_preview: pd.DataFrame(rows),
211
- pretty_input_preview: pd.DataFrame([{k: str(v) for k, v in row.items()} for row in rows]),
212
  **output
213
  }
214
 
215
 
216
- @dataset_search.change(inputs=[dataset_search], outputs=[input_preview, pretty_input_preview, subset_dropdown, split_dropdown, output_format])
217
  def show_input_from_dataset_search(dataset: str) -> dict:
218
  return _show_input_preview(dataset, default_subset="default", default_split="train")
219
 
220
- @subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[input_preview, pretty_input_preview, subset_dropdown, split_dropdown, output_format])
221
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
222
  return _show_input_preview(dataset, default_subset=subset, default_split="train")
223
 
224
- @split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[input_preview, pretty_input_preview, subset_dropdown, split_dropdown, output_format])
225
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
226
  return _show_input_preview(dataset, default_subset=subset, default_split=split)
227
 
228
 
229
- @rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, input_preview, input_prompt, output_format], outputs=[output_preview])
230
- def rewrite(dataset: str, subset: str, split: str, input_preview_df: pd.DataFrame, prompt: str, json_format: str) -> Iterator[pd.DataFrame]:
231
- rows = input_preview_df.to_dict(orient="records")
 
 
232
  output_rows = []
233
- for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=json.loads(json_format)):
234
- output_rows.append(row)
 
235
  yield pd.DataFrame(output_rows)
236
 
237
 
 
# Prompt template for rewriting dataset rows.
# Placeholders filled in with str.format: {dataset}, {prompt} and {rows}.
# Adjacent string literals are concatenated verbatim, so every fragment that is
# followed by another sentence must end with its own separating space.
REWRITE_DATASET = (
    "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
    "They want you to rewrite the dataset and apply this transformation: {prompt}. "
    "The first rows of the dataset are below in JSON format:\n\n{rows}\n\n"
    "Rewrite those rows from the '{dataset}' dataset using the same JSON format. "
    "Try to keep some of the text or meaning intact, and apply the requested transformation '{prompt}'."
)
34
 
 
49
  split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
50
 
51
  gr.Markdown("### Input")
52
+ pretty_input_preview = gr.DataFrame(interactive=False)
 
53
 
54
  gr.Markdown("### ReWrite")
55
+ with gr.Group():
56
+ input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
57
+ with gr.Accordion("(Advanced) Edit columns", open=False):
58
+ output_format_dataframe = gr.DataFrame(col_count=(2, "fixed"), headers=["column", "type"])
59
  rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
60
+ pretty_output_preview = gr.DataFrame(interactive=False)
61
  save_button = gr.Button("ReWrite Full Dataset", interactive=False)
62
 
63
 
 
193
  subset = default_subset if default_subset in subsets else subsets[0]
194
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
195
  split = default_split if default_split in splits else splits[0]
196
+ dict_format = features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"]))
197
  return subset, split, {
198
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
199
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
200
+ output_format_dataframe: pd.DataFrame([{"column": col, "type": json.dumps(format_type)} for col, format_type in dict_format["properties"].items()])
201
  }
202
 
203
 
 
205
  subset, split, output = _resolve_dataset_selection(dataset, default_subset=default_subset, default_split=default_split)
206
  if subset is None or split is None:
207
  return output
208
+ print(f"Showing {dataset}")
209
  rows = list(islice((stream_rows(dataset, subset, split, batch_size=NUM_ROWS_PREVIEW)), NUM_ROWS_PREVIEW))
210
  return {
211
+ pretty_input_preview: gr.DataFrame(pd.DataFrame([{k: json.dumps(v, ensure_ascii=False) for k, v in row.items()} for row in rows])),
 
212
  **output
213
  }
214
 
215
 
216
@dataset_search.change(
    inputs=[dataset_search],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_dataset_search(dataset: str) -> dict:
    """Handle a change of the dataset search value.

    Delegates to `_show_input_preview`, passing "default" as the preferred
    subset and "train" as the preferred split, and returns its component
    updates unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset="default", default_split="train")
    return preview_updates
219
 
220
@subset_dropdown.change(
    inputs=[dataset_search, subset_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
    """Handle a change of the subset dropdown.

    Delegates to `_show_input_preview`, passing the selected subset as the
    preferred subset and "train" as the preferred split, and returns its
    component updates unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split="train")
    return preview_updates
223
 
224
@split_dropdown.change(
    inputs=[dataset_search, subset_dropdown, split_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe],
)
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
    """Handle a change of the split dropdown.

    Delegates to `_show_input_preview`, passing the selected subset and split
    as the preferred values, and returns its component updates unchanged.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split=split)
    return preview_updates
227
 
228
 
229
@rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, pretty_input_preview, input_prompt, output_format_dataframe], outputs=[pretty_output_preview])
def rewrite(dataset: str, subset: str, split: str, pretty_input_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Rewrite the previewed rows and stream the growing output preview.

    Args:
        dataset: Name of the dataset, used in the log line and passed to the rewriter.
        subset: Selected subset (received from the UI, not used here).
        split: Selected split (received from the UI, not used here).
        pretty_input_preview_df: Input preview whose cells are JSON-encoded strings.
        prompt: Transformation the user asked for.
        output_format_df: Table with one row per output column, holding the
            column name under "column" and its JSON-encoded type under "type".

    Yields:
        A DataFrame with all rows rewritten so far, each cell JSON-encoded.
    """
    # The preview cells are JSON strings; decode them back into values.
    rows = [
        {column: json.loads(cell) for column, cell in record.items()}
        for record in pretty_input_preview_df.to_dict(orient="records")
    ]
    column_specs = output_format_df.to_dict(orient="records")
    output_columns = [spec["column"] for spec in column_specs]
    # Named `output_format` rather than `format` to avoid shadowing the builtin.
    output_format = {
        "properties": {spec["column"]: json.loads(spec["type"]) for spec in column_specs},
        "required": output_columns,
    }
    output_rows = []
    print(f"ReWriting {dataset} with instructions '{prompt}'")
    for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=output_format):
        # Key the output by the columns declared in the (user-editable) output
        # format, not by the input preview's columns: the two differ as soon as
        # a column is renamed or added in the format table.
        # NOTE(review): assumes each generated row contains every "required" column - confirm against the rewriter.
        output_rows.append({column: json.dumps(row[column], ensure_ascii=False) for column in output_columns})
        yield pd.DataFrame(output_rows)
239
 
240