dataset-rewriter

Sleeping

lhoestq HF staff committed on Sep 13, 2024

Commit

93ec714

1 Parent(s): 06ee5d2

add -1k suffix

Files changed (1) hide show

app.py CHANGED Viewed

@@ -29,6 +29,7 @@ NAMESPACE = "dataset-rewriter"
 NUM_ROWS_PREVIEW = 3
 MAX_NUM_ROWS_TO_REWRITE = 1000
 NUM_PARALLEL_CALLS = 10
 NUM_ROWS_PER_CALL = 10
 MAX_PROGRESS_UPDATES_PER_SECOND = 4
@@ -187,7 +188,7 @@ with gr.Blocks(css=css) as demo:
             out = dataset.split("/")[-1] + out.split("should be", 1)[1].replace(" ", "-").replace(".", "").replace(",", "")
         else:
             out = dataset.split("/")[-1] + prompt.replace(" ", "-")
-        return out[:90] + "-" + Hasher.hash(prompt)[:4]
     def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
         for i, result in enumerate(func(**kwargs)):
@@ -359,7 +360,7 @@ with gr.Blocks(css=css) as demo:
         print(f"Done ReWriting {dataset} (full dataset) with instruction '{prompt}'")
         output_rows = [{k: json.loads(row[k]) for k in output_format_df["column"]} for rows in parallel_output_rows for row in rows]
-        repo_id = namespace + "/" + find_new_name(dataset, prompt)
         yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"⚙️ Saving to {repo_id}": 0.})}
         token = oauth_token.token if oauth_token else save_dataset_hf_token
         print(f"Saving {repo_id}")

 NUM_ROWS_PREVIEW = 3
 MAX_NUM_ROWS_TO_REWRITE = 1000
+PARTIAL_SUFFIX = "-1k"
 NUM_PARALLEL_CALLS = 10
 NUM_ROWS_PER_CALL = 10
 MAX_PROGRESS_UPDATES_PER_SECOND = 4
             out = dataset.split("/")[-1] + out.split("should be", 1)[1].replace(" ", "-").replace(".", "").replace(",", "")
         else:
             out = dataset.split("/")[-1] + prompt.replace(" ", "-")
+        return out[:80] + "-" + Hasher.hash(prompt)[:4]
     def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
         for i, result in enumerate(func(**kwargs)):
         print(f"Done ReWriting {dataset} (full dataset) with instruction '{prompt}'")
         output_rows = [{k: json.loads(row[k]) for k in output_format_df["column"]} for rows in parallel_output_rows for row in rows]
+        repo_id = namespace + "/" + find_new_name(dataset + (PARTIAL_SUFFIX if num_examples > total else ""), prompt)
         yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"⚙️ Saving to {repo_id}": 0.})}
         token = oauth_token.token if oauth_token else save_dataset_hf_token
         print(f"Saving {repo_id}")