Spaces:
Runtime error
Runtime error
Fix minor errors and improve prompts
Browse files
src/synthetic_dataset_generator/apps/rag.py
CHANGED
|
@@ -116,7 +116,7 @@ def _preprocess_input_data(file_paths, num_rows, progress=gr.Progress(track_tqdm
|
|
| 116 |
return (
|
| 117 |
dataframe,
|
| 118 |
gr.Dropdown(
|
| 119 |
-
choices=["
|
| 120 |
label="Documents column",
|
| 121 |
value=col_doc,
|
| 122 |
interactive=(False if col_doc == "" else True),
|
|
@@ -170,7 +170,7 @@ def generate_dataset(
|
|
| 170 |
progress=gr.Progress(),
|
| 171 |
):
|
| 172 |
num_rows = test_max_num_rows(num_rows)
|
| 173 |
-
progress(0.0, desc="
|
| 174 |
if input_type == "prompt-input":
|
| 175 |
chunk_generator = get_chunks_generator(
|
| 176 |
temperature=temperature, is_sample=is_sample
|
|
@@ -399,7 +399,9 @@ def push_dataset(
|
|
| 399 |
retrieval = "Retrieval" in retrieval_reranking
|
| 400 |
reranking = "Reranking" in retrieval_reranking
|
| 401 |
|
| 402 |
-
if input_type
|
|
|
|
|
|
|
| 403 |
dataframe, _ = load_dataset_file(
|
| 404 |
repo_id=original_repo_id,
|
| 405 |
file_paths=file_paths,
|
|
@@ -522,8 +524,12 @@ def push_dataset(
|
|
| 522 |
)
|
| 523 |
|
| 524 |
for item in ["context", "question", "response"]:
|
| 525 |
-
dataframe[f"{item}_length"] = dataframe[item].apply(
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
|
| 529 |
if rg_dataset is None:
|
|
|
|
| 116 |
return (
|
| 117 |
dataframe,
|
| 118 |
gr.Dropdown(
|
| 119 |
+
choices=["chunks"],
|
| 120 |
label="Documents column",
|
| 121 |
value=col_doc,
|
| 122 |
interactive=(False if col_doc == "" else True),
|
|
|
|
| 170 |
progress=gr.Progress(),
|
| 171 |
):
|
| 172 |
num_rows = test_max_num_rows(num_rows)
|
| 173 |
+
progress(0.0, desc="Initializing dataset generation")
|
| 174 |
if input_type == "prompt-input":
|
| 175 |
chunk_generator = get_chunks_generator(
|
| 176 |
temperature=temperature, is_sample=is_sample
|
|
|
|
| 399 |
retrieval = "Retrieval" in retrieval_reranking
|
| 400 |
reranking = "Reranking" in retrieval_reranking
|
| 401 |
|
| 402 |
+
if input_type == "prompt-input":
|
| 403 |
+
dataframe = pd.DataFrame(columns=["context", "question", "response"])
|
| 404 |
+
else:
|
| 405 |
dataframe, _ = load_dataset_file(
|
| 406 |
repo_id=original_repo_id,
|
| 407 |
file_paths=file_paths,
|
|
|
|
| 524 |
)
|
| 525 |
|
| 526 |
for item in ["context", "question", "response"]:
|
| 527 |
+
dataframe[f"{item}_length"] = dataframe[item].apply(
|
| 528 |
+
lambda x: len(x) if x is not None else 0
|
| 529 |
+
)
|
| 530 |
+
dataframe[f"{item}_embeddings"] = get_embeddings(
|
| 531 |
+
dataframe[item].apply(lambda x: x if x is not None else "").to_list()
|
| 532 |
+
)
|
| 533 |
|
| 534 |
rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
|
| 535 |
if rg_dataset is None:
|
src/synthetic_dataset_generator/pipelines/rag.py
CHANGED
|
@@ -18,11 +18,11 @@ DEFAULT_DATASET_DESCRIPTIONS = [
|
|
| 18 |
|
| 19 |
PROMPT_CREATION_PROMPT = """
|
| 20 |
|
| 21 |
-
You are an AI assistant specialized in designing retrieval-augmented generation (RAG) tasks for dataset
|
| 22 |
|
| 23 |
-
Your task is to generate a well-structured and descriptive prompt based on the provided dataset description
|
| 24 |
|
| 25 |
-
The prompt should closely follow the style and structure of the example prompts below. Ensure that you include all relevant details from the dataset description
|
| 26 |
|
| 27 |
Description: A dataset to retrieve information from legal documents.
|
| 28 |
Output: A dataset to retrieve information from a collection of legal documents related to the US law system and the status of contracts.
|
|
@@ -48,9 +48,9 @@ Do not include or reference the retrieval task itself in the generated chunks.
|
|
| 48 |
|
| 49 |
CHUNKS_TEMPLATE = """You have been assigned to generate text chunks based on the following retrieval task: {{ task }}.
|
| 50 |
|
| 51 |
-
Provide only the text chunks without explaining your process or reasoning.
|
| 52 |
|
| 53 |
-
Ensure the chunks are
|
| 54 |
|
| 55 |
Use your general knowledge to create informative and precise outputs.
|
| 56 |
"""
|
|
@@ -145,12 +145,12 @@ def generate_pipeline_code(
|
|
| 145 |
retrieval_reranking: list[str],
|
| 146 |
num_rows: int = 10,
|
| 147 |
) -> str:
|
| 148 |
-
if repo_id is None:
|
| 149 |
-
subset = "default"
|
| 150 |
-
split = "train"
|
| 151 |
-
else:
|
| 152 |
subset = get_dataset_config_names(repo_id)[0]
|
| 153 |
split = get_dataset_split_names(repo_id, subset)[0]
|
|
|
|
|
|
|
|
|
|
| 154 |
retrieval = "Retrieval" in retrieval_reranking
|
| 155 |
reranking = "Reranking" in retrieval_reranking
|
| 156 |
base_code = f"""
|
|
|
|
| 18 |
|
| 19 |
PROMPT_CREATION_PROMPT = """
|
| 20 |
|
| 21 |
+
You are an AI assistant specialized in designing retrieval-augmented generation (RAG) tasks for dataset generation.
|
| 22 |
|
| 23 |
+
Your task is to generate a well-structured and descriptive prompt based on the provided dataset description. Respond with only the generated prompt and nothing else.
|
| 24 |
|
| 25 |
+
The prompt should closely follow the style and structure of the example prompts below. Ensure that you include all relevant details from the dataset description.
|
| 26 |
|
| 27 |
Description: A dataset to retrieve information from legal documents.
|
| 28 |
Output: A dataset to retrieve information from a collection of legal documents related to the US law system and the status of contracts.
|
|
|
|
| 48 |
|
| 49 |
CHUNKS_TEMPLATE = """You have been assigned to generate text chunks based on the following retrieval task: {{ task }}.
|
| 50 |
|
| 51 |
+
Provide only the text chunks without explaining your process or reasoning. Do not include any additional information. Do not indicate that it is a text chunk.
|
| 52 |
|
| 53 |
+
Ensure the chunks are concise, clear, and directly relevant to the task.
|
| 54 |
|
| 55 |
Use your general knowledge to create informative and precise outputs.
|
| 56 |
"""
|
|
|
|
| 145 |
retrieval_reranking: list[str],
|
| 146 |
num_rows: int = 10,
|
| 147 |
) -> str:
|
| 148 |
+
if input_type == "dataset-input" and repo_id is not None:
|
|
|
|
|
|
|
|
|
|
| 149 |
subset = get_dataset_config_names(repo_id)[0]
|
| 150 |
split = get_dataset_split_names(repo_id, subset)[0]
|
| 151 |
+
else:
|
| 152 |
+
subset = "default"
|
| 153 |
+
split = "train"
|
| 154 |
retrieval = "Retrieval" in retrieval_reranking
|
| 155 |
reranking = "Reranking" in retrieval_reranking
|
| 156 |
base_code = f"""
|