Commit 7b3a105
Parent(s): df69857
first commit

Files changed:
- app.py +147 -0
- personas.py +71 -0
app.py
ADDED
@@ -0,0 +1,147 @@
import io
import multiprocessing
import os
import time

import gradio as gr
import pandas as pd
from unstructured.partition.pdf import partition_pdf
import nltk
from distilabel.pipeline import Pipeline
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps import LoadDataFromDicts, KeepColumns
from distilabel.steps.tasks import TextGeneration

from personas import TextToPersona

nltk.download("punkt", quiet=True)

PROMPT_TEMPLATE = """\
Generate a single prompt the persona below might ask to an AI assistant:

{{ persona }}
"""

# Read the Hugging Face token from the environment.
HF_TOKEN = os.environ.get("HF_TOKEN")


def process_pdfs(pdf_files):
    """Extract the plain text of each uploaded PDF into one record per file."""
    all_data = []
    for pdf_file in pdf_files:
        elements = partition_pdf(pdf_file.name)

        full_text = ""
        for element in elements:
            full_text += element.text + "\n"

        all_data.append({"text": full_text.strip()})

    return all_data


def _run_pipeline(result_queue, pdf_files):
    """Build and run the distilabel pipeline in a worker process."""
    data = process_pdfs(pdf_files)

    with Pipeline(name="personahub-fineweb-edu-text-to-persona") as pipeline:
        input_batch_size = 10

        data_loader = LoadDataFromDicts(data=data)

        llm = InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
            api_key=HF_TOKEN,
        )

        # Derive a persona definition from each document's text.
        text_to_persona = TextToPersona(
            llm=llm,
            input_batch_size=input_batch_size,
        )

        # Generate an instruction the persona might ask an assistant.
        text_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert at simulating user interactions.",
            template=PROMPT_TEMPLATE,
            columns="persona",
            output_mappings={"generation": "instruction"},
            num_generations=1,
        )

        # Answer the generated instruction.
        response_gen = TextGeneration(
            llm=llm,
            system_prompt="You are an AI assistant expert in responding to tasks.",
            output_mappings={"generation": "response"},
        )

        keep = KeepColumns(
            columns=["text", "persona", "model_name", "instruction", "response"],
            input_batch_size=input_batch_size,
        )

        data_loader >> text_to_persona >> text_gen >> response_gen >> keep

    distiset = pipeline.run(use_cache=False)
    result_queue.put(distiset)


def generate_dataset(pdf_files, progress=gr.Progress()):
    result_queue = multiprocessing.Queue()
    p = multiprocessing.Process(
        target=_run_pipeline,
        args=(result_queue, pdf_files),
    )

    try:
        p.start()
        # The pipeline reports no progress of its own, so advance a bar while
        # the worker process is alive (up to ~200 seconds).
        total_steps = 100
        for step in range(total_steps):
            if not p.is_alive():
                break
            progress(
                (step + 1) / total_steps,
                desc="Generating dataset. Don't close this window.",
            )
            time.sleep(2)  # Adjust this value based on your needs
        # Drain the result before joining: a large distiset would otherwise
        # block the worker's queue-feeder thread and deadlock the join.
        distiset = result_queue.get()
        p.join()
    except Exception as e:
        raise gr.Error(f"An error occurred during dataset generation: {str(e)}")

    df = distiset["default"]["train"].to_pandas()
    progress(1.0, desc="Dataset generation completed")
    return df


def gradio_interface(pdf_files):
    if HF_TOKEN is None:
        raise gr.Error(
            "HF_TOKEN environment variable is not set. Please set it and restart the application."
        )
    df = generate_dataset(pdf_files)
    return df


with gr.Blocks(title="MyPersonas Dataset Generator") as app:
    gr.Markdown("# MyPersonas Dataset Generator")
    gr.Markdown("Upload one or more PDFs to generate a persona-based SFT dataset.")

    with gr.Row():
        pdf_files = gr.File(label="Upload PDFs", file_count="multiple")

    with gr.Row():
        generate_button = gr.Button("Generate Dataset")

    output_dataframe = gr.DataFrame(
        label="Generated Dataset",
        interactive=False,
        wrap=True,
    )

    generate_button.click(
        fn=gradio_interface,
        inputs=[pdf_files],
        outputs=[output_dataframe],
    )

if __name__ == "__main__":
    app.launch()
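The PDF extraction step can be sanity-checked without launching the Gradio UI. A minimal sketch follows; `sample.pdf` is a hypothetical path, and a `SimpleNamespace` stands in for a Gradio upload, which exposes its on-disk path via a `.name` attribute. Note that importing `app` runs its module-level setup (the nltk download and Blocks construction), though not `app.launch()`.

# smoke_test.py -- hypothetical helper, not part of this commit.
from types import SimpleNamespace

from app import process_pdfs

if __name__ == "__main__":
    # Gradio file uploads expose the stored path via `.name`; a
    # SimpleNamespace mimics that interface. "sample.pdf" is a placeholder.
    fake_upload = SimpleNamespace(name="sample.pdf")
    rows = process_pdfs([fake_upload])
    print(rows[0]["text"][:200])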
personas.py
ADDED
@@ -0,0 +1,71 @@
from typing import Any, Dict, List, Union

from distilabel.steps.tasks.base import Task
from distilabel.steps.tasks.typing import ChatType


SYSTEM_PROMPT_TEXT_TO_PERSONA: str = (
    "You are an expert in analyzing text content and finding the general type of persona that could be associated with such a way of expressing. "
    "Please use one or two sentences for the definition, but try to make it as fine-grained as possible when the input text involves many detailed elements. "
    "The persona definition must go straight to the point and be assertive. The following are starts of persona definitions:\n"
    "A machine learning researcher...\n"
    "A pediatric nurse whose...\n"
    "An urban planner focused on..."
)

TEXT_TO_PERSONA_PROMPT: str = (
    "What is the likely profession, interest, or role of the person who would write or be interested in this text?\n\n"
    "## Text\n"
    "{text}"
)


class TextToPersona(Task):
    """Infer the general type of persona that could be associated with a text.

    The LLM is asked for a one- or two-sentence persona definition that goes
    straight to the point and is assertive, e.g. "A machine learning
    researcher..." or "A pediatric nurse whose...". See Figure 3 in the
    PersonaHub paper.
    """

    system_prompt: str = SYSTEM_PROMPT_TEXT_TO_PERSONA

    @property
    def inputs(self) -> List[str]:
        """The input for the task is the `text`."""
        return ["text"]

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """The input is formatted as a `ChatType`."""
        return [
            {"role": "system", "content": self.system_prompt},
            {
                "role": "user",
                "content": TEXT_TO_PERSONA_PROMPT.format(text=input["text"]),  # type: ignore
            },
        ]

    @property
    def outputs(self) -> List[str]:
        """The outputs for the task are the persona definition and the model name."""
        return ["persona", "model_name"]

    def format_output(
        self, output: Union[str, None], input: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Format the raw LLM output as the persona definition.

        Args:
            output: the raw output of the LLM.
            input: the input to the task, unused here.

        Returns:
            A dict with the persona definition.
        """
        return {"persona": output}
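The exact messages `TextToPersona` sends to the LLM can be inspected without running a pipeline, since the prompt constants are module-level. A minimal sketch; the sample text below is invented purely for illustration.

# Inspect the chat messages TextToPersona would build for one record.
from personas import SYSTEM_PROMPT_TEXT_TO_PERSONA, TEXT_TO_PERSONA_PROMPT

sample_text = "We evaluate retrieval-augmented generation on clinical notes..."
messages = [
    {"role": "system", "content": SYSTEM_PROMPT_TEXT_TO_PERSONA},
    {"role": "user", "content": TEXT_TO_PERSONA_PROMPT.format(text=sample_text)},
]
for m in messages:
    print(f"[{m['role']}]\n{m['content']}\n")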