UPD: added a function and code generation module
- LlamaManager.py +109 -0
- app.py +98 -9
LlamaManager.py
CHANGED
@@ -219,6 +219,115 @@ class LlamaManager():
         print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")

         return questions
+
+
+    def __postprocess_for_auto_generate_function_signature_from_question(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_function_signature_from_question::Postprocessing")
+
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")[0]
+        function_name = self.__get_items_between_tags(out, r"\[F\]", r"\[/F\]")[0]
+        input_parameters = self.__get_items_between_tags(out, r"\[I\]", r"\[/I\]")
+        return_type = self.__get_items_between_tags(out, r"\[R\]", r"\[/R\]")[0]
+        return function_name, input_parameters, return_type
+
+    def auto_generate_function_signature_from_question(
+        self,
+        question,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+    ):
+        if self.verbose:
+            print("LlamaManager::auto_generate_function_signature_from_question::Generating function signature from question")
+
+        message_content = [
+            {"role": "system", "content": """You are a synthetic data generator.
+You must answer the question between [A] and [/A] tags.
+The answer should include a function name, input parameters and return type.
+The function name should be between [F] and [/F] tags.
+Each input parameter should be between [I] and [/I] tags.
+The return type should be between [R] and [/R] tags.
+"""},
+            {"role": "user", "content": """Write me a function signature, input parameters and return type for the following question:
+Write a program that takes two positive integers as input and computes the sum of their digits using a for loop."""},
+            {"role": "assistant", "content": "[A][F]sum_of_digits[/F][I]num_1: int[/I][I]num_2: int[/I][R]int[/R][/A]"},
+            {"role": "user", "content": f"Write me a function signature, input parameters and return type for the following question: {question}"},
+            {"role": "assistant", "content": "[A]"}
+        ]
+
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+
+        function_name, input_parameters, return_type = self.__postprocess_for_auto_generate_function_signature_from_question(out.choices[0].message.content)
+        if self.verbose:
+            print("LlamaManager::auto_generate_function_signature_from_question::Generated function signature from question")
+
+        return function_name, input_parameters, return_type
+
+
+    def __postprocess_for_auto_generate_answers_and_tests(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_answers_and_tests::Postprocessing")
+
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")[0]
+        answer = self.__get_items_between_tags(out, r"\[F\]", r"\[/F\]")[0]
+        test_cases = self.__get_items_between_tags(out, r"\[T\]", r"\[/T\]")
+        return answer, test_cases
+
+
+    def auto_generate_answers_and_tests(
+        self,
+        question,
+        function_name,
+        input_parameters,
+        return_type,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+    ):
+        if self.verbose:
+            print("LlamaManager::auto_generate_answers_and_tests::Generating answers and test cases")
+
+        function_signature = f"{function_name}({', '.join(input_parameters)}) -> {return_type}"
+
+        message_content = [
+            {"role": "system", "content": """You are a synthetic data generator.
+You must answer the question between [A] and [/A] tags.
+The answer should include a function implementation and test cases.
+The function implementation should be between [F] and [/F] tags.
+Each test case should be between [T] and [/T] tags.
+Test cases must use assert statements.
+Do not comment on the code. No need to explain the solution.
+"""},
+            {"role": "user", "content": f"""Write me a function implementation along with the test cases for the following question: {question}
+The function has the following signature: {function_signature}"""}
+        ]
+
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+
+        answer, test_cases = self.__postprocess_for_auto_generate_answers_and_tests(out.choices[0].message.content)
+        if self.verbose:
+            print("LlamaManager::auto_generate_answers_and_tests::Generated answers and test cases")
+
+        return answer, test_cases


 if __name__ == "__main__":
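
A minimal end-to-end sketch of how the two new methods chain together (the question text is hypothetical; the `LlamaManager` is constructed the same way `app.py` constructs it):

```python
import os
import LlamaManager

# Same construction as in app.py: HF token plus verbose flag.
manager = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)

# A question like those produced by auto_generate_questions_from_shots (hypothetical).
question = "Write a function that returns the factorial of a non-negative integer."

# Step 1: question -> (name, ["param: type", ...], return type), parsed from [F]/[I]/[R] tags.
name, params, ret = manager.auto_generate_function_signature_from_question(question)

# Step 2: (question, signature) -> implementation plus assert-based tests from [F]/[T] tags.
code, tests = manager.auto_generate_answers_and_tests(question, name, params, ret)
print(code)
for test in tests:
    print(test)
```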
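Both postprocess helpers lean on the pre-existing private helper `__get_items_between_tags`, which is outside this diff. A regex-based sketch of the assumed behavior (an assumption for illustration, not necessarily the repository's implementation):

```python
import re

def get_items_between_tags(text, start_tag, end_tag):
    # start_tag/end_tag arrive as pre-escaped regex patterns, e.g. r"\[F\]" and r"\[/F\]".
    # DOTALL lets a multi-line [F]...[/F] implementation match; findall returns
    # every enclosed substring in order.
    return re.findall(f"{start_tag}(.*?){end_tag}", text, re.DOTALL)

out = "[A][F]sum_of_digits[/F][I]num_1: int[/I][I]num_2: int[/I][R]int[/R][/A]"
print(get_items_between_tags(out, r"\[I\]", r"\[/I\]"))  # ['num_1: int', 'num_2: int']
print(get_items_between_tags(out, r"\[R\]", r"\[/R\]"))  # ['int']
```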
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio
 import LlamaManager
 import os
 import huggingface_hub
+import random
+import ast

 HF_API = huggingface_hub.HfApi()
 LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
@@ -43,7 +45,7 @@ def generate_categories(categories_count, seed, temperature, top_p, frequency_pe
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.Dropdown(choices = categories, value = categories
+    return gradio.Dropdown(choices = categories, value = random.choice(categories), label = "Select Category", interactive = True)


 def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
@@ -60,15 +62,15 @@ def generate_shots(category, shots_count, seed, temperature, top_p, frequency_pe
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers =
+    return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])


 def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
     questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
-
+    questions_for_dataframe = [[question] for question in questions]
     data = {
         "type": "generate_questions",
-        "questions":
+        "questions": questions_for_dataframe,
         "count": questions_count,
         "category": category,
         "shots": shots,
@@ -78,13 +80,57 @@ def generate_questions(questions_count, category, shots, seed, temperature, top_
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.DataFrame(value =
+    return gradio.DataFrame(value = questions_for_dataframe, type = "array", label = "Generated Questions", interactive = False, headers = ["Questions"]), \
+        gradio.Dropdown(choices = questions, value = random.choice(questions), label = "Select a Question", interactive = True)
+
+
+def generate_function(question, temperature, top_p, frequency_penalty, seed):
+    function_name, function_parameters, function_return = LLAMAMANAGER.auto_generate_function_signature_from_question(
+        question, seed, temperature, top_p, frequency_penalty
+    )
+    data = {
+        "type": "generate_function",
+        "question": question,
+        "function_name": function_name,
+        "function_parameters": function_parameters,
+        "function_return": function_return,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+        "seed": seed
+    }
+    store_generated_data(data)
+    return function_name, function_parameters, function_return
+
+
+def generate_answers_and_tests(question, function_name, function_parameters, function_return, temperature, top_p, frequency_penalty, seed):
+    function_parameters = ast.literal_eval(function_parameters)
+    code, tests = LLAMAMANAGER.auto_generate_answers_and_tests(
+        question, function_name, function_parameters, function_return, seed, temperature, top_p, frequency_penalty
+    )
+    data = {
+        "type": "generate_answers_and_test",
+        "question": question,
+        "function_name": function_name,
+        "function_parameters": function_parameters,
+        "function_return": function_return,
+        "code": code,
+        "tests": tests,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+        "seed": seed
+    }
+    store_generated_data(data)
+    for test in tests:
+        code += f"\n{test}"
+    return gradio.Markdown(f"\n```python\n{code}\n```", show_copy_button = True)


 with gradio.Blocks(fill_height=True) as base_app:
     gradio.Markdown("# Synthetic Python Programming Data Generation ✏️")
     gradio.Markdown("# ✏️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2) for future use.")
-    gradio.Markdown("# ✏️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1)")
+    gradio.Markdown("# ✏️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1).")
     gradio.Markdown("# ✏️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
     gradio.Markdown("# ✏️ If you want to use a passcode, please text me.")
     gradio.Markdown("# Step 0: Use your own API Key/Passcode")
@@ -124,7 +170,7 @@ with gradio.Blocks(fill_height=True) as base_app:
             with gradio.Column():
                 __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
-    __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers =
+    __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])

     gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
     with gradio.Row():
@@ -140,8 +186,41 @@ with gradio.Blocks(fill_height=True) as base_app:
             with gradio.Column():
                 __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
-    __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers =
+    __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers = ["Questions"])

+    gradio.Markdown("# Step 4: Generate a function name, input parameters, and return type for the generated questions")
+    with gradio.Row():
+        with gradio.Column(scale = 2):
+            __function_question_dropdown = gradio.Dropdown(choices = [], label = "Select a Question", interactive = True, scale = 2)
+        with gradio.Column():
+            __function_generate = gradio.Button("Generate Function", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __function_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __function_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __function_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __function_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    with gradio.Row():
+        with gradio.Column():
+            __function_name = gradio.Textbox(label = "Function Name", placeholder = "function name", interactive = False)
+        with gradio.Column():
+            __function_parameters = gradio.Textbox(label = "Input Parameters", placeholder = "input parameters", interactive = False)
+        with gradio.Column():
+            __function_return = gradio.Textbox(label = "Return Type", placeholder = "return type", interactive = False)
+
+    gradio.Markdown("# Step 5: Generate code.")
+    __code_generate = gradio.Button("Generate Code", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __code_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __code_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __code_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __code_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __code = gradio.Markdown("Code will be generated here...", show_copy_button = True)

     __passcode_authenticate.click(authenticate,
         inputs = [__secret_textbox],
@@ -159,8 +238,18 @@ with gradio.Blocks(fill_height=True) as base_app:

     __questions_generate.click(generate_questions,
         inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
-        outputs = [__generated_questions]
+        outputs = [__generated_questions, __function_question_dropdown]
     )
+
+    __function_generate.click(generate_function,
+        inputs = [__function_question_dropdown, __function_temperature, __function_top_p, __function_frequency_penalty, __function_seed],
+        outputs = [__function_name, __function_parameters, __function_return]
+    )
+
+    __code_generate.click(generate_answers_and_tests,
+        inputs = [__function_question_dropdown, __function_name, __function_parameters, __function_return, __code_temperature, __code_top_p, __code_frequency_penalty, __code_seed],
+        outputs = [__code]
+    )


 if __name__ == "__main__":
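
One subtlety in the new wiring: `__function_parameters` is a Textbox, so the list returned by `generate_function` is displayed (and later read back) as its string repr, and `generate_answers_and_tests` calls `ast.literal_eval` to recover the list before building the signature. A self-contained illustration of that round-trip (standalone sketch with hypothetical values, not app code):

```python
import ast

# What generate_function returns for the parameters field (a Python list).
function_parameters = ["num_1: int", "num_2: int"]

# The Textbox renders the list as its string repr.
as_displayed = str(function_parameters)  # "['num_1: int', 'num_2: int']"

# generate_answers_and_tests parses the string back into a list
# before joining it into the signature passed to the model.
recovered = ast.literal_eval(as_displayed)
function_signature = f"sum_of_digits({', '.join(recovered)}) -> int"
print(function_signature)  # sum_of_digits(num_1: int, num_2: int) -> int
```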