xqt committed on
Commit
76e2397
·
1 Parent(s): 9da93ad

UPD: added a function and code generation module

Browse files
Files changed (2) hide show
  1. LlamaManager.py +109 -0
  2. app.py +98 -9
LlamaManager.py CHANGED
@@ -219,6 +219,115 @@ class LlamaManager():
219
  print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
220
 
221
  return questions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
 
224
  if __name__ == "__main__":
 
219
  print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
220
 
221
  return questions
222
+
223
+
224
def __postprocess_for_auto_generate_function_signature_from_question(self, out):
    """Pull the function name, parameters and return type out of a tagged reply.

    The text between the first ``[A]``/``[/A]`` pair is isolated first; inside
    it, the first ``[F]`` item is the function name, every ``[I]`` item is an
    input parameter, and the first ``[R]`` item is the return type.

    Args:
        out: Raw text of the model reply.

    Returns:
        A ``(function_name, input_parameters, return_type)`` tuple, where
        ``input_parameters`` is whatever the tag helper returns for ``[I]``.
    """
    if self.verbose:
        print("LlamaManager::__postprocess_for_auto_generate_function_signature_from_question::Postprocessing")

    # NOTE(review): the helper is defined outside this view; the [0] lookups
    # assume it returns an indexable, non-empty result when the tag is present.
    extract = self.__get_items_between_tags
    answer_body = extract(out, r"\[A\]", r"\[/A\]")[0]
    name = extract(answer_body, r"\[F\]", r"\[/F\]")[0]
    parameters = extract(answer_body, r"\[I\]", r"\[/I\]")
    returned = extract(answer_body, r"\[R\]", r"\[/R\]")[0]
    return name, parameters, returned
233
+
234
def auto_generate_function_signature_from_question(
    self,
    question,
    seed = 123,
    temperature = 1.0,
    top_p = 0.9,
    frequency_penalty = 0.0
):
    """Ask the chat model for a function signature that fits ``question``.

    A one-shot conversation (one worked digit-sum example) is sent through
    ``self.client.chat_completion`` with ``max_tokens = 1000`` and streaming
    disabled. The content of the first returned choice is then parsed by the
    matching postprocess method.

    Args:
        question: Programming question to derive a signature for.
        seed: Forwarded to ``chat_completion``.
        temperature: Forwarded to ``chat_completion``.
        top_p: Forwarded to ``chat_completion``.
        frequency_penalty: Forwarded to ``chat_completion``.

    Returns:
        The ``(function_name, input_parameters, return_type)`` tuple produced
        by the postprocess step.
    """
    if self.verbose:
        print("LlamaManager::auto_generate_function_signature_from_question::Generating function signature from question")

    # NOTE(review): leading whitespace inside the original multi-line prompt
    # literals is not recoverable from this view; none is assumed here.
    system_prompt = (
        "You are a synthetic data generator.\n"
        "You must answer the question between [A] and [/A] tags.\n"
        "The answer should include a function name, input parameters and return type.\n"
        "The function name should be between [F] and [/F] tags.\n"
        "Each input parameter should be between [I] and [/I] tags.\n"
        "The return type should be between [R] and [/R] tags.\n"
    )
    example_request = (
        "Write me a function signature, input parameters and return type for the following question:\n"
        "Write a program that takes two positive integers as input and computes the sum of their digits using a for loop."
    )
    example_reply = "[A][F]sum_of_digits[/F][I]num_1: int[/I][I]num_2: int[/I][R]int[/R][/A]"
    actual_request = f"Write me a function signature, input parameters and return type for the following question: {question}"

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example_request},
        {"role": "assistant", "content": example_reply},
        {"role": "user", "content": actual_request},
        # The conversation ends on an assistant turn holding only the opening tag.
        {"role": "assistant", "content": "[A]"},
    ]

    response = self.client.chat_completion(
        messages = conversation,
        max_tokens = 1000,
        stream = False,
        seed = seed,
        temperature = temperature,
        top_p = top_p,
        frequency_penalty = frequency_penalty
    )
    reply_text = response.choices[0].message.content

    function_name, input_parameters, return_type = self.__postprocess_for_auto_generate_function_signature_from_question(reply_text)
    if self.verbose:
        print("LlamaManager::auto_generate_function_signature_from_question::Generated function signature from question")

    return function_name, input_parameters, return_type
275
+
276
+
277
def __postprocess_for_auto_generate_answers_and_tests(self, out):
    """Pull the implementation and its test cases out of a tagged reply.

    The text between the first ``[A]``/``[/A]`` pair is isolated first; inside
    it, the first ``[F]`` item is the implementation and every ``[T]`` item is
    a test case.

    Args:
        out: Raw text of the model reply.

    Returns:
        An ``(answer, test_cases)`` tuple, where ``test_cases`` is whatever
        the tag helper returns for ``[T]``.
    """
    if self.verbose:
        print("LlamaManager::__postprocess_for_auto_generate_answers_and_tests::Postprocessing")

    # NOTE(review): the helper is defined outside this view; the [0] lookups
    # assume it returns an indexable, non-empty result when the tag is present.
    extract = self.__get_items_between_tags
    answer_body = extract(out, r"\[A\]", r"\[/A\]")[0]
    implementation = extract(answer_body, r"\[F\]", r"\[/F\]")[0]
    tests = extract(answer_body, r"\[T\]", r"\[/T\]")
    return implementation, tests
285
+
286
+
287
def auto_generate_answers_and_tests(
    self,
    question,
    function_name,
    input_parameters,
    return_type,
    seed = 123,
    temperature = 1.0,
    top_p = 0.9,
    frequency_penalty = 0.0
):
    """Ask the chat model for an implementation plus assert-based tests.

    The signature text ``name(p1, p2, ...) -> return_type`` is built from the
    arguments and embedded in the user prompt together with ``question``. The
    request goes through ``self.client.chat_completion`` with
    ``max_tokens = 1000`` and streaming disabled; the content of the first
    returned choice is parsed by the matching postprocess method.

    Args:
        question: Programming question to solve.
        function_name: Name used in the signature text.
        input_parameters: Iterable of parameter strings, joined with ``", "``.
        return_type: Return annotation used in the signature text.
        seed: Forwarded to ``chat_completion``.
        temperature: Forwarded to ``chat_completion``.
        top_p: Forwarded to ``chat_completion``.
        frequency_penalty: Forwarded to ``chat_completion``.

    Returns:
        The ``(answer, test_cases)`` tuple produced by the postprocess step.
    """
    if self.verbose:
        print("LlamaManager::auto_generate_answers_and_tests::Generating answers and test cases")

    joined_parameters = ", ".join(input_parameters)
    function_signature = f"{function_name}({joined_parameters}) -> {return_type}"

    # NOTE(review): prompt wording ("Your must", "Each test cases") is kept
    # verbatim because it is runtime text. Leading whitespace inside the
    # original multi-line literals is not recoverable from this view; none is
    # assumed here.
    system_prompt = (
        "You are a synthetic data generator.\n"
        "Your must answer the question between [A] and [/A] tags.\n"
        "The answer should include a function implementation and test cases.\n"
        "The function implementation should be between [F] and [/F] tags.\n"
        "Each test cases should be between [T] and [/T] tags.\n"
        "Test cases must use assert statements.\n"
        "Do not comment on the code. No need to explain the solution.\n"
    )
    user_prompt = (
        f"Write me a function implementation along with the test cases for the following question: {question},\n"
        f"The function has the following signature: {function_signature}"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = self.client.chat_completion(
        messages = conversation,
        max_tokens = 1000,
        stream = False,
        seed = seed,
        temperature = temperature,
        top_p = top_p,
        frequency_penalty = frequency_penalty
    )
    reply_text = response.choices[0].message.content

    answer, test_cases = self.__postprocess_for_auto_generate_answers_and_tests(reply_text)
    if self.verbose:
        print("LlamaManager::auto_generate_answers_and_tests::Generated answers and test cases")

    return answer, test_cases
331
 
332
 
333
  if __name__ == "__main__":
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio
2
  import LlamaManager
3
  import os
4
  import huggingface_hub
 
 
5
 
6
  HF_API = huggingface_hub.HfApi()
7
  LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
@@ -43,7 +45,7 @@ def generate_categories(categories_count, seed, temperature, top_p, frequency_pe
43
  "frequency_penalty": frequency_penalty
44
  }
45
  store_generated_data(data)
46
- return gradio.Dropdown(choices = categories, value = categories[0], label = "Select Category", interactive = True)
47
 
48
 
49
  def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
@@ -60,15 +62,15 @@ def generate_shots(category, shots_count, seed, temperature, top_p, frequency_pe
60
  "frequency_penalty": frequency_penalty
61
  }
62
  store_generated_data(data)
63
- return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = None)
64
 
65
 
66
  def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
67
  questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
68
- questions = [[question] for question in questions]
69
  data = {
70
  "type": "generate_questions",
71
- "questions": questions,
72
  "count": questions_count,
73
  "category": category,
74
  "shots": shots,
@@ -78,13 +80,57 @@ def generate_questions(questions_count, category, shots, seed, temperature, top_
78
  "frequency_penalty": frequency_penalty
79
  }
80
  store_generated_data(data)
81
- return gradio.DataFrame(value = questions, type = "array", label = "Generated Shots", interactive = False, headers = None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
  with gradio.Blocks(fill_height=True) as base_app:
85
  gradio.Markdown("# Synthetic Python Programming Data Generation βš™οΈ")
86
  gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2) for future use.")
87
- gradio.Markdown("# ❗️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1)")
88
  gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
89
  gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")
90
  gradio.Markdown("# Step 0: Use your own API Key/Passcode")
@@ -124,7 +170,7 @@ with gradio.Blocks(fill_height=True) as base_app:
124
  with gradio.Column():
125
  __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
126
  __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
127
- __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = None)
128
 
129
  gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
130
  with gradio.Row():
@@ -140,8 +186,41 @@ with gradio.Blocks(fill_height=True) as base_app:
140
  with gradio.Column():
141
  __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
142
  __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
143
- __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers = None)
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  __passcode_authenticate.click(authenticate,
147
  inputs = [__secret_textbox],
@@ -159,8 +238,18 @@ with gradio.Blocks(fill_height=True) as base_app:
159
 
160
  __questions_generate.click(generate_questions,
161
  inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
162
- outputs = [__generated_questions]
163
  )
 
 
 
 
 
 
 
 
 
 
164
 
165
 
166
  if __name__ == "__main__":
 
2
  import LlamaManager
3
  import os
4
  import huggingface_hub
5
+ import random
6
+ import ast
7
 
8
  HF_API = huggingface_hub.HfApi()
9
  LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
 
45
  "frequency_penalty": frequency_penalty
46
  }
47
  store_generated_data(data)
48
+ return gradio.Dropdown(choices = categories, value = random.choice(categories), label = "Select Category", interactive = True)
49
 
50
 
51
  def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
 
62
  "frequency_penalty": frequency_penalty
63
  }
64
  store_generated_data(data)
65
+ return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])
66
 
67
 
68
  def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
69
  questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
70
+ questions_for_dataframe = [[question] for question in questions]
71
  data = {
72
  "type": "generate_questions",
73
+ "questions": questions_for_dataframe,
74
  "count": questions_count,
75
  "category": category,
76
  "shots": shots,
 
80
  "frequency_penalty": frequency_penalty
81
  }
82
  store_generated_data(data)
83
+ return gradio.DataFrame(value = questions_for_dataframe, type = "array", label = "Generated Shots", interactive = False, headers = ["Questions"]), \
84
+ gradio.Dropdown(choices = questions, value = random.choice(questions), label = "Select a Question", interactive = True)
85
+
86
+
87
def generate_function(question, temperature, top_p, frequency_penalty, seed):
    """Generate a function signature for ``question`` and record the result.

    Delegates to ``LLAMAMANAGER.auto_generate_function_signature_from_question``
    (note the argument order there: ``seed`` comes before the sampling
    settings), passes the result and the settings used to
    ``store_generated_data``, and returns the three signature parts.

    Args:
        question: Question to generate a signature for.
        temperature: Sampling temperature forwarded to the manager.
        top_p: Nucleus sampling value forwarded to the manager.
        frequency_penalty: Frequency penalty forwarded to the manager.
        seed: Seed forwarded to the manager.

    Returns:
        ``(function_name, function_parameters, function_return)``.
    """
    signature_parts = LLAMAMANAGER.auto_generate_function_signature_from_question(
        question, seed, temperature, top_p, frequency_penalty
    )
    function_name, function_parameters, function_return = signature_parts

    record = {
        "type": "generate_function",
        "question": question,
        "function_name": function_name,
        "function_parameters": function_parameters,
        "function_return": function_return,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "seed": seed
    }
    store_generated_data(record)

    return function_name, function_parameters, function_return
104
+
105
+
106
def generate_answers_and_tests(question, function_name, function_parameters, function_return, temperature, top_p, frequency_penalty, seed):
    """Generate an implementation with tests, record it, and render it as Markdown.

    Args:
        question: Question to solve.
        function_name: Name of the function to implement.
        function_parameters: Parameter list, either as the text of a Python
            list literal (this is wired to the "Input Parameters" Textbox) or
            as an already-parsed list or tuple.
        function_return: Return type of the function.
        temperature: Sampling temperature forwarded to the manager.
        top_p: Nucleus sampling value forwarded to the manager.
        frequency_penalty: Frequency penalty forwarded to the manager.
        seed: Seed forwarded to the manager.

    Returns:
        A ``gradio.Markdown`` holding a fenced Python block: the generated
        code followed by each test case on its own line.

    Raises:
        ValueError, SyntaxError: From ``ast.literal_eval`` when the parameter
            text is not a valid Python literal (for example an empty textbox).
    """
    # Only text needs parsing. literal_eval rejects an already-parsed list
    # with ValueError, so such values are passed through untouched.
    if isinstance(function_parameters, str):
        function_parameters = ast.literal_eval(function_parameters)

    code, tests = LLAMAMANAGER.auto_generate_answers_and_tests(
        question, function_name, function_parameters, function_return, seed, temperature, top_p, frequency_penalty
    )
    data = {
        "type": "generate_answers_and_test",
        "question": question,
        "function_name": function_name,
        "function_parameters": function_parameters,
        "function_return": function_return,
        "code": code,
        "tests": tests,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "seed": seed
    }
    store_generated_data(data)

    # The stored record keeps code and tests separate; only the rendered
    # listing has the tests appended, built in one pass.
    listing = code + "".join(f"\n{test}" for test in tests)
    return gradio.Markdown(f"\n```python\n{listing}\n```", show_copy_button = True)
128
 
129
 
130
  with gradio.Blocks(fill_height=True) as base_app:
131
  gradio.Markdown("# Synthetic Python Programming Data Generation βš™οΈ")
132
  gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2) for future use.")
133
+ gradio.Markdown("# ❗️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1).")
134
  gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
135
  gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")
136
  gradio.Markdown("# Step 0: Use your own API Key/Passcode")
 
170
  with gradio.Column():
171
  __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
172
  __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
173
+ __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])
174
 
175
  gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
176
  with gradio.Row():
 
186
  with gradio.Column():
187
  __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
188
  __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
189
+ __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers = ["Questions"])
190
 
191
+ gradio.Markdown("# Step 4: Generate a function name, input parameters, and return type for the generated questions")
192
+ with gradio.Row():
193
+ with gradio.Column(scale = 2):
194
+ __function_question_dropdown = gradio.Dropdown(choices = [], label = "Select a Question", interactive = True, scale = 2)
195
+ with gradio.Column():
196
+ __function_generate = gradio.Button("Generate Function", scale = 2)
197
+ with gradio.Accordion("Advanced Settings", open = False):
198
+ with gradio.Row():
199
+ with gradio.Column():
200
+ __function_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
201
+ __function_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
202
+ with gradio.Column():
203
+ __function_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
204
+ __function_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
205
+ with gradio.Row():
206
+ with gradio.Column():
207
+ __function_name = gradio.Textbox(label = "Function Name", placeholder = "function name", interactive = False)
208
+ with gradio.Column():
209
+ __function_parameters = gradio.Textbox(label = "Input Parameters", placeholder = "input parameters", interactive = False)
210
+ with gradio.Column():
211
+ __function_return = gradio.Textbox(label = "Return Type", placeholder = "return type", interactive = False)
212
+
213
+ gradio.Markdown("# πŸš€ Step 5: Generate a code.")
214
+ __code_generate = gradio.Button("Generate Code", scale = 2)
215
+ with gradio.Accordion("Advanced Settings", open = False):
216
+ with gradio.Row():
217
+ with gradio.Column():
218
+ __code_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
219
+ __code_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
220
+ with gradio.Column():
221
+ __code_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
222
+ __code_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
223
+ __code = gradio.Markdown("πŸš€ Code will be generated here...", show_copy_button = True)
224
 
225
  __passcode_authenticate.click(authenticate,
226
  inputs = [__secret_textbox],
 
238
 
239
  __questions_generate.click(generate_questions,
240
  inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
241
+ outputs = [__generated_questions, __function_question_dropdown]
242
  )
243
+
244
+ __function_generate.click(generate_function,
245
+ inputs = [__function_question_dropdown, __function_temperature, __function_top_p, __function_frequency_penalty, __function_seed],
246
+ outputs = [__function_name, __function_parameters, __function_return]
247
+ )
248
+
249
+ __code_generate.click(generate_answers_and_tests,
250
+ inputs = [__function_question_dropdown, __function_name, __function_parameters, __function_return, __code_temperature, __code_top_p, __code_frequency_penalty, __code_seed],
251
+ outputs = [__code]
252
+ )
253
 
254
 
255
  if __name__ == "__main__":