Spaces:

xqt
/

Synthetic-Python-Programing-Data-Generator

Sleeping

App Files Files Community

xqt committed on Sep 21, 2024

Commit

76e2397

1 Parent(s): 9da93ad

UPD: added a function and code generation module

Browse files

Files changed (2) hide show

LlamaManager.py +109 -0
app.py +98 -9

LlamaManager.py CHANGED Viewed

@@ -219,6 +219,115 @@ class LlamaManager():
             print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
         return questions
 if __name__ == "__main__":

             print("LlamaManager::auto_generate_questions_from_shots::Generated questions from shots")
         return questions
+    def __postprocess_for_auto_generate_function_signature_from_question(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_function_signature_from_question::Postprocessing")
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")[0]
+        function_name = self.__get_items_between_tags(out, r"\[F\]", r"\[/F\]")[0]
+        input_parameters = self.__get_items_between_tags(out, r"\[I\]", r"\[/I\]")
+        return_type = self.__get_items_between_tags(out, r"\[R\]", r"\[/R\]")[0]
+        return function_name, input_parameters, return_type
+    def auto_generate_function_signature_from_question(
+        self,
+        question,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+    ):
+        if self.verbose:
+            print("LlamaManager::auto_generate_function_signature_from_question::Generating function signature from question")
+        message_content = [
+            {"role": "system", "content": """You are a synthetic data generator.
+                            You must answer the question between [A] and [/A] tags.
+                            The answer should include a function name, input parameters and return type.
+                            The function name should be between [F] and [/F] tags.
+                            Each input parameter should be between [I] and [/I] tags.
+                            The return type should be between [R] and [/R] tags.
+                            """},
+            {"role": "user", "content": f"""Write me a function signature, input parameters and return type for the following question:
+                            Write a program that takes two positive integers as input and computes the sum of their digits using a for loop."""},
+            {"role": "assistant", "content": f"[A][F]sum_of_digits[/F][I]num_1: int[/I][I]num_2: int[/I][R]int[/R][/A]"},
+            {"role": "user", "content": f"Write me a function signature, input parameters and return type for the following question: {question}"},
+            {"role": "assistant", "content": f"[A]"}
+        ]
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+        function_name, input_parameters, return_type = self.__postprocess_for_auto_generate_function_signature_from_question(out.choices[0].message.content)
+        if self.verbose:
+            print("LlamaManager::auto_generate_function_signature_from_question::Generated function signature from question")
+        return function_name, input_parameters, return_type
+    def __postprocess_for_auto_generate_answers_and_tests(self, out):
+        if self.verbose:
+            print("LlamaManager::__postprocess_for_auto_generate_answers_and_tests::Postprocessing")
+        out = self.__get_items_between_tags(out, r"\[A\]", r"\[/A\]")[0]
+        answer = self.__get_items_between_tags(out, r"\[F\]", r"\[/F\]")[0]
+        test_cases = self.__get_items_between_tags(out, r"\[T\]", r"\[/T\]")
+        return answer, test_cases
+    def auto_generate_answers_and_tests(
+        self,
+        question,
+        function_name,
+        input_parameters,
+        return_type,
+        seed = 123,
+        temperature = 1.0,
+        top_p = 0.9,
+        frequency_penalty = 0.0
+    ):
+        if self.verbose:
+            print("LlamaManager::auto_generate_answers_and_tests::Generating answers and test cases")
+        function_signature = f"{function_name}({', '.join(input_parameters)}) -> {return_type}"
+        message_content = [
+            {"role": "system", "content": """You are a synthetic data generator.
+                            Your must answer the question between [A] and [/A] tags.
+                            The answer should include a function implementation and test cases.
+                            The function implementation should be between [F] and [/F] tags.
+                            Each test cases should be between [T] and [/T] tags.
+                            Test cases must use assert statements.
+                            Do not comment on the code. No need to explain the solution.
+                            """},
+            {"role": "user", "content": f"""Write me a function implementation along with the test cases for the following question: {question},
+                            The function has the following signature: {function_signature}"""}
+        ]
+        out = self.client.chat_completion(
+            messages = message_content,
+            max_tokens = 1000,
+            stream = False,
+            seed = seed,
+            temperature = temperature,
+            top_p = top_p,
+            frequency_penalty = frequency_penalty
+        )
+        answer, test_cases = self.__postprocess_for_auto_generate_answers_and_tests(out.choices[0].message.content)
+        if self.verbose:
+            print("LlamaManager::auto_generate_answers_and_tests::Generated answers and test cases")
+        return answer, test_cases
 if __name__ == "__main__":

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio
 import LlamaManager
 import os
 import huggingface_hub
 HF_API = huggingface_hub.HfApi()
 LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
@@ -43,7 +45,7 @@ def generate_categories(categories_count, seed, temperature, top_p, frequency_pe
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.Dropdown(choices = categories, value = categories[0], label = "Select Category", interactive = True)
 def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
@@ -60,15 +62,15 @@ def generate_shots(category, shots_count, seed, temperature, top_p, frequency_pe
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = None)
 def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
     questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
-    questions = [[question] for question in questions]
     data = {
         "type": "generate_questions",
-        "questions": questions,
         "count": questions_count,
         "category": category,
         "shots": shots,
@@ -78,13 +80,57 @@ def generate_questions(questions_count, category, shots, seed, temperature, top_
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
-    return gradio.DataFrame(value = questions, type = "array", label = "Generated Shots", interactive = False, headers = None)
 with gradio.Blocks(fill_height=True) as base_app:
     gradio.Markdown("# Synthetic Python Programming Data Generation ⚙️")
     gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2) for future use.")
-    gradio.Markdown("# ❗️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1)")
     gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
     gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")
     gradio.Markdown("# Step 0: Use your own API Key/Passcode")
@@ -124,7 +170,7 @@ with gradio.Blocks(fill_height=True) as base_app:
             with gradio.Column():
                 __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
-    __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = None)
     gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
     with gradio.Row():
@@ -140,8 +186,41 @@ with gradio.Blocks(fill_height=True) as base_app:
             with gradio.Column():
                 __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
-    __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers = None)
     __passcode_authenticate.click(authenticate,
                                     inputs = [__secret_textbox],
@@ -159,8 +238,18 @@ with gradio.Blocks(fill_height=True) as base_app:
     __questions_generate.click(generate_questions,
                                 inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
-                                outputs = [__generated_questions]
                                 )
 if __name__ == "__main__":

 import LlamaManager
 import os
 import huggingface_hub
+import random
+import ast
 HF_API = huggingface_hub.HfApi()
 LLAMAMANAGER = LlamaManager.LlamaManager(os.environ.get("HF_KEY_2"), True)
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
+    return gradio.Dropdown(choices = categories, value = random.choice(categories), label = "Select Category", interactive = True)
 def generate_shots(category, shots_count, seed, temperature, top_p, frequency_penalty):
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
+    return gradio.DataFrame(value = shots, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])
 def generate_questions(questions_count, category, shots, seed, temperature, top_p, frequency_penalty):
     questions = LLAMAMANAGER.auto_generate_questions_from_shots(questions_count, category, shots, seed, temperature, top_p, frequency_penalty)
+    questions_for_dataframe = [[question] for question in questions]
     data = {
         "type": "generate_questions",
+        "questions": questions_for_dataframe,
         "count": questions_count,
         "category": category,
         "shots": shots,
         "frequency_penalty": frequency_penalty
     }
     store_generated_data(data)
+    return gradio.DataFrame(value = questions_for_dataframe, type = "array", label = "Generated Shots", interactive = False, headers = ["Questions"]), \
+        gradio.Dropdown(choices = questions, value = random.choice(questions), label = "Select a Question", interactive = True)
+def generate_function(question, temperature, top_p, frequency_penalty, seed):
+    function_name, function_parameters, function_return = LLAMAMANAGER.auto_generate_function_signature_from_question(
+        question, seed, temperature, top_p, frequency_penalty
+    )
+    data = {
+        "type": "generate_function",
+        "question": question,
+        "function_name": function_name,
+        "function_parameters": function_parameters,
+        "function_return": function_return,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+        "seed": seed
+    }
+    store_generated_data(data)
+    return function_name, function_parameters, function_return
+def generate_answers_and_tests(question, function_name, function_parameters, function_return, temperature, top_p, frequency_penalty, seed):
+    function_parameters = ast.literal_eval(function_parameters)
+    code, tests = LLAMAMANAGER.auto_generate_answers_and_tests(
+        question, function_name, function_parameters, function_return, seed, temperature, top_p, frequency_penalty
+    )
+    data = {
+        "type": "generate_answers_and_test",
+        "question": question,
+        "function_name": function_name,
+        "function_parameters": function_parameters,
+        "function_return": function_return,
+        "code": code,
+        "tests": tests,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+        "seed": seed
+    }
+    store_generated_data(data)
+    for test in tests:
+        code += f"\n{test}"
+    return  gradio.Markdown(f"\n```python\n{code}\n```", show_copy_button = True)
 with gradio.Blocks(fill_height=True) as base_app:
     gradio.Markdown("# Synthetic Python Programming Data Generation ⚙️")
     gradio.Markdown("# ❗️ Note: The data generated here by Llama3 and the settings used to generate it will be stored in the repository [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2) for future use.")
+    gradio.Markdown("# ❗️ Each successful interaction is saved [here](https://huggingface.co/datasets/xqt/SyntheticMBPP2/discussions/1).")
     gradio.Markdown("# ❗️ Feel free to use your own API key if the key here is rate limited. API Key is never stored in the repository.")
     gradio.Markdown("# ❗️ If you want to use a passcode, please text me.")
     gradio.Markdown("# Step 0: Use your own API Key/Passcode")
             with gradio.Column():
                 __shots_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __shots_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __generated_shots = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Shots", interactive = False, headers = ["Shots"])
     gradio.Markdown("# Step 3: Generate Python Programming Questions for the generated shots")
     with gradio.Row():
             with gradio.Column():
                 __questions_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
                 __questions_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __generated_questions = gradio.DataFrame(value = [], col_count = 1, type = "array", label = "Generated Questions", interactive = False, headers = ["Questions"])
+    gradio.Markdown("# Step 4: Generate a function name, input parameters, and return type for the generated questions")
+    with gradio.Row():
+        with gradio.Column(scale = 2):
+            __function_question_dropdown = gradio.Dropdown(choices = [], label = "Select a Question", interactive = True, scale = 2)
+        with gradio.Column():
+            __function_generate = gradio.Button("Generate Function", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __function_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __function_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __function_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __function_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    with gradio.Row():
+        with gradio.Column():
+            __function_name = gradio.Textbox(label = "Function Name", placeholder = "function name", interactive = False)
+        with gradio.Column():
+            __function_parameters = gradio.Textbox(label = "Input Parameters", placeholder = "input parameters", interactive = False)
+        with gradio.Column():
+            __function_return = gradio.Textbox(label = "Return Type", placeholder = "return type", interactive = False)
+    gradio.Markdown("# 🚀 Step 5: Generate a code.")
+    __code_generate = gradio.Button("Generate Code", scale = 2)
+    with gradio.Accordion("Advanced Settings", open = False):
+        with gradio.Row():
+            with gradio.Column():
+                __code_temperature = gradio.Slider(minimum = 0.1, maximum = 2.0, step = 0.01, value = 1.0, label = "Temperature", interactive = True)
+                __code_top_p = gradio.Slider(minimum = 0.1, maximum = 0.99, step = 0.01, value = 0.9, label = "Top P", interactive = True)
+            with gradio.Column():
+                __code_frequency_penalty = gradio.Slider(minimum = -2.0, maximum = 2.0, step = 0.01, value = 0.0, label = "Frequency Penalty", interactive = True)
+                __code_seed = gradio.Slider(minimum = 0, maximum = 1000, step = 1, value = 123, label = "Seed", interactive = True)
+    __code = gradio.Markdown("🚀 Code will be generated here...", show_copy_button = True)
     __passcode_authenticate.click(authenticate,
                                     inputs = [__secret_textbox],
     __questions_generate.click(generate_questions,
                                 inputs = [__questions_count, __shots_category, __generated_shots, __questions_seed, __questions_temperature, __questions_top_p, __questions_frequency_penalty],
+                                outputs = [__generated_questions, __function_question_dropdown]
                                 )
+    __function_generate.click(generate_function,
+                                inputs = [__function_question_dropdown, __function_temperature, __function_top_p, __function_frequency_penalty, __function_seed],
+                                outputs = [__function_name, __function_parameters, __function_return]
+                                )
+    __code_generate.click(generate_answers_and_tests,
+                            inputs = [__function_question_dropdown, __function_name, __function_parameters, __function_return, __code_temperature, __code_top_p, __code_frequency_penalty, __code_seed],
+                            outputs = [__code]
+                            )
 if __name__ == "__main__":