Spaces:

mostlyai
/

synthetic-sdk-demo

Running

App Files Community

ZennyKenny committed 9 days ago

Commit

6f7ec0d

verified ·

1 Parent(s): d71f074

Update app.py

Browse files

Files changed (1) hide show

app.py +115 -10

app.py CHANGED Viewed

@@ -42,6 +42,15 @@ class SyntheticDataGenerator:
         max_training_time: int = 60,
         batch_size: int = 32,
         value_protection: bool = True,
     ) -> Tuple[bool, str]:
         if not self.mostly:
             return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
@@ -57,6 +66,15 @@ class SyntheticDataGenerator:
                             "max_training_time": max_training_time,
                             "value_protection": value_protection,
                             "batch_size": batch_size,
                         },
                     }
                 ]
@@ -110,11 +128,34 @@ def train_model(
     max_training_time: int,
     batch_size: int,
     value_protection: bool,
 ) -> str:
     if data is None or data.empty:
         return "Error: No data provided. Please upload or create sample data first."
     ok, msg = generator.train_generator(
-        data, model_name, epochs, max_training_time, batch_size, value_protection
     )
     return ("Success: " if ok else "Error: ") + msg
@@ -209,23 +250,81 @@ def create_interface():
             memory_info = gr.Markdown(label="Memory Usage Info", visible=False)
             with gr.Row():
-                with gr.Column():
                     model_name = gr.Textbox(
-                        value="My Synthetic Model", label="Model Name", placeholder="Enter a name for your model"
                     )
-                    epochs = gr.Slider(1, 200, value=100, step=1, label="Training Epochs")
-                    max_training_time = gr.Slider(1, 1000, value=60, step=1, label="Maximum Training Time")
-                    batch_size = gr.Slider(8, 1024, value=32, step=8, label="Training Batch Size")
-                    value_protection = gr.Checkbox(label="Value Protection", info="Enable Value Protection")
                     train_btn = gr.Button("Train Model", variant="primary")
-                with gr.Column():
                     train_status = gr.Textbox(label="Training Status", interactive=False)
         with gr.Tab("Generate Data"):
             gr.Markdown("### Generate synthetic data from your trained model")
             with gr.Row():
                 with gr.Column():
-                    gen_size = gr.Slider(10, 1000, value=100, step=10, label="Number of Records to Generate")
                     generate_btn = gr.Button("Generate Synthetic Data", variant="primary")
                 with gr.Column():
                     gen_status = gr.Textbox(label="Generation Status", interactive=False)
@@ -243,7 +342,13 @@ def create_interface():
         train_btn.click(
             train_model,
-            inputs=[uploaded_data, model_name, epochs, max_training_time, batch_size, value_protection],
             outputs=[train_status],
         )

         max_training_time: int = 60,
         batch_size: int = 32,
         value_protection: bool = True,
+        rare_category_protection: bool = True,
+        flexible_generation: bool = True,
+        model_size: str = "MEDIUM",
+        target_accuracy: float = 0.95,
+        validation_split: float = 0.2,
+        learning_rate: float = 0.001,
+        early_stopping_patience: int = 10,
+        dropout_rate: float = 0.1,
+        weight_decay: float = 0.0001,
     ) -> Tuple[bool, str]:
         if not self.mostly:
             return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
                             "max_training_time": max_training_time,
                             "value_protection": value_protection,
                             "batch_size": batch_size,
+                            "rare_category_protection": rare_category_protection,
+                            "flexible_generation": flexible_generation,
+                            "model_size": model_size,            # "SMALL" | "MEDIUM" | "LARGE"
+                            "target_accuracy": target_accuracy,  # early stop once target met
+                            "validation_split": validation_split,
+                            "learning_rate": learning_rate,
+                            "early_stopping_patience": early_stopping_patience,
+                            "dropout_rate": dropout_rate,
+                            "weight_decay": weight_decay,
                         },
                     }
                 ]
     max_training_time: int,
     batch_size: int,
     value_protection: bool,
+    rare_category_protection: bool,
+    flexible_generation: bool,
+    model_size: str,
+    target_accuracy: float,
+    validation_split: float,
+    learning_rate: float,
+    early_stopping_patience: int,
+    dropout_rate: float,
+    weight_decay: float,
 ) -> str:
     if data is None or data.empty:
         return "Error: No data provided. Please upload or create sample data first."
     ok, msg = generator.train_generator(
+        data=data,
+        name=model_name,
+        epochs=epochs,
+        max_training_time=max_training_time,
+        batch_size=batch_size,
+        value_protection=value_protection,
+        rare_category_protection=rare_category_protection,
+        flexible_generation=flexible_generation,
+        model_size=model_size,
+        target_accuracy=target_accuracy,
+        validation_split=validation_split,
+        learning_rate=learning_rate,
+        early_stopping_patience=early_stopping_patience,
+        dropout_rate=dropout_rate,
+        weight_decay=weight_decay,
     )
     return ("Success: " if ok else "Error: ") + msg
             memory_info = gr.Markdown(label="Memory Usage Info", visible=False)
             with gr.Row():
+                with gr.Column(scale=1):
                     model_name = gr.Textbox(
+                        value="My Synthetic Model",
+                        label="Model Name",
+                        placeholder="Enter a name for your model",
+                        info="Appears in training runs and saved generators."
+                    )
+                    epochs = gr.Slider(
+                        1, 200, value=100, step=1, label="Training Epochs",
+                        info="Maximum number of passes over the training data."
+                    )
+                    max_training_time = gr.Slider(
+                        1, 1000, value=60, step=1, label="Maximum Training Time (minutes)",
+                        info="Upper bound in minutes; training stops if exceeded."
+                    )
+                    batch_size = gr.Slider(
+                        8, 1024, value=32, step=8, label="Batch Size",
+                        info="Number of rows per optimization step. Larger can speed up but needs more memory."
+                    )
+                    value_protection = gr.Checkbox(
+                        label="Value Protection",
+                        info="Adds protections to reduce memorization of unique or sensitive values.",
+                        value=True
+                    )
+                    rare_category_protection = gr.Checkbox(
+                        label="Rare Category Protection",
+                        info="Prevents overfitting to infrequent categories to improve privacy and robustness.",
+                        value=True
+                    )
+                with gr.Column(scale=1):
+                    flexible_generation = gr.Checkbox(
+                        label="Flexible Generation",
+                        info="Allows generation when inputs slightly differ from training schema.",
+                        value=True
+                    )
+                    model_size = gr.Dropdown(
+                        choices=["SMALL", "MEDIUM", "LARGE"],
+                        value="MEDIUM",
+                        label="Model Size",
+                        info="Sets model capacity. Larger can improve fidelity but uses more compute."
+                    )
+                    target_accuracy = gr.Slider(
+                        0.50, 0.999, value=0.95, step=0.001, label="Target Accuracy",
+                        info="Stop early when validation accuracy reaches this threshold."
+                    )
+                    validation_split = gr.Slider(
+                        0.05, 0.5, value=0.2, step=0.01, label="Validation Split",
+                        info="Fraction of the dataset held out for validation during training."
+                    )
+                    early_stopping_patience = gr.Slider(
+                        0, 50, value=10, step=1, label="Early Stopping Patience (epochs)",
+                        info="Stop if no validation improvement after this many epochs."
+                    )
+                with gr.Column(scale=1):
+                    learning_rate = gr.Number(
+                        value=0.001, precision=6, label="Learning Rate",
+                        info="Step size for the optimizer. Typical range: 1e-4 to 1e-2."
+                    )
+                    dropout_rate = gr.Slider(
+                        0.0, 0.6, value=0.1, step=0.01, label="Dropout Rate",
+                        info="Regularization to reduce overfitting by randomly dropping units."
+                    )
+                    weight_decay = gr.Number(
+                        value=0.0001, precision=6, label="Weight Decay",
+                        info="L2 regularization strength applied to model weights."
                     )
                     train_btn = gr.Button("Train Model", variant="primary")
                     train_status = gr.Textbox(label="Training Status", interactive=False)
         with gr.Tab("Generate Data"):
             gr.Markdown("### Generate synthetic data from your trained model")
             with gr.Row():
                 with gr.Column():
+                    gen_size = gr.Slider(10, 1000, value=100, step=10, label="Number of Records to Generate",
+                                         info="How many synthetic rows to create in the table.")
                     generate_btn = gr.Button("Generate Synthetic Data", variant="primary")
                 with gr.Column():
                     gen_status = gr.Textbox(label="Generation Status", interactive=False)
         train_btn.click(
             train_model,
+            inputs=[
+                uploaded_data, model_name,
+                epochs, max_training_time, batch_size,
+                value_protection, rare_category_protection, flexible_generation,
+                model_size, target_accuracy, validation_split,
+                learning_rate, early_stopping_patience, dropout_rate, weight_decay
+            ],
             outputs=[train_status],
         )