Spaces:

mostlyai
/

synthetic-sdk-demo

Sleeping

App Files

xet

Community

ZennyKenny committed on Sep 15

Commit

6ce5adf

verified ·

1 Parent(s): 01b4e73

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -65

app.py CHANGED Viewed

@@ -26,7 +26,6 @@ class SyntheticDataGenerator:
         self.original_data = None
     def initialize_mostly_ai(self) -> Tuple[bool, str]:
-        """Initialize Mostly AI SDK"""
         if not MOSTLY_AI_AVAILABLE:
             return False, "Mostly AI SDK not installed. Please install with: pip install mostlyai[local]"
         try:
@@ -44,7 +43,6 @@ class SyntheticDataGenerator:
         batch_size: int = 32,
         value_protection: bool = True,
     ) -> Tuple[bool, str]:
-        """Train the synthetic data generator"""
         if not self.mostly:
             return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
         try:
@@ -63,14 +61,12 @@ class SyntheticDataGenerator:
                     }
                 ]
             }
             self.generator = self.mostly.train(config=train_config)
             return True, f"Training completed successfully. Model name: {name}"
         except Exception as e:
             return False, f"Training failed with error: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[Optional[pd.DataFrame], str]:
-        """Generate synthetic data"""
         if not self.generator:
             return None, "No trained generator available. Please train a model first."
         try:
@@ -82,27 +78,28 @@ class SyntheticDataGenerator:
     def get_quality_report_file(self) -> Optional[str]:
         """
-        Generate/export the quality report and return a file path for download.
-        Tries to find an existing ZIP; otherwise saves a TXT fallback.
         """
         if not self.generator:
             return None
         try:
             rep = self.generator.reports(display=False)
-            # 1) If a string path to a .zip is returned
             if isinstance(rep, str) and rep.endswith(".zip") and os.path.exists(rep):
                 return rep
-            # 2) If the object exposes a path-like attribute
             for attr in ("archive_path", "zip_path", "path", "file_path"):
                 if hasattr(rep, attr):
                     p = getattr(rep, attr)
                     if isinstance(p, str) and os.path.exists(p):
                         return p
-            # 3) If the object can save/export itself
-            target_zip = "/mnt/data/quality_report.zip"
             if hasattr(rep, "save"):
                 try:
                     rep.save(target_zip)
@@ -118,8 +115,8 @@ class SyntheticDataGenerator:
                 except Exception:
                     pass
-            # 4) Fallback: write string representation
-            target_txt = "/mnt/data/quality_report.txt"
             with open(target_txt, "w", encoding="utf-8") as f:
                 f.write(str(rep))
             return target_txt
@@ -128,21 +125,12 @@ class SyntheticDataGenerator:
             return None
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
-        """Estimate memory usage for the dataset"""
         if df is None or df.empty:
             return "No data available to analyze."
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
         rows, cols = len(df), len(df.columns)
         estimated_training_mb = memory_mb * 4
-        if memory_mb < 100:
-            status = "Good"
-        elif memory_mb < 500:
-            status = "Large"
-        else:
-            status = "Very Large"
         return f"""
 Memory Usage Estimate:
 - Data size: {memory_mb:.1f} MB
@@ -152,10 +140,12 @@ Memory Usage Estimate:
         """.strip()
-# Initialize the generator
 generator = SyntheticDataGenerator()
-# ---- Wrapper functions for Gradio ----
 def initialize_sdk() -> str:
     ok, msg = generator.initialize_mostly_ai()
     return ("Success: " if ok else "Error: ") + msg
@@ -178,62 +168,53 @@ def train_model(
 def generate_data(size: int) -> Tuple[Optional[pd.DataFrame], str]:
-    synthetic_df, message = generator.generate_synthetic_data(size)
-    status = "Success" if synthetic_df is not None else "Error"
-    return synthetic_df, f"{status}: {message}"
-def create_comparison_plot(original_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> Optional[go.Figure]:
     if original_df is None or synthetic_df is None:
         return None
     numeric_cols = original_df.select_dtypes(include=[np.number]).columns.tolist()
     if not numeric_cols:
         return None
     n_cols = min(3, len(numeric_cols))
     n_rows = (len(numeric_cols) + n_cols - 1) // n_cols
     fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=numeric_cols[: n_rows * n_cols])
     for i, col in enumerate(numeric_cols[: n_rows * n_cols]):
         row = i // n_cols + 1
         col_idx = i % n_cols + 1
-        fig.add_trace(
-            go.Histogram(x=original_df[col], name=f"Original {col}", opacity=0.7, nbinsx=20),
-            row=row,
-            col=col_idx,
-        )
-        fig.add_trace(
-            go.Histogram(x=synthetic_df[col], name=f"Synthetic {col}", opacity=0.7, nbinsx=20),
-            row=row,
-            col=col_idx,
-        )
     fig.update_layout(title="Original vs Synthetic Data Comparison", height=300 * n_rows, showlegend=True)
     return fig
-def download_csv(df: pd.DataFrame) -> Optional[str]:
-    if df is None or df.empty:
-        return None
-    path = "/mnt/data/synthetic_data.csv"
-    df.to_csv(path, index=False)
-    return path
 # ---- UI ----
 def create_interface():
     with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
-        # Header image
         gr.Image(
             value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png",
             show_label=False,
             elem_id="header-image",
         )
-        # README
         gr.Markdown(
             """
         # Synthetic Data SDK by MOSTLY AI Demo Space
@@ -289,6 +270,7 @@ def create_interface():
                     train_status = gr.Textbox(label="Training Status", interactive=False)
             with gr.Row():
                 get_report_btn = gr.DownloadButton("Get Quality Report", variant="secondary")
         with gr.Tab("Generate Data"):
@@ -302,10 +284,11 @@ def create_interface():
             synthetic_data = gr.Dataframe(label="Synthetic Data", interactive=False)
             with gr.Row():
                 download_btn = gr.DownloadButton("Download CSV", variant="secondary")
                 comparison_plot = gr.Plot(label="Data Comparison")
-        # ---- Event handlers ----
         init_btn.click(initialize_sdk, outputs=[init_status])
         train_btn.click(
@@ -314,21 +297,18 @@ def create_interface():
             outputs=[train_status],
         )
-        # Direct download of quality report
-        get_report_btn.click(generator.get_quality_report_file, outputs=[get_report_btn])
-        # Generate data
         generate_btn.click(generate_data, inputs=[gen_size], outputs=[synthetic_data, gen_status])
-        # Update CSV DownloadButton whenever synthetic data changes
-        synthetic_data.change(download_csv, inputs=[synthetic_data], outputs=[download_btn])
         # Build comparison plot when both datasets are available
-        synthetic_data.change(
-            create_comparison_plot, inputs=[uploaded_data, synthetic_data], outputs=[comparison_plot]
-        )
-        # Handle file upload with size and column limits
         def process_uploaded_file(file):
             if file is None:
                 return None, "No file uploaded.", gr.update(visible=False)

         self.original_data = None
     def initialize_mostly_ai(self) -> Tuple[bool, str]:
         if not MOSTLY_AI_AVAILABLE:
             return False, "Mostly AI SDK not installed. Please install with: pip install mostlyai[local]"
         try:
         batch_size: int = 32,
         value_protection: bool = True,
     ) -> Tuple[bool, str]:
         if not self.mostly:
             return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
         try:
                     }
                 ]
             }
             self.generator = self.mostly.train(config=train_config)
             return True, f"Training completed successfully. Model name: {name}"
         except Exception as e:
             return False, f"Training failed with error: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[Optional[pd.DataFrame], str]:
         if not self.generator:
             return None, "No trained generator available. Please train a model first."
         try:
     def get_quality_report_file(self) -> Optional[str]:
         """
+        Build/export the quality report and return a file path for immediate download.
+        Uses /tmp for Spaces; tries ZIP, falls back to TXT.
         """
         if not self.generator:
             return None
         try:
             rep = self.generator.reports(display=False)
+            # If a string path to a .zip is returned
             if isinstance(rep, str) and rep.endswith(".zip") and os.path.exists(rep):
                 return rep
+            # If object exposes a path-like attribute
             for attr in ("archive_path", "zip_path", "path", "file_path"):
                 if hasattr(rep, attr):
                     p = getattr(rep, attr)
                     if isinstance(p, str) and os.path.exists(p):
                         return p
+            # Try saving/exporting
+            os.makedirs("/tmp", exist_ok=True)
+            target_zip = "/tmp/quality_report.zip"
             if hasattr(rep, "save"):
                 try:
                     rep.save(target_zip)
                 except Exception:
                     pass
+            # Fallback: stringify into TXT
+            target_txt = "/tmp/quality_report.txt"
             with open(target_txt, "w", encoding="utf-8") as f:
                 f.write(str(rep))
             return target_txt
             return None
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
         if df is None or df.empty:
             return "No data available to analyze."
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
         rows, cols = len(df), len(df.columns)
         estimated_training_mb = memory_mb * 4
+        status = "Good" if memory_mb < 100 else ("Large" if memory_mb < 500 else "Very Large")
         return f"""
 Memory Usage Estimate:
 - Data size: {memory_mb:.1f} MB
         """.strip()
+# App state
 generator = SyntheticDataGenerator()
+_last_synth_df: Optional[pd.DataFrame] = None  # store latest synthetic DF for download
+# ---- Gradio wrappers ----
 def initialize_sdk() -> str:
     ok, msg = generator.initialize_mostly_ai()
     return ("Success: " if ok else "Error: ") + msg
 def generate_data(size: int) -> Tuple[Optional[pd.DataFrame], str]:
+    global _last_synth_df
+    synth_df, message = generator.generate_synthetic_data(size)
+    if synth_df is not None:
+        _last_synth_df = synth_df.copy()
+        return synth_df, f"Success: {message}"
+    else:
+        return None, f"Error: {message}"
+def download_csv_now() -> Optional[str]:
+    """Write the most recent synthetic DF to /tmp and return the path for direct download."""
+    global _last_synth_df
+    if _last_synth_df is None or _last_synth_df.empty:
+        return None
+    os.makedirs("/tmp", exist_ok=True)
+    path = "/tmp/synthetic_data.csv"
+    _last_synth_df.to_csv(path, index=False)
+    return path
+def create_comparison_plot(original_df: pd.DataFrame, synthetic_df: pd.DataFrame):
     if original_df is None or synthetic_df is None:
         return None
     numeric_cols = original_df.select_dtypes(include=[np.number]).columns.tolist()
     if not numeric_cols:
         return None
     n_cols = min(3, len(numeric_cols))
     n_rows = (len(numeric_cols) + n_cols - 1) // n_cols
     fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=numeric_cols[: n_rows * n_cols])
     for i, col in enumerate(numeric_cols[: n_rows * n_cols]):
         row = i // n_cols + 1
         col_idx = i % n_cols + 1
+        fig.add_trace(go.Histogram(x=original_df[col], name=f"Original {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
+        fig.add_trace(go.Histogram(x=synthetic_df[col], name=f"Synthetic {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
     fig.update_layout(title="Original vs Synthetic Data Comparison", height=300 * n_rows, showlegend=True)
     return fig
 # ---- UI ----
 def create_interface():
     with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
         gr.Image(
             value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png",
             show_label=False,
             elem_id="header-image",
         )
         gr.Markdown(
             """
         # Synthetic Data SDK by MOSTLY AI Demo Space
                     train_status = gr.Textbox(label="Training Status", interactive=False)
             with gr.Row():
+                # This download button calls a function that returns a file path → download starts immediately
                 get_report_btn = gr.DownloadButton("Get Quality Report", variant="secondary")
         with gr.Tab("Generate Data"):
             synthetic_data = gr.Dataframe(label="Synthetic Data", interactive=False)
             with gr.Row():
+                # Same pattern: click → function returns the CSV path → immediate download
                 download_btn = gr.DownloadButton("Download CSV", variant="secondary")
                 comparison_plot = gr.Plot(label="Data Comparison")
+        # ---- Events ----
         init_btn.click(initialize_sdk, outputs=[init_status])
         train_btn.click(
             outputs=[train_status],
         )
+        # IMPORTANT: For DownloadButton, do NOT specify outputs — the returned path is auto-downloaded.
+        get_report_btn.click(generator.get_quality_report_file, inputs=None, outputs=None)
         generate_btn.click(generate_data, inputs=[gen_size], outputs=[synthetic_data, gen_status])
         # Build comparison plot when both datasets are available
+        synthetic_data.change(create_comparison_plot, inputs=[uploaded_data, synthetic_data], outputs=[comparison_plot])
+        # CSV download: return a path from the click handler (no outputs)
+        download_btn.click(download_csv_now, inputs=None, outputs=None)
+        # File upload handler
         def process_uploaded_file(file):
             if file is None:
                 return None, "No file uploaded.", gr.update(visible=False)