Create app.py
app.py
ADDED
@@ -0,0 +1,380 @@
import gradio as gr
import pandas as pd
import numpy as np
import io
import base64
import tempfile
from typing import Optional, Tuple
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

warnings.filterwarnings("ignore")

# Import Mostly AI SDK
try:
    from mostlyai.sdk import MostlyAI
    MOSTLY_AI_AVAILABLE = True
except ImportError:
    MOSTLY_AI_AVAILABLE = False
    print("Warning: Mostly AI SDK not available. Please install with: pip install mostlyai[local]")

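# Note (assumption, not part of the original commit): for the import above to succeed on a
# Hugging Face Space, the Space's requirements.txt would need an entry matching the install
# hint printed in the except branch, e.g.:
#   mostlyai[local]
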
class SyntheticDataGenerator:
    def __init__(self):
        self.mostly = None
        self.generator = None
        self.original_data = None

    def initialize_mostly_ai(self):
        """Initialize Mostly AI SDK"""
        if not MOSTLY_AI_AVAILABLE:
            return False, "Mostly AI SDK not installed. Please install with: pip install mostlyai[local]"

        try:
            self.mostly = MostlyAI(local=True, local_port=8080)
            return True, "Mostly AI SDK initialized successfully!"
        except Exception as e:
            return False, f"Failed to initialize Mostly AI SDK: {str(e)}"

    def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
        """Train the synthetic data generator"""
        if not self.mostly:
            return False, "Mostly AI SDK not initialized"

        try:
            self.original_data = data
            train_config = {
                'tables': [
                    {
                        'name': name,
                        'data': data,
                        'tabular_model_configuration': {
                            'max_epochs': epochs,
                            'max_training_time': max_training_time,
                            'value_protection': value_protection,
                            'batch_size': batch_size
                        }
                    }
                ]
            }

            self.generator = self.mostly.train(config=train_config)
            return True, f"Generator trained successfully! Model: {name}"
        except Exception as e:
            return False, f"Training failed: {str(e)}"

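    # Sketch (added explanation, not in the original file): the same train/generate round trip,
    # written directly against the SDK calls used in this class, would look roughly like:
    #
    #   mostly = MostlyAI(local=True)
    #   g = mostly.train(config={"tables": [{"name": "demo", "data": df,
    #       "tabular_model_configuration": {"max_epochs": 10}}]})
    #   sd = mostly.generate(g, size=500)
    #   synthetic_df = sd.data()
    #
    # Only the calls already present in this class (train, generate, data) are assumed here.
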
    def generate_synthetic_data(self, size: int) -> Tuple[pd.DataFrame, str]:
        """Generate synthetic data"""
        if not self.generator:
            return None, "No trained generator available"

        try:
            synthetic_data = self.mostly.generate(self.generator, size=size)
            df = synthetic_data.data()
            return df, f"Generated {len(df)} synthetic records successfully!"
        except Exception as e:
            return None, f"Generation failed: {str(e)}"

    def get_quality_report(self) -> str:
        """Get quality assurance report"""
        if not self.generator:
            return "No trained generator available"

        try:
            report = self.generator.reports(display=False)
            return str(report)
        except Exception as e:
            return f"Failed to generate report: {str(e)}"

    def estimate_memory_usage(self, df: pd.DataFrame) -> str:
        """Estimate memory usage for the dataset"""
        if df is None or df.empty:
            return "No data to analyze"

        # Calculate approximate memory usage
        memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
        rows, cols = len(df), len(df.columns)

        # Estimate training memory (roughly 3-5x the data size)
        estimated_training_mb = memory_mb * 4

        status = "✅ Good" if memory_mb < 100 else "⚠️ Large" if memory_mb < 500 else "❌ Very Large"

        return f"""
**Memory Usage Estimate:**
- Data size: {memory_mb:.1f} MB
- Estimated training memory: {estimated_training_mb:.1f} MB
- Status: {status}
- Rows: {rows:,} | Columns: {cols}
""".strip()

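# Note (added explanation, not in the original commit): df.memory_usage(deep=True) returns
# per-column byte counts including Python object overhead for string columns, so the MB figure
# above reflects in-memory size rather than CSV file size. The 4x training-memory multiplier is
# the rough heuristic stated in the method's comment, not an SDK-provided value. For example:
#   pd.DataFrame({"a": range(1_000)}).memory_usage(deep=True).sum()  # ~8 KB of int64 data
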
# Initialize the generator
generator = SyntheticDataGenerator()


def initialize_sdk() -> str:
    """Initialize the Mostly AI SDK and report the result"""
    success, message = generator.initialize_mostly_ai()
    status = "✅ Success" if success else "❌ Error"
    return f"{status} - {message}"

def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> str:
    """Train the synthetic data generator"""
    if data is None or data.empty:
        return "❌ Error - Please upload or create sample data first"

    success, message = generator.train_generator(data, model_name, epochs, max_training_time, batch_size, value_protection)
    status = "✅ Success" if success else "❌ Error"
    return f"{status} - {message}"

def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
    """Generate synthetic data"""
    if generator.generator is None:
        return None, "❌ Please train a model first"

    synthetic_df, message = generator.generate_synthetic_data(size)
    if synthetic_df is not None:
        status = "✅ Success"
    else:
        status = "❌ Error"

    return synthetic_df, f"{status} - {message}"

def get_quality_report() -> str:
    """Get quality report"""
    return generator.get_quality_report()

def create_comparison_plot(original_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> go.Figure:
    """Create comparison plots between original and synthetic data"""
    if original_df is None or synthetic_df is None:
        return None

    # Select numeric columns for comparison
    numeric_cols = original_df.select_dtypes(include=[np.number]).columns.tolist()

    if not numeric_cols:
        return None

    # Create subplots
    n_cols = min(3, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=numeric_cols[:n_rows * n_cols]
    )

    for i, col in enumerate(numeric_cols[:n_rows * n_cols]):
        row = i // n_cols + 1
        col_idx = i % n_cols + 1

        # Add original data histogram
        fig.add_trace(
            go.Histogram(
                x=original_df[col],
                name=f'Original {col}',
                opacity=0.7,
                nbinsx=20
            ),
            row=row, col=col_idx
        )

        # Add synthetic data histogram
        fig.add_trace(
            go.Histogram(
                x=synthetic_df[col],
                name=f'Synthetic {col}',
                opacity=0.7,
                nbinsx=20
            ),
            row=row, col=col_idx
        )

    fig.update_layout(
        title="Original vs Synthetic Data Comparison",
        height=300 * n_rows,
        showlegend=True
    )

    return fig

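# Sketch (added suggestion, not in the original commit): plotly.express is imported above but
# unused; an equivalent per-column comparison could be built more compactly with it, assuming
# the two frames are concatenated with a "source" label column, e.g.:
#
#   combined = pd.concat([original_df.assign(source="original"),
#                         synthetic_df.assign(source="synthetic")])
#   fig = px.histogram(combined, x=col, color="source", barmode="overlay", nbins=20)
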
def download_csv(df: pd.DataFrame) -> Optional[str]:
    """Write the DataFrame to a temporary CSV file and return its path for download"""
    if df is None or df.empty:
        return None

    # gr.DownloadButton expects a file path rather than raw CSV text
    tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
    tmp.close()
    df.to_csv(tmp.name, index=False)
    return tmp.name

# Create the Gradio interface
def create_interface():
    with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # MOSTLY AI Synthetic Data Generator

        Generate high-quality synthetic data using the Mostly AI SDK. Upload your own CSV files to generate synthetic data that preserves the statistical properties of your original dataset.
        """)

        with gr.Tab("Quick Start"):
            gr.Markdown("### Initialize the SDK and upload your data")

            with gr.Row():
                with gr.Column():
                    init_btn = gr.Button("Initialize Mostly AI SDK", variant="primary")
                    init_status = gr.Textbox(label="Initialization Status", interactive=False)

                with gr.Column():
                    gr.Markdown("""
                    **Next Steps:**
                    1. Initialize the SDK (click button above)
                    2. Go to "Upload Data and Train Model" tab to upload your CSV file
                    3. Train a model on your data
                    4. Generate synthetic data
                    """)

        with gr.Tab("Upload Data and Train Model"):
            gr.Markdown("### Upload your CSV file to generate synthetic data")

            gr.Markdown("""
            **File Requirements:**
            - **Format:** CSV with header row
            - **Size:** Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
            """)

            file_upload = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                file_count="single"
            )

            uploaded_data = gr.Dataframe(label="Uploaded Data", interactive=False)

            memory_info = gr.Markdown(label="Memory Usage Info", visible=False)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(
                        value="My Synthetic Model",
                        label="Model Name",
                        placeholder="Enter a name for your model"
                    )
                    epochs = gr.Slider(1, 200, value=100, step=1, label="Training Epochs")
                    max_training_time = gr.Slider(1, 1000, value=60, step=1, label="Maximum Training Time")
                    batch_size = gr.Slider(8, 1024, value=32, step=8, label="Training Batch Size")
                    value_protection = gr.Checkbox(label="Value Protection", info="Enable Value Protection")
                    train_btn = gr.Button("Train Model", variant="primary")

                with gr.Column():
                    train_status = gr.Textbox(label="Training Status", interactive=False)
                    quality_report = gr.Textbox(label="Quality Report", lines=10, interactive=False)

            get_report_btn = gr.Button("Get Quality Report", variant="secondary")

        with gr.Tab("Generate Data"):
            gr.Markdown("### Generate synthetic data from your trained model")

            with gr.Row():
                with gr.Column():
                    gen_size = gr.Slider(10, 1000, value=100, step=10, label="Number of Records to Generate")
                    generate_btn = gr.Button("Generate Synthetic Data", variant="primary")

                with gr.Column():
                    gen_status = gr.Textbox(label="Generation Status", interactive=False)

            synthetic_data = gr.Dataframe(label="Synthetic Data", interactive=False)

            with gr.Row():
                download_btn = gr.DownloadButton("Download CSV", variant="secondary")
                comparison_plot = gr.Plot(label="Data Comparison")

        # Event handlers
        init_btn.click(
            initialize_sdk,
            outputs=[init_status]
        )

        train_btn.click(
            train_model,
            inputs=[uploaded_data, model_name, epochs, max_training_time, batch_size, value_protection],
            outputs=[train_status]
        )

        get_report_btn.click(
            get_quality_report,
            outputs=[quality_report]
        )

        generate_btn.click(
            generate_data,
            inputs=[gen_size],
            outputs=[synthetic_data, gen_status]
        )

        # Update download button when synthetic data changes
        synthetic_data.change(
            download_csv,
            inputs=[synthetic_data],
            outputs=[download_btn]
        )

        # Create comparison plot when both datasets are available
        synthetic_data.change(
            create_comparison_plot,
            inputs=[uploaded_data, synthetic_data],
            outputs=[comparison_plot]
        )

        # Handle file upload with size and column limits
        def process_uploaded_file(file):
            if file is None:
                return None, "No file uploaded", gr.update(visible=False)

            try:
                # Read the CSV file (gr.File may pass a file path or a tempfile-like object)
                file_path = file if isinstance(file, str) else file.name
                df = pd.read_csv(file_path)

                # # Check column limit (max 20 columns)
                # if len(df.columns) > 20:
                #     return None, f"❌ Too many columns! Maximum allowed: 20, found: {len(df.columns)}. Please reduce the number of columns in your CSV file.", gr.update(visible=False)

                # # Check row limit (max 10,000 records)
                # if len(df) > 10000:
                #     return None, f"❌ Too many records! Maximum allowed: 10,000, found: {len(df)}. Please reduce the number of rows in your CSV file.", gr.update(visible=False)

                # # Check minimum requirements
                # if len(df) < 1000:
                #     return None, f"❌ Too few records! Minimum required: 1,000, found: {len(df)}. Please provide more data for training.", gr.update(visible=False)

                # if len(df.columns) < 2:
                #     return None, f"❌ Too few columns! Minimum required: 2, found: {len(df.columns)}. Please provide more columns for training.", gr.update(visible=False)

                # Success message with file info
                success_msg = f"✅ File uploaded successfully! {len(df)} rows × {len(df.columns)} columns"

                # Generate memory usage info
                memory_text = generator.estimate_memory_usage(df)

                return df, success_msg, gr.update(value=memory_text, visible=True)

            except Exception as e:
                return None, f"❌ Error reading file: {str(e)}", gr.update(visible=False)

        file_upload.change(
            process_uploaded_file,
            inputs=[file_upload],
            outputs=[uploaded_data, train_status, memory_info]
        )

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )