Spaces:

mostlyai
/

synthetic-sdk-demo

Running

File size: 14,618 Bytes

f376d1d

import gradio as gr
import pandas as pd
import numpy as np
import io
import base64
from typing import Optional, Tuple
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")

# Import Mostly AI SDK
# The SDK is treated as optional at import time: when it is missing the module
# still loads, a warning is printed, and MOSTLY_AI_AVAILABLE records the outcome
# so later code can report the problem instead of failing with a NameError.
try:
    from mostlyai.sdk import MostlyAI
    MOSTLY_AI_AVAILABLE = True
except ImportError:
    MOSTLY_AI_AVAILABLE = False
    print("Warning: Mostly AI SDK not available. Please install with: pip install mostlyai[local]")

class SyntheticDataGenerator:
    """Stateful wrapper around a Mostly AI client and one trained generator.

    State is filled in step by step: ``initialize_mostly_ai`` creates the
    client, ``train_generator`` stores the trained generator together with the
    data it was trained on, and the remaining methods read that state.
    Every method reports failures through its return value instead of raising.
    """

    def __init__(self):
        # SDK client; stays None until initialize_mostly_ai() succeeds.
        self.mostly = None
        # Object returned by the most recent successful train() call.
        self.generator = None
        # DataFrame the current generator was trained on.
        self.original_data = None

    def initialize_mostly_ai(self):
        """Create the SDK client in local mode.

        Returns:
            ``(success, message)`` where ``message`` is suitable for display.
        """
        if not MOSTLY_AI_AVAILABLE:
            return False, "Mostly AI SDK not installed. Please install with: pip install mostlyai[local]"

        try:
            self.mostly = MostlyAI(local=True, local_port=8080)
            return True, "Mostly AI SDK initialized successfully!"
        except Exception as e:
            return False, f"Failed to initialize Mostly AI SDK: {e}"

    def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
        """Train a single-table generator on ``data``.

        Args:
            data: Training table.
            name: Table name placed in the training config and echoed in the
                success message.
            epochs: Passed as ``max_epochs``.
            max_training_time: Passed as ``max_training_time``.
            batch_size: Passed as ``batch_size``.
            value_protection: Passed as ``value_protection``.

        Returns:
            ``(success, message)``. On failure the previously stored generator
            and its training data are left untouched.
        """
        if self.mostly is None:
            return False, "Mostly AI SDK not initialized"

        train_config = {
            'tables': [
                {
                    'name': name,
                    'data': data,
                    'tabular_model_configuration': {
                        'max_epochs': epochs,
                        'max_training_time': max_training_time,
                        'value_protection': value_protection,
                        'batch_size': batch_size,
                    },
                }
            ]
        }

        try:
            trained = self.mostly.train(config=train_config)
        except Exception as e:
            return False, f"Training failed: {e}"

        # Commit both fields together, and only after training succeeded, so
        # original_data always describes the generator that is actually stored.
        self.generator = trained
        self.original_data = data
        return True, f"Generator trained successfully! Model: {name}"

    def generate_synthetic_data(self, size: int) -> Tuple[Optional[pd.DataFrame], str]:
        """Generate ``size`` synthetic records from the stored generator.

        Returns:
            ``(dataframe, message)``; ``dataframe`` is ``None`` when no
            generator has been trained or when generation fails.
        """
        if self.generator is None:
            return None, "No trained generator available"

        try:
            synthetic_data = self.mostly.generate(self.generator, size=size)
            df = synthetic_data.data()
            return df, f"Generated {len(df)} synthetic records successfully!"
        except Exception as e:
            return None, f"Generation failed: {e}"

    def get_quality_report(self) -> str:
        """Return the string form of the stored generator's ``reports(display=False)`` result.

        Returns an explanatory message instead when no generator is available
        or when the call fails.
        """
        if self.generator is None:
            return "No trained generator available"

        try:
            report = self.generator.reports(display=False)
            return str(report)
        except Exception as e:
            return f"Failed to generate report: {e}"

    def estimate_memory_usage(self, df: pd.DataFrame) -> str:
        """Summarize the in-memory size of ``df`` as a Markdown snippet.

        Returns:
            ``"No data to analyze"`` for ``None`` or an empty frame; otherwise
            a multi-line Markdown string with the data size, a rough training
            memory estimate, a status label, and the row/column counts.
        """
        if df is None or df.empty:
            return "No data to analyze"

        # deep=True also counts the contents of object columns (e.g. strings).
        memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
        rows, cols = len(df), len(df.columns)

        # Heuristic: training is assumed to need roughly 3-5x the data size.
        estimated_training_mb = memory_mb * 4

        if memory_mb < 100:
            status = "✅ Good"
        elif memory_mb < 500:
            status = "⚠️ Large"
        else:
            status = "❌ Very Large"

        return "\n".join([
            "**Memory Usage Estimate:**",
            f"- Data size: {memory_mb:.1f} MB",
            f"- Estimated training memory: {estimated_training_mb:.1f} MB",
            f"- Status: {status}",
            f"- Rows: {rows:,} | Columns: {cols}",
        ])

# Initialize the generator
# One module-level instance: every handler function in this module reads and
# mutates this same object, so its state is shared across all callers in the
# process.
generator = SyntheticDataGenerator()


def initialize_sdk() -> Tuple[str, str]:
    """Initialize the SDK on the shared generator and label the outcome.

    Returns:
        ``(status, message)`` where ``status`` is a success/error label and
        ``message`` is the text reported by the generator.
    """
    ok, detail = generator.initialize_mostly_ai()
    if ok:
        return "✅ Success", detail
    return "❌ Error", detail

def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> Tuple[str, str]:
    """Validate the input table, then train on the shared generator.

    Returns:
        ``(status, message)``. Missing or empty data is rejected up front
        without calling the generator.
    """
    has_rows = data is not None and not data.empty
    if not has_rows:
        return "❌ Error", "Please upload or create sample data first"

    ok, detail = generator.train_generator(
        data,
        model_name,
        epochs,
        max_training_time,
        batch_size,
        value_protection,
    )
    if ok:
        return "✅ Success", detail
    return "❌ Error", detail

def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
    """Produce ``size`` synthetic records from the shared generator.

    Returns:
        ``(dataframe, text)``. ``dataframe`` is ``None`` when no model has been
        trained or generation failed; ``text`` is a status-prefixed message.
    """
    if generator.generator is None:
        return None, "❌ Please train a model first"

    synthetic_df, message = generator.generate_synthetic_data(size)
    # A missing frame is the generator's way of signalling failure.
    status = "❌ Error" if synthetic_df is None else "✅ Success"
    return synthetic_df, f"{status} - {message}"

def get_quality_report() -> str:
    """Return the quality report text produced by the shared generator."""
    report_text = generator.get_quality_report()
    return report_text

def create_comparison_plot(original_df: pd.DataFrame, synthetic_df: pd.DataFrame) -> "Optional[go.Figure]":
    """Build per-column histograms comparing original and synthetic data.

    One subplot is drawn for every numeric column of ``original_df`` that also
    exists in ``synthetic_df``, laid out in a grid of at most three columns.
    Each subplot holds two histogram traces: the original values first, then
    the synthetic values.

    Args:
        original_df: The source data, or ``None``.
        synthetic_df: The generated data, or ``None``.

    Returns:
        The figure, or ``None`` when either frame is missing or the frames
        share no numeric column.
    """
    if original_df is None or synthetic_df is None:
        return None

    # Only columns present in both frames can be compared; indexing the
    # synthetic frame with a column it lacks would raise KeyError.
    numeric_cols = [
        col
        for col in original_df.select_dtypes(include=[np.number]).columns
        if col in synthetic_df.columns
    ]
    if not numeric_cols:
        return None

    n_cols = min(3, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # ceiling division

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=numeric_cols,
    )

    sources = (("Original", original_df), ("Synthetic", synthetic_df))
    for position, col in enumerate(numeric_cols):
        grid_row, grid_col = divmod(position, n_cols)
        for label, frame in sources:
            fig.add_trace(
                go.Histogram(
                    x=frame[col],
                    name=f'{label} {col}',
                    opacity=0.7,
                    nbinsx=20,
                ),
                # Plotly subplot coordinates are 1-based.
                row=grid_row + 1,
                col=grid_col + 1,
            )

    fig.update_layout(
        title="Original vs Synthetic Data Comparison",
        height=300 * n_rows,
        showlegend=True,
    )

    return fig

def download_csv(df: pd.DataFrame) -> Optional[str]:
    """Write ``df`` to a temporary CSV file and return that file's path.

    The result feeds a ``gr.DownloadButton``, whose value must be a file path
    (or URL), not the CSV text itself, so the data is written to disk first.

    Args:
        df: The data to export, or ``None``.

    Returns:
        Path of the written ``.csv`` file, or ``None`` when ``df`` is ``None``
        or empty.
    """
    import tempfile  # local import: nothing else in the module needs it

    if df is None or df.empty:
        return None

    # delete=False keeps the file on disk after the handle closes so it can be
    # served later; removing old files is left to the OS temp-dir cleanup.
    # newline="" lets pandas control line endings.
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".csv",
        prefix="synthetic_data_",
        delete=False,
        newline="",
        encoding="utf-8",
    ) as handle:
        df.to_csv(handle, index=False)

    return handle.name

# Create the Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and wire its event handlers.

    The app has three tabs: SDK initialization, CSV upload plus model training,
    and synthetic data generation with download and a comparison plot.

    Returns:
        The constructed ``gr.Blocks`` app; it is not launched here.
    """

    def run_initialize():
        """Initialize the SDK and return one combined status line."""
        status, message = initialize_sdk()
        return f"{status} - {message}"

    def run_training(data, name, n_epochs, time_limit, batch, protect):
        """Train the model and return one combined status line."""
        status, message = train_model(data, name, n_epochs, time_limit, batch, protect)
        return f"{status} - {message}"

    def process_uploaded_file(file):
        """Load the uploaded CSV.

        Returns a 3-tuple: the DataFrame (or ``None``), a status message, and a
        component update for the memory-usage panel.
        """
        if file is None:
            return None, "No file uploaded", gr.update(visible=False)

        try:
            # Read the CSV file
            df = pd.read_csv(file.name)

            # NOTE: row/column limit validation used to live here and is
            # currently disabled; every CSV that parses is accepted.

            # Success message with file info
            success_msg = f"✅ File uploaded successfully! {len(df)} rows × {len(df.columns)} columns"

            # Named differently from the `memory_info` component so the two
            # are not confused.
            memory_summary = generator.estimate_memory_usage(df)

            return df, success_msg, gr.update(value=memory_summary, visible=True)

        except Exception as e:
            return None, f"❌ Error reading file: {e}", gr.update(visible=False)

    with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎭 MOSTLY AI Synthetic Data Generator

        Generate high-quality synthetic data using the Mostly AI SDK. Upload your own CSV files to generate synthetic data that preserves the statistical properties of your original dataset.
        """)

        with gr.Tab("🚀 Quick Start"):
            gr.Markdown("### Initialize the SDK and upload your data")

            with gr.Row():
                with gr.Column():
                    init_btn = gr.Button("Initialize Mostly AI SDK", variant="primary")
                    init_status = gr.Textbox(label="Initialization Status", interactive=False)

                with gr.Column():
                    gr.Markdown("""
                    **Next Steps:**
                    1. Initialize the SDK (click button above)
                    2. Go to "Upload Data and Train Model" tab to upload your CSV file
                    3. Train a model on your data
                    4. Generate synthetic data
                    """)

        with gr.Tab("📊 Upload Data and Train Model"):
            gr.Markdown("### Upload your CSV file to generate synthetic data")

            gr.Markdown("""
            **📋 File Requirements:**
            - **Format:** CSV with header row
            - **Size:** Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
            """)

            file_upload = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                file_count="single"
            )

            uploaded_data = gr.Dataframe(label="Uploaded Data", interactive=False)

            # Hidden until a file has been processed successfully.
            memory_info = gr.Markdown(label="Memory Usage Info", visible=False)

            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(
                        value="My Synthetic Model",
                        label="Model Name",
                        placeholder="Enter a name for your model"
                    )
                    epochs = gr.Slider(1, 200, value=100, step=1, label="Training Epochs")
                    max_training_time = gr.Slider(1, 1000, value=60, step=1, label="Maximum Training Time")
                    batch_size = gr.Slider(8, 1024, value=32, step=8, label="Training Batch Size")
                    value_protection = gr.Checkbox(label="Value Protection", info="Enable Value Protection")
                    train_btn = gr.Button("Train Model", variant="primary")

                with gr.Column():
                    train_status = gr.Textbox(label="Training Status", interactive=False)
                    quality_report = gr.Textbox(label="Quality Report", lines=10, interactive=False)

            get_report_btn = gr.Button("Get Quality Report", variant="secondary")

        with gr.Tab("🎲 Generate Data"):
            gr.Markdown("### Generate synthetic data from your trained model")

            with gr.Row():
                with gr.Column():
                    gen_size = gr.Slider(10, 1000, value=100, step=10, label="Number of Records to Generate")
                    generate_btn = gr.Button("Generate Synthetic Data", variant="primary")

                with gr.Column():
                    gen_status = gr.Textbox(label="Generation Status", interactive=False)

            synthetic_data = gr.Dataframe(label="Synthetic Data", interactive=False)

            with gr.Row():
                download_btn = gr.DownloadButton("Download CSV", variant="secondary")
                comparison_plot = gr.Plot(label="Data Comparison")

        # Event handlers
        # Each status textbox is listed once as an output and receives a single
        # combined string; listing the same component twice would send both
        # return values to it and drop the status label.
        init_btn.click(
            run_initialize,
            outputs=[init_status]
        )

        train_btn.click(
            run_training,
            inputs=[uploaded_data, model_name, epochs, max_training_time, batch_size, value_protection],
            outputs=[train_status]
        )

        get_report_btn.click(
            get_quality_report,
            outputs=[quality_report]
        )

        generate_btn.click(
            generate_data,
            inputs=[gen_size],
            outputs=[synthetic_data, gen_status]
        )

        # Update download button when synthetic data changes
        synthetic_data.change(
            download_csv,
            inputs=[synthetic_data],
            outputs=[download_btn]
        )

        # Create comparison plot when both datasets are available
        synthetic_data.change(
            create_comparison_plot,
            inputs=[uploaded_data, synthetic_data],
            outputs=[comparison_plot]
        )

        # Load the uploaded CSV; the upload message is shown in the training
        # status box.
        file_upload.change(
            process_uploaded_file,
            inputs=[file_upload],
            outputs=[uploaded_data, train_status, memory_info]
        )

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 with Gradio's share option enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)