Spaces:

Shilpaj
/

MnistStudio

Sleeping

App Files Files Community

Shilpaj committed on Nov 17, 2024

Commit

6f5f635

1 Parent(s): 0d84fb8

Feat: Logic for model training and inference

Browse files

Files changed (9) hide show

scripts/__init__.py +0 -0
scripts/app.py +74 -0
scripts/inference/__init__.py +0 -0
scripts/inference/infer.py +0 -0
scripts/model.py +85 -0
scripts/training/__init__.py +0 -0
scripts/training/config.py +15 -0
scripts/training/model.py +37 -0
scripts/training/train.py +273 -0

scripts/__init__.py ADDED Viewed

File without changes

scripts/app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+@app.post("/train")
+async def train_model(config: dict):
+    network_config = NetworkConfig()
+    network_config.update(**config)
+    # Create model with configured architecture
+    model = Net(
+        block1=network_config.block1,
+        block2=network_config.block2,
+        block3=network_config.block3
+    )
+    # Start training with websocket updates
+    result = await train(model, network_config)
+    return result
+@app.websocket("/ws/compare")
+async def websocket_compare_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    try:
+        while True:
+            data = await websocket.receive_json()
+            if data.get("type") == "start_comparison":
+                # Create and train both models
+                model1_config = NetworkConfig()
+                model2_config = NetworkConfig()
+                # Update configs with received data
+                model1_config.update(**data["model1"])
+                model2_config.update(**data["model2"])
+                # Create models with respective configurations
+                model1 = Net(
+                    block1=model1_config.block1,
+                    block2=model1_config.block2,
+                    block3=model1_config.block3
+                )
+                model2 = Net(
+                    block1=model2_config.block1,
+                    block2=model2_config.block2,
+                    block3=model2_config.block3
+                )
+                # Train both models concurrently
+                tasks = [
+                    train(model1, model1_config, websocket),
+                    train(model2, model2_config, websocket)
+                ]
+                results = await asyncio.gather(*tasks)
+                # Send completion message
+                await websocket.send_json({
+                    "type": "comparison_complete",
+                    "data": {
+                        "model1": results[0],
+                        "model2": results[1]
+                    }
+                })
+    except Exception as e:
+        print(f"Error in websocket connection: {e}")
+    finally:
+        await websocket.close()
+@app.post("/compare")
+async def compare_models(request: Request):
+    data = await request.json()
+    return {"status": "started", "message": "Model comparison initiated"}
+@app.get("/train/compare")
+async def compare_page(request: Request):
+    return templates.TemplateResponse("train_compare.html", {"request": request})

scripts/inference/__init__.py ADDED Viewed

File without changes

scripts/inference/infer.py ADDED Viewed

File without changes

scripts/model.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class Net(nn.Module):
+    def __init__(self, kernels=[32, 64, 128]):
+        super(Net, self).__init__()
+        # First Convolutional Block
+        self.conv1 = nn.Conv2d(1, kernels[0], 3, padding=1)
+        self.bn1 = nn.BatchNorm2d(kernels[0])
+        # Second Convolutional Block
+        self.conv2 = nn.Conv2d(kernels[0], kernels[1], 3, padding=1)
+        self.bn2 = nn.BatchNorm2d(kernels[1])
+        # Third Convolutional Block
+        self.conv3 = nn.Conv2d(kernels[1], kernels[2], 3, padding=1)
+        self.bn3 = nn.BatchNorm2d(kernels[2])
+        self.pool = nn.MaxPool2d(2, 2)
+        self.dropout = nn.Dropout(0.25)
+        # Calculate the size after convolutions and pooling
+        # Input: 28x28 -> after three pooling layers: 7x7
+        # Final feature map size will be kernels[2] x 7 x 7
+        self.fc1 = nn.Linear(kernels[2] * 7 * 7, 256)
+        self.fc1_bn = nn.BatchNorm1d(256)
+        self.fc2 = nn.Linear(256, 10)
+        # Initialize weights
+        self._initialize_weights()
+    def forward(self, x):
+        # First conv block
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = F.relu(x)
+        x = self.pool(x)  # 28x28 -> 14x14
+        # Second conv block
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = F.relu(x)
+        x = self.pool(x)  # 14x14 -> 7x7
+        # Third conv block
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = F.relu(x)
+        # No pooling here to maintain spatial dimensions
+        # Flatten
+        x = x.view(-1, self.num_flat_features(x))
+        x = self.dropout(x)
+        # Fully connected layers
+        x = self.fc1(x)
+        x = self.fc1_bn(x)
+        x = F.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+    def num_flat_features(self, x):
+        size = x.size()[1:]
+        num_features = 1
+        for s in size:
+            num_features *= s
+        return num_features
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                # Xavier initialization for CONV layers
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                # Xavier initialization for FC layers
+                nn.init.xavier_uniform_(m.weight)
+                nn.init.zeros_(m.bias)

scripts/training/__init__.py ADDED Viewed

File without changes

scripts/training/config.py ADDED Viewed

	@@ -0,0 +1,15 @@

+BLOCK_OPTIONS = [8, 16, 32, 64, 128]
+class NetworkConfig:
+    def __init__(self):
+        self.block1 = 32
+        self.block2 = 64
+        self.block3 = 128
+        self.batch_size = 64
+        self.optimizer = 'SGD'
+        self.epochs = 1
+    def update(self, **kwargs):
+        for key, value in kwargs.items():
+            if hasattr(self, key):
+                setattr(self, key, value)

scripts/training/model.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv2d, MaxPool2d, Linear, Sequential, ReLU, LogSoftmax, Flatten
+class Net(torch.nn.Module):
+    def __init__(self, block1=32, block2=64, block3=128):
+        """
+        Constructor
+        """
+        super(Net, self).__init__()
+        # Define model architecture with configurable blocks
+        self.conv1 = nn.Conv2d(1, block1, kernel_size=3)
+        self.conv2 = nn.Conv2d(block1, block2, kernel_size=3)
+        self.conv3 = nn.Conv2d(block2, block3, kernel_size=3)
+        self.conv4 = nn.Conv2d(block3, block3*2, kernel_size=3)
+        # Calculate the input size for the first fully connected layer
+        self.fc1 = nn.Linear(block3*2*16, 50)
+        self.fc2 = nn.Linear(50, 10)
+    def forward(self, x):
+        """
+        Forward pass for model training
+        :param x: Input layer
+        :return: Output of the model
+        """
+        x = F.relu(self.conv1(x))
+        x = F.relu(F.max_pool2d(self.conv2(x), 2))
+        x = F.relu(self.conv3(x))
+        x = F.relu(F.max_pool2d(self.conv4(x), 2))
+        x = x.view(x.size(0), -1)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return x

scripts/training/train.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, Dataset
+from torchvision import transforms
+import numpy as np
+import gzip
+import os
+from pathlib import Path
+from datetime import datetime
+import urllib.request
+import shutil
+from tqdm import tqdm
+import asyncio
+def download_and_extract_mnist_data():
+    """Download and extract MNIST dataset from a reliable mirror"""
+    base_url = "https://storage.googleapis.com/cvdf-datasets/mnist/"
+    files = {
+        "train_images": "train-images-idx3-ubyte.gz",
+        "train_labels": "train-labels-idx1-ubyte.gz",
+        "test_images": "t10k-images-idx3-ubyte.gz",
+        "test_labels": "t10k-labels-idx1-ubyte.gz"
+    }
+    data_dir = Path("data/MNIST/raw")
+    data_dir.mkdir(parents=True, exist_ok=True)
+    for file_name in files.values():
+        gz_file_path = data_dir / file_name
+        extracted_file_path = data_dir / file_name.replace('.gz', '')
+        # If the extracted file exists, skip downloading
+        if extracted_file_path.exists():
+            print(f"{extracted_file_path} already exists, skipping download.")
+            continue
+        # Download the file
+        print(f"Downloading {file_name}...")
+        url = base_url + file_name
+        try:
+            urllib.request.urlretrieve(url, gz_file_path)
+            print(f"Successfully downloaded {file_name}")
+        except Exception as e:
+            print(f"Failed to download {file_name}: {e}")
+            raise Exception(f"Could not download {file_name}")
+        # Extract the files
+        try:
+            print(f"Extracting {file_name}...")
+            with gzip.open(gz_file_path, 'rb') as f_in:
+                with open(extracted_file_path, 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            print(f"Successfully extracted {file_name}")
+        except Exception as e:
+            print(f"Failed to extract {file_name}: {e}")
+            raise Exception(f"Could not extract {file_name}")
+def load_mnist_images(filename):
+    with open(filename, 'rb') as f:
+        data = np.frombuffer(f.read(), np.uint8, offset=16)
+    return data.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0
+def load_mnist_labels(filename):
+    with open(filename, 'rb') as f:
+        return np.frombuffer(f.read(), np.uint8, offset=8)
+class CustomMNISTDataset(Dataset):
+    def __init__(self, images_path, labels_path, transform=None):
+        self.images = load_mnist_images(images_path)
+        self.labels = load_mnist_labels(labels_path)
+        self.transform = transform
+    def __len__(self):
+        return len(self.labels)
+    def __getitem__(self, idx):
+        image = torch.FloatTensor(self.images[idx])
+        label = int(self.labels[idx])
+        if self.transform:
+            image = self.transform(image)
+        return image, label
+def validate(model, test_loader, criterion, device):
+    """Modified validate function to handle validation properly"""
+    model.eval()
+    val_loss = 0
+    correct = 0
+    total = 0
+    num_batches = 0
+    with torch.no_grad():  # Important: no gradient computation in validation
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            val_loss += criterion(output, target).item()  # Don't scale by batch size
+            _, predicted = output.max(1)
+            total += target.size(0)
+            correct += predicted.eq(target).sum().item()
+            num_batches += 1
+    # Average the loss by number of batches and accuracy by total samples
+    val_loss = val_loss / num_batches  # Average loss across batches
+    val_acc = 100. * correct / total
+    return val_loss, val_acc
+async def train(model, config, websocket=None):
+    print("\nStarting training...")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Using device: {device}")
+    model = model.to(device)
+    # Create data directory if it doesn't exist
+    data_dir = Path("data")
+    data_dir.mkdir(exist_ok=True)
+    # Ensure data is downloaded and extracted
+    print("Preparing dataset...")
+    download_and_extract_mnist_data()
+    # Paths to the extracted files
+    train_images_path = "data/MNIST/raw/train-images-idx3-ubyte"
+    train_labels_path = "data/MNIST/raw/train-labels-idx1-ubyte"
+    test_images_path = "data/MNIST/raw/t10k-images-idx3-ubyte"
+    test_labels_path = "data/MNIST/raw/t10k-labels-idx1-ubyte"
+    # Data loading
+    transform = transforms.Compose([
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    train_dataset = CustomMNISTDataset(train_images_path, train_labels_path, transform=transform)
+    test_dataset = CustomMNISTDataset(test_images_path, test_labels_path, transform=transform)
+    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
+    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)
+    print(f"Dataset loaded. Training samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
+    # Initialize optimizer based on config
+    if config.optimizer.lower() == 'adam':
+        optimizer = optim.Adam(model.parameters())
+    else:
+        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
+    criterion = nn.CrossEntropyLoss()
+    print("\nTraining Configuration:")
+    print(f"Optimizer: {config.optimizer}")
+    print(f"Batch Size: {config.batch_size}")
+    print(f"Network Architecture: {config.block1}-{config.block2}-{config.block3}")
+    print("\nStarting training loop...")
+    best_val_acc = 0
+    history = {
+        'train_loss': [],
+        'train_acc': [],
+        'val_loss': [],
+        'val_acc': []
+    }
+    try:
+        for epoch in range(config.epochs):
+            model.train()
+            total_loss = 0
+            correct = 0
+            total = 0
+            # Create progress bar for each epoch
+            progress_bar = tqdm(
+                train_loader,
+                desc=f"Epoch {epoch+1}/{config.epochs}",
+                unit='batch',
+                leave=True
+            )
+            for batch_idx, (data, target) in enumerate(progress_bar):
+                data, target = data.to(device), target.to(device)
+                optimizer.zero_grad()
+                output = model(data)
+                loss = criterion(output, target)
+                loss.backward()
+                optimizer.step()
+                # Calculate batch accuracy
+                pred = output.argmax(dim=1, keepdim=True)
+                correct += pred.eq(target.view_as(pred)).sum().item()
+                total += target.size(0)
+                total_loss += loss.item()
+                # Calculate current metrics
+                current_loss = total_loss / (batch_idx + 1)
+                current_acc = 100. * correct / total
+                # Update progress bar description
+                progress_bar.set_postfix({
+                    'loss': f'{current_loss:.4f}',
+                    'acc': f'{current_acc:.2f}%'
+                })
+                # Send training update through websocket
+                if websocket:
+                    try:
+                        await websocket.send_json({
+                            'type': 'training_update',
+                            'data': {
+                                'step': batch_idx + epoch * len(train_loader),
+                                'train_loss': current_loss,
+                                'train_acc': current_acc
+                            }
+                        })
+                    except Exception as e:
+                        print(f"Error sending websocket update: {e}")
+            # Calculate epoch metrics
+            train_loss = total_loss / len(train_loader)
+            train_acc = 100. * correct / total
+            # Validation phase
+            model.eval()
+            val_loss = 0
+            val_correct = 0
+            val_total = 0
+            print("\nRunning validation...")
+            with torch.no_grad():
+                for data, target in test_loader:
+                    data, target = data.to(device), target.to(device)
+                    output = model(data)
+                    val_loss += criterion(output, target).item()
+                    pred = output.argmax(dim=1, keepdim=True)
+                    val_correct += pred.eq(target.view_as(pred)).sum().item()
+                    val_total += target.size(0)
+            val_loss /= len(test_loader)
+            val_acc = 100. * val_correct / val_total
+            # Print epoch results
+            print(f"\nEpoch {epoch+1}/{config.epochs} Results:")
+            print(f"Training Loss: {train_loss:.4f} | Training Accuracy: {train_acc:.2f}%")
+            print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.2f}%")
+            # Send validation update through websocket
+            if websocket:
+                try:
+                    await websocket.send_json({
+                        'type': 'validation_update',
+                        'data': {
+                            'step': (epoch + 1) * len(train_loader),
+                            'val_loss': val_loss,
+                            'val_acc': val_acc
+                        }
+                    })
+                except Exception as e:
+                    print(f"Error sending websocket update: {e}")
+            # Save best model
+            if val_acc > best_val_acc:
+                best_val_acc = val_acc
+                print(f"\nNew best validation accuracy: {val_acc:.2f}%")
+                print("Saving model...")
+                torch.save(model.state_dict(), 'best_model.pth')
+    except Exception as e:
+        print(f"\nError during training: {e}")
+        raise e
+    print("\nTraining completed!")
+    print(f"Best validation accuracy: {best_val_acc:.2f}%")
+    return history