# Upgraded to use TensorFlow 2.x and the Keras API. Provides multiple visualizations of the weights.
# Ready to copy, paste, and run in Colab using TensorFlow 2.x.
"""
Corrected and upgraded by Martial Terran, from
https://github.com/spiderPan/Google-Machine-Learning-Crash-Course/blob/master/multi-class_classfication_of_handwritten_digits.py

The architecture of this model is not a CNN (Convolutional Neural Network).
It is a Dense Neural Network (DNN), also commonly known as a Multilayer Perceptron (MLP).
Let's break down why and look at the specific architecture.
Why it's a DNN and Not a CNN
The defining characteristic of a CNN is its use of convolutional layers (Conv2D). These layers are specifically designed to work with grid-like data, such as images. They use filters (or kernels) to slide across the input image, detecting spatial patterns like edges, textures, and shapes.
This model does not use any convolutional layers. Instead, its core components are Dense layers (tf.keras.layers.Dense).
DNN Approach: The 28x28 pixel image is flattened into a single vector of 784 numbers. The Dense layers treat these numbers as a simple list, with no inherent understanding that pixel #29 is directly below pixel #1. It learns patterns from the pixel values themselves, but loses all the spatial relationships between them.
CNN Approach: A CNN would take the input as a 2D grid (e.g., shape=(28, 28, 1)) and use Conv2D layers to analyze neighboring pixels, preserving the spatial structure of the image.
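For contrast, a minimal CNN version of this classifier (illustrative only, not
part of this script) would keep the 2D structure instead of flattening it:

    cnn = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(28, 28, 1)),
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
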
The Specific Architecture of this Model
You can see the exact architecture from the code or by printing the model's summary (model.summary()). 
Based on the code with hidden_units = [100, 100], the architecture is as follows:
Layer 1 - Input:   a flat vector of 784 pixel values (28x28).
                   Output shape: (None, 784)
Layer 2 - Dense:   first fully-connected hidden layer; every one of its 100
                   neurons is connected to all 784 input pixels.
                   Output shape: (None, 100)
Layer 3 - Dense:   second fully-connected hidden layer; every one of its 100
                   neurons is connected to all 100 neurons before it.
                   Output shape: (None, 100)
Layer 4 - Dropout: regularization layer; randomly sets 20% of neuron
                   activations to zero during training to prevent overfitting.
                   Output shape: (None, 100)
Layer 5 - Dense:   final output layer with 10 neurons, one for each class
                   (digits 0-9); its softmax activation converts the outputs
                   into a probability distribution.
                   Output shape: (None, 10)
(Note: "None" in the output shape refers to the batch size, which can vary.)
In summary:
It's a DNN/MLP: It uses stacked Dense (fully-connected) layers.
It's not a CNN: It lacks Conv2D and MaxPooling2D layers, and it flattens the image data, discarding the crucial 2D spatial information that CNNs are built to exploit.
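
The Param # column in the model summary below can be verified by hand as
(inputs x units) + biases for each Dense layer:
    dense_3: 784*100 + 100 = 78,500
    dense_4: 100*100 + 100 = 10,100
    dense_5: 100*10  + 10  =  1,010
    total                  = 89,610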

Model Summary:
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
β”‚ dense_3 (Dense)                 β”‚ (None, 100)            β”‚        78,500 β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ dense_4 (Dense)                 β”‚ (None, 100)            β”‚        10,100 β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ dropout_1 (Dropout)             β”‚ (None, 100)            β”‚             0 β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ dense_5 (Dense)                 β”‚ (None, 10)             β”‚         1,010 β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
 Total params: 89,610 (350.04 KB)
 Trainable params: 89,610 (350.04 KB)
 Non-trainable params: 0 (0.00 B)

Final accuracy (on validation data): 0.96

Evaluating on test data...
Accuracy on test data: 0.96

"""

import math

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn import metrics

# Set pandas display options
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format


def parse_labels_and_features(dataset):
    """Parses a dataset into features and labels.

    Args:
      dataset: A Pandas DataFrame with the first column being the label
               and the remaining columns as pixel data.
    Returns:
      A tuple of (labels, features), where both are Pandas Series/DataFrame.
    """
    labels = dataset[0]
    # The remaining 784 columns are the pixel values.
    features = dataset.loc[:, 1:784]
    # Normalize the feature values to be in the range [0, 1].
    features = features / 255
    return labels, features
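
# Quick illustrative sanity check of parse_labels_and_features (a minimal
# sketch using a fabricated one-row frame in the same layout as the MNIST
# CSVs loaded below).
_demo = pd.DataFrame([[7] + [128] * 784])
_demo_labels, _demo_features = parse_labels_and_features(_demo)
assert _demo_labels.iloc[0] == 7
assert _demo_features.shape == (1, 784)
assert np.isclose(_demo_features.iloc[0, 0], 128 / 255)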


def create_and_train_nn_model(
        learning_rate,
        epochs,
        batch_size,
        hidden_units,
        training_examples,
        training_targets,
        validation_examples,
        validation_targets):
    """
    Creates, trains, and evaluates a Deep Neural Network model using tf.keras.

    Args:
        learning_rate: The learning rate for the optimizer.
        epochs: The number of times to iterate through the training data.
        batch_size: The number of examples to use in each training step.
        hidden_units: A list of integers, where each integer is the number of nodes
                      in a hidden layer.
        training_examples: DataFrame of training features.
        training_targets: Series of training labels.
        validation_examples: DataFrame of validation features.
        validation_targets: Series of validation labels.

    Returns:
        The trained tf.keras.Model object and the training history.
    """
    # 1. Define the model architecture
    model = tf.keras.models.Sequential()

    # Input layer (no feature columns needed for dense input).
    # `shape` replaces the deprecated `input_shape` argument in Keras 3.
    model.add(tf.keras.layers.InputLayer(shape=(784,)))

    # Add hidden layers
    for units in hidden_units:
        model.add(tf.keras.layers.Dense(units, activation='relu'))

    # Add a dropout layer for regularization to prevent overfitting
    model.add(tf.keras.layers.Dropout(0.2))

    # Output layer with 10 units for 10 classes (0-9) and softmax activation
    model.add(tf.keras.layers.Dense(10, activation='softmax'))

    # 2. Compile the model.
    # sparse_categorical_crossentropy expects integer class labels (0-9),
    # so the targets do not need to be one-hot encoded.
    model.compile(
        optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate),
        loss="sparse_categorical_crossentropy",
        metrics=['accuracy']
    )
    
    # Print a summary of the model
    print("Model Summary:")
    model.summary()
    print("\nTraining Model...")

    # 3. Train the model
    history = model.fit(
        x=training_examples.values,
        y=training_targets.values,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        validation_data=(validation_examples.values, validation_targets.values),
        # Suppress verbose logs, show one line per epoch
        verbose=2
    )
    print("Model training finished.")

    # 4. Plot the results
    training_loss = history.history["loss"]
    validation_loss = history.history["val_loss"]
    epochs_range = range(1, epochs + 1)

    plt.figure(figsize=(10, 5))
    plt.ylabel("Loss (Sparse Categorical Crossentropy)")
    plt.xlabel("Epochs")
    plt.title("Loss vs. Epochs")
    plt.plot(epochs_range, training_loss, label="Training")
    plt.plot(epochs_range, validation_loss, label="Validation")
    plt.legend()
    plt.show()

    # 5. Show a confusion matrix
    # Get predictions for the validation set
    validation_probabilities = model.predict(validation_examples.values)
    validation_predictions = np.argmax(validation_probabilities, axis=1)

    cm = metrics.confusion_matrix(validation_targets, validation_predictions)
    # Normalize each row by its total so every cell shows the fraction of a
    # true class that was predicted as each label.
    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(8, 8))
    ax = sns.heatmap(cm_normalized, cmap="bone_r", annot=True, fmt=".2f")
    ax.set_aspect(1)
    plt.title("Confusion Matrix")
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()

    # Print final validation accuracy from the last epoch
    final_validation_accuracy = history.history["val_accuracy"][-1]
    print(f"Final accuracy (on validation data): {final_validation_accuracy:.2f}")

    return model, history


# --- Main Execution ---

# Load the datasets
mnist_dataframe = pd.read_csv('sample_data/mnist_train_small.csv', sep=",", header=None)
mnist_test_dataframe = pd.read_csv('sample_data/mnist_test.csv', sep=',', header=None)

# Shuffle and select a subset of the training data
mnist_dataframe = mnist_dataframe.head(10000)
mnist_dataframe = mnist_dataframe.reindex(np.random.permutation(mnist_dataframe.index))
display(mnist_dataframe.head())

# Parse features and labels
training_targets, training_examples = parse_labels_and_features(mnist_dataframe[:7500])
validation_targets, validation_examples = parse_labels_and_features(mnist_dataframe[7500:10000])
testing_targets, testing_examples = parse_labels_and_features(mnist_test_dataframe)

display(training_examples.describe())
display(validation_examples.describe())

# Show a random example from the training set
rand_example_idx = np.random.choice(training_examples.index)
_, ax = plt.subplots()
ax.matshow(training_examples.loc[rand_example_idx].values.reshape(28, 28))
ax.set_title(f"Label: {training_targets.loc[rand_example_idx]}")
ax.grid(False)
plt.show()

# Define hyperparameters
# The original script used `steps=1000`, `batch_size=30`, `periods=10`.
# With 7500 training examples, one epoch is 7500/30 = 250 steps, so the
# original 1000 steps correspond to 1000/250 = 4 epochs. We train for 25
# epochs here, well beyond that, to give the loss curves room to flatten out.
LEARNING_RATE = 0.05
EPOCHS = 25
BATCH_SIZE = 30
HIDDEN_UNITS = [100, 100]

# Train the model
trained_model, history = create_and_train_nn_model(
    learning_rate=LEARNING_RATE,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    hidden_units=HIDDEN_UNITS,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets
)

# Evaluate the model on the test data
print("\nEvaluating on test data...")
loss, accuracy = trained_model.evaluate(testing_examples.values, testing_targets.values, verbose=0)
print(f"Accuracy on test data: {accuracy:.2f}")

# Visualize the weights of the first hidden layer
print("\nVisualizing weights of the first hidden layer...")
# The first Dense layer is at index 0 in the model.layers list
# get_weights() returns a list [kernel, bias], we need the kernel [0]
weights0 = trained_model.layers[0].get_weights()[0]
print('Weights 0 shape:', weights0.shape)

num_nodes = weights0.shape[1]
num_rows = int(math.ceil(num_nodes / 10.0))
fig, axes = plt.subplots(num_rows, 10, figsize=(20, 2 * num_rows))
for coef, ax in zip(weights0.T, axes.ravel()):
    # Weights are reshaped from (784,) to (28, 28) for visualization.
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.pink)
    ax.set_xticks(())
    ax.set_yticks(())

plt.suptitle("First Hidden Layer Weights", fontsize=20)
plt.show()
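
# A further weight visualization (an additional sketch, not in the original
# script): heatmap of the output layer's 100x10 weight matrix, showing how
# each second-hidden-layer neuron votes for each digit class.
weights_out = trained_model.layers[-1].get_weights()[0]  # shape (100, 10)
plt.figure(figsize=(6, 10))
sns.heatmap(weights_out, cmap="coolwarm", center=0)
plt.title("Output Layer Weights (hidden unit x digit class)")
plt.xlabel("Digit Class")
plt.ylabel("Hidden Unit")
plt.show()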