import matplotlib.pyplot as plt
1. The Perceptron: A Single Linear Neuron
The Perceptron is the simplest form of a neural network: a single neuron that takes a vector of inputs, applies weights and a bias, and passes the result through a step function to produce a binary output. It can only solve linearly separable problems.
The formula is: \(y = f(\sum_{i} w_i x_i + b)\), where \(f\) is a step function.
\(f(x) = \begin{cases} 1 & \text{if } x \geq 0 \\ 0 & \text{if } x < 0 \end{cases}\)
import numpy as np

class Perceptron:
    """A simple Perceptron classifier."""

    def __init__(self, learning_rate=0.1, n_iters=100):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.activation_func = self._step_function
        self.weights = None
        self.bias = None

    def _step_function(self, x):
        return np.where(x >= 0, 1, 0)

    def fit(self, X, y):
        print('Beginning to fit')
        n_samples, n_features = X.shape

        # Initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.n_iters):
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self.activation_func(linear_output)

                # Perceptron update rule
                update = self.lr * (y[idx] - y_predicted)
                self.weights += update * x_i
                self.bias += update
            if i % 10 == 0:
                print(i)

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        return self.activation_func(linear_output)
    def show(self):
        fig, ax = plt.subplots(figsize=(4, 2))
        ax.axis('off')

        # Input layer (2 inputs)
        ax.add_patch(plt.Circle((0.5, 1), 0.1, color='skyblue', ec='black'))
        ax.text(0.3, 1, "$x_1$", fontsize=12)
        ax.add_patch(plt.Circle((0.5, 0.5), 0.1, color='skyblue', ec='black'))
        ax.text(0.3, 0.5, "$x_2$", fontsize=12)

        # Output neuron (raw string avoids the invalid '\h' escape warning)
        ax.add_patch(plt.Circle((2, 0.75), 0.12, color='salmon', ec='black'))
        ax.text(2.2, 0.75, r"$\hat{y}$", fontsize=12)

        # Arrows
        ax.annotate("", xy=(1.88, 0.75), xytext=(0.6, 1), arrowprops=dict(arrowstyle='->'))
        ax.annotate("", xy=(1.88, 0.75), xytext=(0.6, 0.5), arrowprops=dict(arrowstyle='->'))

        ax.set_title("Perceptron Architecture", fontsize=14)
        plt.xlim(0, 2.5)
        plt.ylim(0.2, 1.3)
        plt.tight_layout()
        plt.show()
model = Perceptron()
model.show()

# Generate data
x_vals = np.linspace(-5, 5, 500)
y_step = model._step_function(x_vals)

# Create the plot
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(x_vals, y_step, label='Step Function')
plt.title('Step Activation Function')
plt.xlabel('Input value (x)')
plt.ylabel('Output value')
plt.ylim(-0.1, 1.1)
plt.legend()

print("Step Function: Outputs 0 for negative input, 1 for positive. Used in the original Perceptron.")
Step Function: Outputs 0 for negative input, 1 for positive. Used in the original Perceptron.
Verification: Perceptron Fails for XOR Gate
The XOR (exclusive OR) gate is a classic example of a non-linearly separable problem. A single straight line cannot separate the (0,1) and (1,0) points from the (0,0) and (1,1) points.
# XOR problem data
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

plt.figure(figsize=(5, 5))
for label in np.unique(y_xor):
    plt.scatter(
        X_xor[y_xor == label, 0],
        X_xor[y_xor == label, 1],
        label=f"Class {label}",
        edgecolor='k',
        s=100
    )
plt.title("XOR Dataset")
plt.xlabel("x1")
plt.ylabel("x2")
plt.xticks([0, 1])
plt.yticks([0, 1])
plt.grid(True)
plt.legend()
plt.axis('equal')
plt.show()

# Train the perceptron
perceptron = Perceptron(learning_rate=0.1, n_iters=100)
perceptron.fit(X_xor, y_xor)
Beginning to fit
0
10
20
30
40
50
60
70
80
90
print("\n=== Perceptron Model Structure ===")
print(f"Number of layers: 1 (no hidden layer)")
print(f"Weights shape: {perceptron.weights.shape}")
print(f"Bias: {perceptron.bias}")
=== Perceptron Model Structure ===
Number of layers: 1 (no hidden layer)
Weights shape: (2,)
Bias: 0.0
# Get predictions
predictions = perceptron.predict(X_xor)

print(f"XOR Input:\n{X_xor}")
print(f"Expected Output: {y_xor}")
print(f"Perceptron Output: {predictions}")

accuracy = np.sum(y_xor == predictions) / len(y_xor)
print(f"Accuracy: {accuracy * 100}%")
print("\nAs you can see, the single-layer Perceptron cannot learn the XOR function.")
XOR Input:
[[0 0]
[0 1]
[1 0]
[1 1]]
Expected Output: [0 1 1 0]
Perceptron Output: [1 1 0 0]
Accuracy: 50.0%
As you can see, the single-layer Perceptron cannot learn the XOR function.
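For contrast, the same Perceptron handles a linearly separable gate such as AND without trouble. The short check below is an addition for illustration, reusing the Perceptron class defined above.

# Sanity check (illustrative): AND is linearly separable, so the Perceptron converges
X_and = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_and = np.array([0, 0, 0, 1])

p_and = Perceptron(learning_rate=0.1, n_iters=20)
p_and.fit(X_and, y_and)
print(p_and.predict(X_and))  # converges to [0 0 0 1]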
2. Multilayer Perceptron (MLP) for XOR
To solve non-linear problems like XOR, we need to add a hidden layer. This is a Multilayer Perceptron (MLP). The hidden layer allows the network to learn non-linear combinations of the inputs. We also switch to a smooth activation function like the Sigmoid function to enable gradient-based learning via backpropagation.
Mathematically, the sigmoid function is \(\sigma(x) = \frac{1}{1 + e^{-x}}\), with derivative \(\sigma'(x) = \sigma(x)(1 - \sigma(x))\). Note that the sigmoid_derivative helper below takes the sigmoid output as its argument, which is why it is written as \(x(1 - x)\).
# Activation function and its derivative
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    # Expects x to already be a sigmoid output: sigma'(z) = sigma(z) * (1 - sigma(z))
    return x * (1 - x)

y_sigmoid = sigmoid(x_vals)
y_sigmoid_deriv = sigmoid_derivative(y_sigmoid)

plt.figure(figsize=(6, 6))
plt.plot(x_vals, y_sigmoid, label='Sigmoid')
plt.plot(x_vals, y_sigmoid_deriv, label='Sigmoid Derivative', linestyle='--')
plt.title('Sigmoid Function and its Derivative')
plt.xlabel('Input value (x)')
plt.ylabel('Output value')
plt.legend()
def draw_mlp_architecture(input_size, hidden_size, output_size):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.axis('off')

    # Circle radius
    r = 0.1

    # Layer x-positions
    x_input = 0.5
    x_hidden = 2
    x_output = 3.5

    # Draw input layer
    for i in range(input_size):
        y = 1.5 - i * 0.75
        ax.add_patch(plt.Circle((x_input, y), r, color='skyblue', ec='black'))
        ax.text(x_input - 0.3, y, f"$x_{i+1}$", fontsize=12)

    # Draw hidden layer
    for j in range(hidden_size):
        y = 1.5 - j * 0.75
        ax.add_patch(plt.Circle((x_hidden, y), r, color='lightgreen', ec='black'))
        ax.text(x_hidden, y, f"$h_{j+1}$", fontsize=12, ha='center', va='center')

    # Draw output layer
    for k in range(output_size):
        y = 0.75  # Always one output neuron here
        ax.add_patch(plt.Circle((x_output, y), r, color='salmon', ec='black'))
        ax.text(x_output + 0.2, y, "$\\hat{y}$", fontsize=12)

    # Arrows from input to hidden
    for i in range(input_size):
        y1 = 1.5 - i * 0.75
        for j in range(hidden_size):
            y2 = 1.5 - j * 0.75
            ax.annotate("", xy=(x_hidden - r, y2), xytext=(x_input + r, y1),
                        arrowprops=dict(arrowstyle='->', lw=1))

    # Arrows from hidden to output
    for j in range(hidden_size):
        y2 = 1.5 - j * 0.75
        y_out = 0.75
        ax.annotate("", xy=(x_output - r, y_out), xytext=(x_hidden + r, y2),
                    arrowprops=dict(arrowstyle='->', lw=1))

    ax.set_title("MLP Architecture for XOR", fontsize=14)
    plt.xlim(0, 4)
    plt.ylim(-0.5, 2.0)
    plt.tight_layout()
    plt.show()
import numpy as np

class MLP_XOR:
    def __init__(self, input_size=2, hidden_size=2, output_size=1):
        # Initialize weights randomly to break symmetry
        self.weights_hidden = np.random.uniform(size=(input_size, hidden_size))
        self.weights_output = np.random.uniform(size=(hidden_size, output_size))
        # Biases can be initialized to zero or randomly
        self.bias_hidden = np.random.uniform(size=(1, hidden_size))
        self.bias_output = np.random.uniform(size=(1, output_size))

    def forward(self, X):
        # Forward propagation
        self.hidden_activation = sigmoid(np.dot(X, self.weights_hidden) + self.bias_hidden)
        self.output = sigmoid(np.dot(self.hidden_activation, self.weights_output) + self.bias_output)
        return self.output

    def backward(self, X, y, output, lr):
        # error
        output_error = y - output
        output_delta = output_error * sigmoid_derivative(output)

        hidden_error = output_delta.dot(self.weights_output.T)
        hidden_delta = hidden_error * sigmoid_derivative(self.hidden_activation)

        # Update weights and biases
        self.weights_output += self.hidden_activation.T.dot(output_delta) * lr
        self.weights_hidden += X.T.dot(hidden_delta) * lr
        self.bias_output += np.sum(output_delta, axis=0, keepdims=True) * lr
        self.bias_hidden += np.sum(hidden_delta, axis=0, keepdims=True) * lr

    def train(self, X, y, epochs=10000, lr=0.1):
        y = y.reshape(-1, 1)  # Ensure y is a column vector
        for i in range(epochs):
            output = self.forward(X)
            self.backward(X, y, output, lr)
            if (i % 1000) == 0:
                loss = np.mean(np.square(y - output))
                print(f"Epoch {i} Loss: {loss:.4f}")

    def predict(self, X):
        return (self.forward(X) > 0.5).astype(int)

    def show(self):
        draw_mlp_architecture(
            input_size=self.weights_hidden.shape[0],
            hidden_size=self.weights_hidden.shape[1],
            output_size=self.weights_output.shape[1]
        )

mlp = MLP_XOR()
mlp.show()
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

mlp_xor = MLP_XOR()
mlp_xor.train(X_xor, y_xor)
Epoch 0 Loss: 0.3274
Epoch 1000 Loss: 0.2498
Epoch 2000 Loss: 0.2479
Epoch 3000 Loss: 0.2306
Epoch 4000 Loss: 0.1784
Epoch 5000 Loss: 0.0817
Epoch 6000 Loss: 0.0219
Epoch 7000 Loss: 0.0104
Epoch 8000 Loss: 0.0065
Epoch 9000 Loss: 0.0046
predictions = mlp_xor.predict(X_xor)

print("\n--- MLP for XOR Results ---")
print(f"Expected Output: {y_xor}")
print(f"MLP Final Output: {predictions.flatten()}")

accuracy = np.sum(y_xor == predictions.flatten()) / len(y_xor)
print(f"Accuracy: {accuracy * 100}%")
print("\nSuccess! The MLP with a hidden layer correctly learns the XOR function.")
--- MLP for XOR Results ---
Expected Output: [0 1 1 0]
MLP Final Output: [0 1 1 0]
Accuracy: 100.0%
Success! The MLP with a hidden layer correctly learns the XOR function.
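To visualize what the hidden layer buys us, the following sketch (an addition, not part of the original notebook) evaluates the trained mlp_xor on a grid over the unit square and shades its decision regions; the two diagonal classes end up in separate regions, which no single straight line could achieve.

# Sketch: decision regions of the trained MLP on the unit square (illustrative only)
xx, yy = np.meshgrid(np.linspace(-0.25, 1.25, 200), np.linspace(-0.25, 1.25, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
zz = mlp_xor.predict(grid).reshape(xx.shape)

plt.figure(figsize=(5, 5))
plt.contourf(xx, yy, zz, alpha=0.3, cmap='coolwarm')
plt.scatter(X_xor[:, 0], X_xor[:, 1], c=y_xor, cmap='coolwarm', edgecolor='k', s=100)
plt.title("MLP Decision Regions for XOR")
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()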
3. Simple Neural Network for MNIST from Scratch
Now, we’ll scale up to a more complex problem: classifying handwritten digits from the MNIST dataset. We will build everything from scratch.
- Architecture: Input Layer (784 neurons) -> Hidden Layer (128 neurons, ReLU activation) -> Output Layer (10 neurons, Softmax activation)
- Loss Function: Categorical Cross-Entropy
- Optimizer: Stochastic Gradient Descent (SGD)
Note: We use torchvision for convenience to download and load the dataset, but all network logic is pure NumPy.
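As a quick orientation (added here for illustration, not from the original notebook), the stated architecture carries roughly one hundred thousand parameters, which is easy to verify by hand:

# Parameter count for the 784 -> 128 -> 10 architecture described above
n_input, n_hidden, n_output = 784, 128, 10
n_params = (n_input * n_hidden + n_hidden) + (n_hidden * n_output + n_output)
print(n_params)  # 101770 weights and biases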
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets
import torchvision.transforms as transforms
from tqdm import tqdm
transform = transforms.ToTensor()
train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False, download=True, transform=transform)

len(train_data), len(test_data)
(60000, 10000)
# Convert to NumPy, flatten the images, and normalize to [0, 1]
print(train_data.data.numpy().shape)

X_train = train_data.data.numpy().reshape(len(train_data), -1) / 255.0
y_train_raw = train_data.targets.numpy()

X_test = test_data.data.numpy().reshape(len(test_data), -1) / 255.0
y_test_raw = test_data.targets.numpy()

X_train.shape
(60000, 28, 28)
(60000, 784)
# One-hot encode labels
def one_hot(y, num_classes):
    return np.eye(num_classes)[y]

# Demonstrating one-hot encoding
label = 7
batch_of_labels = np.array([3, 0, 9, 1])
num_classes = 10

one_hot_label = one_hot(label, num_classes)
one_hot_batch = one_hot(batch_of_labels, num_classes)
print(f"Original label: {label}")
print(f"One-hot vector: {one_hot_label}\n")
print(f"Original batch: {batch_of_labels}")
print(f"One-hot batch:\n{one_hot_batch}")
Original label: 7
One-hot vector: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Original batch: [3 0 9 1]
One-hot batch:
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
y_train = one_hot(y_train_raw, 10)
y_test = one_hot(y_test_raw, 10)

y_train[:2, :], y_test[:2, :]
(array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]]))
print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
Training data shape: (60000, 784)
Training labels shape: (60000, 10)
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return np.where(x > 0, 1, 0)

y_relu = relu(x_vals)
y_relu_deriv = relu_derivative(x_vals)

plt.plot(x_vals, y_relu, label='ReLU')
plt.plot(x_vals, y_relu_deriv, label='ReLU Derivative', linestyle='--')
plt.title('ReLU Function and its Derivative')
plt.xlabel('Input value (x)')
plt.ylabel('Output value')
plt.legend()
def softmax(x):
    # Subtract the row-wise max for numerical stability
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

# Extra bracket for the batch dimension
logits = np.array([[2.0, 1.0, 0.1, 3.0, -1.0]])

probabilities = softmax(logits)

# Flatten to plot
probabilities_flatten = probabilities.flatten()

print(f"Original Logits: {logits.flatten()}")
print(f"Probabilities after Softmax: {np.round(probabilities, 3)}")
print(f"Sum of probabilities: {np.sum(probabilities):.2f}")

class_indices = [f'Class {i}' for i in range(len(probabilities_flatten))]

plt.bar(class_indices, probabilities_flatten)
plt.title('Softmax Function Output')
plt.xlabel('Class')
plt.ylabel('Probability')
plt.ylim(0, 1)
print("Softmax Function: Converts raw scores (logits) into a probability distribution. The class with the highest logit gets the highest probability.")
Original Logits: [ 2. 1. 0.1 3. -1. ]
Probabilities after Softmax: [[0.233 0.086 0.035 0.634 0.012]]
Sum of probabilities: 1.00
Softmax Function: Converts raw scores (logits) into a probability distribution. The class with the highest logit gets the highest probability.
# The penalty should grow sharply as the model gets more confident and wrong
def cross_entropy_loss(y_pred, y_true):
    # y_true contains one-hot labels for the entire batch
    # Clip to avoid log(0)
    y_pred_clipped = np.clip(y_pred, 1e-12, 1. - 1e-12)

    # y_true is one-hot (0 or 1), so only the correct class contributes;
    # divide by the batch size to average the loss per sample
    return -np.sum(y_true * np.log(y_pred_clipped)) / y_true.shape[0]

y_true = np.array([[0, 0, 1, 0]])

predicted_probs_for_correct_class = np.linspace(0.01, 0.99, 200)

# Basic curve
losses_curve = [-np.log(p) for p in predicted_probs_for_correct_class]

plt.style.use('seaborn-v0_8-whitegrid')
plt.figure(figsize=(10, 6))
plt.plot(predicted_probs_for_correct_class, losses_curve, color='royalblue', label='Loss Curve')

# 3 key cases
cases = {
    'A': 0.95,  # High confidence, correct
    'B': 0.50,  # Medium confidence
    'C': 0.05   # Low confidence, wrong
}
colors = {'A': 'green', 'B': 'orange', 'C': 'red'}
print("--- Predictions and Losses for 3 Cases ---\n")
for case, prob in cases.items():
    remaining_prob = (1 - prob) / 3

    # Spread the remaining probability evenly over the other 3 classes
    y_pred = np.array([remaining_prob, remaining_prob, prob, remaining_prob])
    loss = cross_entropy_loss(y_pred.reshape(1, -1), y_true)

    print(f"--- Case {case} ---")
    print(f"Prediction Vector (y_pred): {np.round(y_pred, 4)}")
    print(f"Corresponding Loss: {loss:.4f}\n")

    # Mark the points
    plt.plot(prob, loss, 'o', color=colors[case], markersize=10, label=f'Case {case}')

plt.title('Cross-Entropy Loss Curve', fontsize=16)
plt.xlabel('Predicted Probability for the Correct Class', fontsize=12)
plt.ylabel('Calculated Loss', fontsize=12)
plt.legend()
plt.grid(True)
plt.ylim(0, 5)
plt.show()
--- Predictions and Losses for 3 Cases ---
--- Case A ---
Prediction Vector (y_pred): [0.0167 0.0167 0.95 0.0167]
Corresponding Loss: 0.0513
--- Case B ---
Prediction Vector (y_pred): [0.1667 0.1667 0.5 0.1667]
Corresponding Loss: 0.6931
--- Case C ---
Prediction Vector (y_pred): [0.3167 0.3167 0.05 0.3167]
Corresponding Loss: 2.9957
A simple computation graph helps in understanding how the gradient formulas below are derived. For a full derivation of backpropagation with the cross-entropy loss, see: https://medium.com/data-science/deriving-backpropagation-with-cross-entropy-loss-d24811edeaf9
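The key simplification used in the backward pass below is that, for softmax combined with cross-entropy, the gradient with respect to the output pre-activations is simply A2 - y_true. The following small numerical check is an addition (it reuses the softmax, cross_entropy_loss, and one_hot helpers defined above) and compares that analytic gradient to a central-difference estimate:

# Numerical sketch: verify dL/dZ = softmax(Z) - y for a single sample
rng = np.random.default_rng(0)
Z = rng.normal(size=(1, 10))          # fake pre-activations for one sample
y = one_hot(np.array([3]), 10)        # one-hot target, shape (1, 10)

analytic = softmax(Z) - y             # the simplified gradient

numeric = np.zeros_like(Z)
eps = 1e-6
for j in range(Z.shape[1]):
    Zp, Zm = Z.copy(), Z.copy()
    Zp[0, j] += eps
    Zm[0, j] -= eps
    numeric[0, j] = (cross_entropy_loss(softmax(Zp), y) - cross_entropy_loss(softmax(Zm), y)) / (2 * eps)

print(np.max(np.abs(analytic - numeric)))  # should be close to zero (numerical noise only)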
class SimpleNN_MNIST:
    def __init__(self, input_size, hidden_size, output_size):
        # He initialization for weights
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2. / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2. / hidden_size)
        self.b2 = np.zeros((1, output_size))

    def forward(self, X):
        # Store intermediate values for backpropagation
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = relu(self.Z1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = softmax(self.Z2)
        return self.A2

    def backward(self, X, y_true):
        # Number of samples in the batch
        m = y_true.shape[0]

        # -----------------------------------
        # Output Layer Gradients
        # -----------------------------------
        # Gradient of the loss with respect to Z2 (pre-activation of the output layer)
        # Since we're using Softmax + Cross-Entropy Loss, the gradient simplifies to:
        # dZ2 = A2 - y_true
        # A2 is the output from softmax, y_true is one-hot encoded ground truth
        dZ2 = self.A2 - y_true

        # Gradient of the loss with respect to W2 (weights between hidden and output layers)
        # Using the chain rule: dW2 = (A1^T @ dZ2) / m
        # A1: activations from hidden layer, shape (m, hidden_dim)
        # dZ2: error term for output layer, shape (m, output_dim)
        # A1.T @ dZ2 results in shape (hidden_dim, output_dim)
        self.dW2 = (self.A1.T @ dZ2) / m

        # Gradient of the loss with respect to b2 (bias of the output layer)
        # Sum over the batch dimension to get bias gradient: shape (1, output_dim)
        self.db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        # Backpropagating the error to the hidden layer
        # dA1 = dZ2 @ W2^T
        # W2.T: shape (output_dim, hidden_dim)
        # dZ2: shape (m, output_dim)
        # dA1: shape (m, hidden_dim), error signal for hidden layer outputs (A1)
        dA1 = dZ2 @ self.W2.T

        # Applying the derivative of the ReLU activation function
        # ReLU'(Z1) is 1 where Z1 > 0, else 0
        # Element-wise multiply with dA1 to get dZ1 (gradient wrt pre-activation of hidden layer)
        dZ1 = dA1 * relu_derivative(self.Z1)
        self.dW1 = (X.T @ dZ1) / m
        self.db1 = np.sum(dZ1, axis=0, keepdims=True) / m

    def update_params(self, lr):
        # Basic SGD optimizer
        self.W1 -= lr * self.dW1
        self.b1 -= lr * self.db1
        self.W2 -= lr * self.dW2
        self.b2 -= lr * self.db2

    def train(self, X_train, y_train, X_test, y_test_raw, epochs, lr, batch_size):
        history = {'loss': [], 'accuracy': []}
        num_batches = len(X_train) // batch_size

        for epoch in range(epochs):
            # Shuffle
            permutation = np.random.permutation(len(X_train))
            X_train_shuffled = X_train[permutation]
            y_train_shuffled = y_train[permutation]

            epoch_loss = 0
            for i in tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{epochs}"):
                # Create mini-batch
                start = i * batch_size
                end = start + batch_size
                X_batch = X_train_shuffled[start:end]
                y_batch = y_train_shuffled[start:end]

                y_pred = self.forward(X_batch)
                epoch_loss += cross_entropy_loss(y_pred, y_batch)
                self.backward(X_batch, y_batch)
                self.update_params(lr)

            # Calculate loss and accuracy at the end of the epoch
            avg_loss = epoch_loss / num_batches

            # Evaluate on test set
            y_pred_test = self.predict(X_test)
            accuracy = np.sum(y_pred_test == y_test_raw) / len(y_test_raw)

            history['loss'].append(avg_loss)
            history['accuracy'].append(accuracy)
            print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')

        return history

    def predict(self, X):
        y_pred_probs = self.forward(X)
        return np.argmax(y_pred_probs, axis=1)
# --- 3. Train the Network and Plot Results ---

# Hyperparameters
INPUT_SIZE = 784
HIDDEN_SIZE = 128
OUTPUT_SIZE = 10
EPOCHS = 10
LEARNING_RATE = 0.1
BATCH_SIZE = 64

scratch_nn = SimpleNN_MNIST(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
history = scratch_nn.train(X_train, y_train, X_test, y_test_raw, EPOCHS, LEARNING_RATE, BATCH_SIZE)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('From-Scratch Model Training', fontsize=16)

ax1.plot(history['loss'])
ax1.set_title('Training Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Cross-Entropy Loss')

ax2.plot(history['accuracy'])
ax2.set_title('Test Accuracy')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')

plt.show()
Epoch 1/10: 100%|██████████████████████| 937/937 [00:02<00:00, 315.13it/s]
Epoch 1 - Loss: 0.3735, Test Accuracy: 0.9359
Epoch 2/10: 100%|██████████████████████| 937/937 [00:01<00:00, 536.68it/s]
Epoch 2 - Loss: 0.2000, Test Accuracy: 0.9515
Epoch 3/10: 100%|██████████████████████| 937/937 [00:02<00:00, 428.29it/s]
Epoch 3 - Loss: 0.1509, Test Accuracy: 0.9600
Epoch 4/10: 100%|██████████████████████| 937/937 [00:01<00:00, 569.65it/s]
Epoch 4 - Loss: 0.1223, Test Accuracy: 0.9645
Epoch 5/10: 100%|██████████████████████| 937/937 [00:01<00:00, 546.63it/s]
Epoch 5 - Loss: 0.1034, Test Accuracy: 0.9675
Epoch 6/10: 100%|██████████████████████| 937/937 [00:03<00:00, 308.86it/s]
Epoch 6 - Loss: 0.0895, Test Accuracy: 0.9693
Epoch 7/10: 100%|██████████████████████| 937/937 [00:01<00:00, 546.72it/s]
Epoch 7 - Loss: 0.0784, Test Accuracy: 0.9728
Epoch 8/10: 100%|██████████████████████| 937/937 [00:01<00:00, 528.95it/s]
Epoch 8 - Loss: 0.0700, Test Accuracy: 0.9745
Epoch 9/10: 100%|██████████████████████| 937/937 [00:02<00:00, 467.59it/s]
Epoch 9 - Loss: 0.0624, Test Accuracy: 0.9752
Epoch 10/10: 100%|█████████████████████| 937/937 [00:01<00:00, 492.77it/s]
Epoch 10 - Loss: 0.0565, Test Accuracy: 0.9761
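Beyond the accuracy curve, it is reassuring to look at a few individual predictions. This small visualization is an addition (it assumes the trained scratch_nn, X_test, and y_test_raw from the cells above):

# Sketch: show a few test digits with predicted (p) vs. true (t) labels
sample_idx = np.arange(8)
preds = scratch_nn.predict(X_test[sample_idx])

fig, axes = plt.subplots(1, 8, figsize=(12, 2))
for ax, idx, pred in zip(axes, sample_idx, preds):
    ax.imshow(X_test[idx].reshape(28, 28), cmap='gray')
    ax.set_title(f"p={pred}, t={y_test_raw[idx]}", fontsize=9)
    ax.axis('off')
plt.tight_layout()
plt.show()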
4. Weight Initialization Techniques
Proper weight initialization is crucial for preventing gradients from vanishing (becoming too small) or exploding (becoming too large) during training. Here are a few common techniques implemented from scratch.
- Zeros Initialization: A bad practice that causes all neurons in a layer to learn the same thing.
- Random Normal: Breaks symmetry, but can lead to vanishing/exploding gradients if not scaled correctly.
- Xavier/Glorot Initialization: Scales weights based on the number of input neurons (n_in). Good for Tanh/Sigmoid activations. Formula: \(W \sim N(0, \sqrt{1/n_{in}})\).
- He Initialization: Also scales by n_in, with an extra factor of 2 to compensate for ReLU zeroing out half of the activations. Designed for ReLU-based activations. Formula: \(W \sim N(0, \sqrt{2/n_{in}})\). A short sketch after this list shows the effect of each scheme on activations in a deep ReLU stack.
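To make the vanishing/exploding intuition concrete, here is a rough sketch (an addition, not from the original notebook) that pushes random data through a 10-layer ReLU stack and prints how the activation scale evolves under each scheme; with the 0.01 random normal the activations collapse toward zero, while He initialization keeps them roughly constant.

# Sketch: activation std across a deep ReLU stack under different initializations
rng = np.random.default_rng(0)
x = rng.normal(size=(256, 512))

schemes = {
    "Random Normal": lambda n: rng.normal(size=(n, n)) * 0.01,
    "Xavier": lambda n: rng.normal(size=(n, n)) * np.sqrt(1.0 / n),
    "He": lambda n: rng.normal(size=(n, n)) * np.sqrt(2.0 / n),
}

for name, init in schemes.items():
    a = x
    stds = []
    for _ in range(10):
        a = np.maximum(0, a @ init(512))  # linear layer followed by ReLU
        stds.append(float(a.std()))
    print(f"{name:<13} | layer-wise activation std: {np.round(stds, 4)}")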
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Initialization
def zeros_init(n_in, n_out):
    return np.zeros((n_out, n_in))

def random_normal_init(n_in, n_out):
    return np.random.randn(n_out, n_in) * 0.01

def xavier_init(n_in, n_out):
    return np.random.randn(n_out, n_in) * np.sqrt(1.0 / n_in)

def he_init(n_in, n_out):
    return np.random.randn(n_out, n_in) * np.sqrt(2.0 / n_in)

# Plot density curves
def plot_density(weights, label, color):
    flat_weights = weights.flatten()
    density = gaussian_kde(flat_weights)
    x_vals = np.linspace(flat_weights.min(), flat_weights.max(), 200)
    plt.plot(x_vals, density(x_vals), label=label, color=color)

# Layer dimensions
n_in, n_out = 784, 128

initializations = {
    "Random Normal": (random_normal_init(n_in, n_out), 'blue'),
    "Xavier": (xavier_init(n_in, n_out), 'red'),
    "He": (he_init(n_in, n_out), 'green'),
    "Zeros": (zeros_init(n_in, n_out), 'black')
}

# Print stats and plot densities (excluding Zeros)
plt.figure(figsize=(10, 5))
for name, (weights, color) in initializations.items():
    mean, std = weights.mean(), weights.std()
    print(f"{name:<15} | Mean: {mean:>7.4f}, Std: {std:>7.4f}")
    if name != "Zeros":
        plot_density(weights, name, color)

plt.title("Weight Initialization Density (Excl. Zeros)")
plt.xlabel("Weight Value")
plt.ylabel("Density")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Plot Zeros separately
plt.figure(figsize=(5, 4))
plt.hist(initializations["Zeros"][0].flatten(), bins=10, color='gray')
plt.title("Zeros Initialization (Separate View)")
plt.xlabel("Weight Value")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()
Random Normal | Mean: 0.0000, Std: 0.0100
Xavier | Mean: 0.0000, Std: 0.0358
He | Mean: 0.0000, Std: 0.0505
Zeros | Mean: 0.0000, Std: 0.0000
5. PyTorch Verification
Let’s build the exact same network in PyTorch. This helps verify that our from-scratch implementation is correct. We will use the same architecture, hyperparameters, and optimizer.
The final accuracy should be very close to that of our NumPy model.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train_raw, dtype=torch.long)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test_raw, dtype=torch.long)

train_dataset = TensorDataset(X_train_t, y_train_t)
test_dataset = TensorDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

class PyTorchNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PyTorchNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Apply He initialization
        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

pytorch_nn = PyTorchNN(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(pytorch_nn.parameters(), lr=LEARNING_RATE)

pytorch_history = {'loss': [], 'accuracy': []}

for epoch in range(EPOCHS):
    epoch_loss = 0
    for i, (inputs, labels) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")):
        outputs = pytorch_nn(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = pytorch_nn(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = epoch_loss / len(train_loader)
    accuracy = correct / total

    pytorch_history['loss'].append(avg_loss)
    pytorch_history['accuracy'].append(accuracy)
    print(f'Epoch {epoch+1} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
Epoch 1/10: 100%|█████████████████████| 938/938 [00:00<00:00, 1080.91it/s]
Epoch 1 - Loss: 0.3656, Accuracy: 0.9382
Epoch 2/10: 100%|█████████████████████| 938/938 [00:00<00:00, 1130.23it/s]
Epoch 2 - Loss: 0.1950, Accuracy: 0.9547
Epoch 3/10: 100%|█████████████████████| 938/938 [00:00<00:00, 1095.94it/s]
Epoch 3 - Loss: 0.1472, Accuracy: 0.9615
Epoch 4/10: 100%|██████████████████████| 938/938 [00:00<00:00, 996.78it/s]
Epoch 4 - Loss: 0.1205, Accuracy: 0.9628
Epoch 5/10: 100%|██████████████████████| 938/938 [00:01<00:00, 853.66it/s]
Epoch 5 - Loss: 0.1021, Accuracy: 0.9704
Epoch 6/10: 100%|██████████████████████| 938/938 [00:01<00:00, 857.51it/s]
Epoch 6 - Loss: 0.0882, Accuracy: 0.9725
Epoch 7/10: 100%|██████████████████████| 938/938 [00:00<00:00, 979.19it/s]
Epoch 7 - Loss: 0.0776, Accuracy: 0.9717
Epoch 8/10: 100%|██████████████████████| 938/938 [00:01<00:00, 842.42it/s]
Epoch 8 - Loss: 0.0690, Accuracy: 0.9751
Epoch 9/10: 100%|█████████████████████| 938/938 [00:00<00:00, 1029.96it/s]
Epoch 9 - Loss: 0.0624, Accuracy: 0.9755
Epoch 10/10: 100%|█████████████████████| 938/938 [00:00<00:00, 951.11it/s]
Epoch 10 - Loss: 0.0562, Accuracy: 0.9775
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('From-Scratch vs PyTorch Model Comparison', fontsize=16)

ax1.plot(history['loss'], label='From Scratch')
ax1.plot(pytorch_history['loss'], label='PyTorch', linestyle='--')
ax1.set_title('Training Loss Comparison')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Cross-Entropy Loss')
ax1.legend()

ax2.plot(history['accuracy'], label='From Scratch')
ax2.plot(pytorch_history['accuracy'], label='PyTorch', linestyle='--')
ax2.set_title('Test Accuracy Comparison')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy')
ax2.legend()

plt.show()