Building Neural Networks from Scratch with NumPy¶
This notebook demonstrates how to:
- Implement a feedforward neural network (forward and backward pass) using NumPy.
- Train the network on spam detection using the NotShrirang/email-spam-filter dataset, with TF-IDF features for the text.
- Experiment with different batch sizes and learning rates.
- Illustrate vanishing and exploding gradients in deeper networks.
No Keras or TensorFlow is used—it's all done with core Python/NumPy.
# Imports (No Keras/TensorFlow)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# For reproducibility:
np.random.seed(42)
1. Activation and Loss Functions (NumPy)¶
We'll define some basic building blocks: sigmoid, relu, tanh, and softmax, plus a categorical cross-entropy loss function.
def sigmoid(x):
# clip to avoid overflow in large exponent
return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_derivative(x):
s = sigmoid(x)
return s * (1 - s)
def relu(x):
return np.maximum(0, x)
def relu_derivative(x):
return np.where(x > 0, 1, 0)
def tanh(x):
return np.tanh(x)
def tanh_derivative(x):
return 1 - np.tanh(x)**2
def softmax(x):
# numeric stability shift
shifted = x - np.max(x, axis=1, keepdims=True)
ex = np.exp(shifted)
return ex / np.sum(ex, axis=1, keepdims=True)
def categorical_cross_entropy(y_true, y_pred):
# clip predictions to avoid log(0)
y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
return -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]
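As a quick sanity check of these helpers (an illustrative snippet, not part of the original notebook), we can verify that softmax rows sum to 1 and that the cross-entropy is low when the predictions agree with the one-hot targets.
# Sanity check (illustrative): softmax rows sum to 1, and the cross-entropy is
# low when the highest probability lands on the correct class.
logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2,  3.0]])
probs = softmax(logits)
print(probs.sum(axis=1))                          # -> [1. 1.]
targets = np.array([[1, 0, 0],
                    [0, 0, 1]])
print(categorical_cross_entropy(targets, probs))  # fairly low: correct classes get high probability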
2. Base Neural Network in NumPy¶
Implements the forward and backward passes with He initialization, and trains via a simple loop. Mini-batches are handled inside the .train(...) method, which performs plain gradient descent: each step updates W := W - lr * dW and b := b - lr * db for every layer.
class NeuralNetwork:
def __init__(self, layer_sizes, activations):
"""
layer_sizes: list (e.g. [input_dim, hidden_dim, ..., output_dim])
activations: list of string names, e.g. ['relu','softmax'] (one per hidden/output layer)
"""
self.layer_sizes = layer_sizes
self.num_layers = len(layer_sizes)
# map from string to actual function
self.activation_map = {
'sigmoid': (sigmoid, sigmoid_derivative),
'relu': (relu, relu_derivative),
'softmax': (softmax, None),
'tanh': (tanh, tanh_derivative)
}
self.activations = []
self.activation_derivs = []
for act in activations:
f, d = self.activation_map[act]
self.activations.append(f)
self.activation_derivs.append(d)
self.initialize_parameters()
# placeholders for forward pass data
self.Z = [None]*(self.num_layers-1)
self.A = [None]*self.num_layers
def initialize_parameters(self):
# He initialization
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
scale = np.sqrt(2.0 / self.layer_sizes[i-1])
W = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) * scale
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
def forward_propagation(self, X):
self.A[0] = X
for i in range(self.num_layers-1):
self.Z[i] = np.dot(self.A[i], self.weights[i]) + self.biases[i]
self.A[i+1] = self.activations[i](self.Z[i])
return self.A[-1]
def compute_loss(self, y_true, y_pred):
return categorical_cross_entropy(y_true, y_pred)
def backward_propagation(self, y_true):
m = y_true.shape[0]
dW = [None]*(self.num_layers-1)
db = [None]*(self.num_layers-1)
# handle last layer
if self.activations[-1] == softmax:
dZ = self.A[-1] - y_true # (y_pred - y_true)
else:
# e.g. sigmoid + cross-entropy
dA = -(y_true / self.A[-1] - (1 - y_true)/(1 - self.A[-1]))
dZ = dA * self.activation_derivs[-1](self.Z[-1])
dW[-1] = np.dot(self.A[-2].T, dZ)/m
db[-1] = np.sum(dZ, axis=0, keepdims=True)/m
for l in range(self.num_layers-3, -1, -1):
dA = np.dot(dZ, self.weights[l+1].T)
dZ = dA * self.activation_derivs[l](self.Z[l])
dW[l] = np.dot(self.A[l].T, dZ)/m
db[l] = np.sum(dZ, axis=0, keepdims=True)/m
return {'dW': dW, 'db': db}
def update_parameters(self, grads, lr):
dW = grads['dW']
db = grads['db']
for i in range(self.num_layers-1):
self.weights[i] -= lr * dW[i]
self.biases[i] -= lr * db[i]
def train_step(self, X_batch, y_batch, lr):
y_pred = self.forward_propagation(X_batch)
loss = self.compute_loss(y_batch, y_pred)
grads = self.backward_propagation(y_batch)
self.update_parameters(grads, lr)
return loss
def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, learning_rate=0.01,
epochs=10, verbose=True, record_every=1):
m = X_train.shape[0]
history = {'train_loss':[], 'val_loss':[], 'train_acc':[], 'val_acc':[]}
for epoch in range(epochs):
perm = np.random.permutation(m)
X_shuf = X_train[perm]
y_shuf = y_train[perm]
epoch_loss = 0
num_batches = int(np.ceil(m/batch_size))
for i in range(num_batches):
start = i*batch_size
end = min((i+1)*batch_size, m)
X_batch = X_shuf[start:end]
y_batch = y_shuf[start:end]
b_loss = self.train_step(X_batch, y_batch, learning_rate)
epoch_loss += b_loss * (end - start)
epoch_loss /= m
# track train/val if needed
if epoch % record_every == 0 or epoch == epochs-1:
y_pred_train = self.predict(X_train)
train_acc = np.mean(np.argmax(y_pred_train, axis=1) == np.argmax(y_train, axis=1))
history['train_loss'].append(epoch_loss)
history['train_acc'].append(train_acc)
if X_val is not None and y_val is not None:
y_pred_val = self.predict(X_val)
val_loss = self.compute_loss(y_val, y_pred_val)
val_acc = np.mean(np.argmax(y_pred_val, axis=1) == np.argmax(y_val, axis=1))
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
if verbose:
print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - acc: {train_acc:.4f} - val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")
else:
if verbose:
print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - acc: {train_acc:.4f}")
return history
def predict(self, X):
return self.forward_propagation(X)
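Before moving on, it can be reassuring to verify the backward pass numerically. Below is a minimal finite-difference gradient check; it is illustrative only, and numerical_grad_check, X_chk, y_chk, and check_net are names introduced here rather than part of the original notebook.
# Finite-difference gradient check (illustrative): perturb a few random weights
# and compare the numeric derivative of the loss with the analytic gradient.
def numerical_grad_check(nn, X, y, eps=1e-5, samples=5):
    nn.forward_propagation(X)
    grads = nn.backward_propagation(y)
    rng = np.random.default_rng(0)
    for _ in range(samples):
        l = rng.integers(len(nn.weights))            # random layer
        i = rng.integers(nn.weights[l].shape[0])     # random row
        j = rng.integers(nn.weights[l].shape[1])     # random column
        orig = nn.weights[l][i, j]
        nn.weights[l][i, j] = orig + eps
        loss_plus = nn.compute_loss(y, nn.forward_propagation(X))
        nn.weights[l][i, j] = orig - eps
        loss_minus = nn.compute_loss(y, nn.forward_propagation(X))
        nn.weights[l][i, j] = orig                   # restore the weight
        numeric = (loss_plus - loss_minus) / (2 * eps)
        analytic = grads['dW'][l][i, j]
        print(f"layer {l} [{i},{j}] analytic={analytic:.6f} numeric={numeric:.6f}")
# tiny random problem: 8 samples, 3 features, 2 classes
X_chk = np.random.randn(8, 3)
y_chk = np.eye(2)[np.random.randint(0, 2, size=8)]
check_net = NeuralNetwork(layer_sizes=[3, 4, 2], activations=['relu', 'softmax'])
numerical_grad_check(check_net, X_chk, y_chk)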
Quick XOR Check¶
We confirm that the above network can learn XOR with a small architecture.
# XOR dataset: 2 inputs -> 2 classes, one-hot.
X_xor = np.array([[0,0],[0,1],[1,0],[1,1]])
y_xor = np.array([[1,0],[0,1],[0,1],[1,0]])
# define net
xor_net = NeuralNetwork(
layer_sizes=[2,4,2],
activations=['sigmoid','sigmoid']
)
hist_xor = xor_net.train(
X_xor,
y_xor,
batch_size=4, # full batch
learning_rate=0.5,
epochs=1000,
verbose=False,
record_every=100
)
# Evaluate
y_out = xor_net.predict(X_xor)
pred_cls = np.argmax(y_out, axis=1)
true_cls = np.argmax(y_xor, axis=1)
acc = np.mean(pred_cls==true_cls)
print("XOR Accuracy:", acc)
plt.plot(hist_xor['train_loss'], label='Train Loss')
plt.title('XOR Training Loss')
plt.xlabel('Checkpoint (every 100 epochs)')
plt.ylabel('Loss')
plt.legend()
plt.show()
XOR Accuracy: 1.0
3. Spam Detection (NumPy)¶
We will:
- Load NotShrirang/email-spam-filter
- Split into train/dev/test
- TF-IDF vectorize the text
- Convert labels to one-hot for a 2-output softmax
- Train a [2000 -> 64 -> 2] network for 50 epochs.
spam_data = load_dataset("NotShrirang/email-spam-filter")
df = spam_data['train'].to_pandas()
df.head()
| | Unnamed: 0 | label | text | label_num |
| --- | --- | --- | --- | --- |
| 0 | 605 | ham | Subject: enron methanol ; meter # : 988291\nth... | 0 |
| 1 | 2349 | ham | Subject: hpl nom for january 9 , 2001\n( see a... | 0 |
| 2 | 3624 | ham | Subject: neon retreat\nho ho ho , we ' re arou... | 0 |
| 3 | 4685 | spam | Subject: photoshop , windows , office . cheap ... | 1 |
| 4 | 2030 | ham | Subject: re : indian springs\nthis deal is to ... | 0 |
3.1 Split into Train/Dev/Test, TF-IDF, and One-Hot¶
train_df, test_df = train_test_split(
df, test_size=0.2, stratify=df['label_num'], random_state=42)
train_df, dev_df = train_test_split(
train_df, test_size=0.2, stratify=train_df['label_num'], random_state=42)
print(len(train_df), len(dev_df), len(test_df))
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X_train_tfidf = vectorizer.fit_transform(train_df['text']).toarray()
y_train_nums = train_df['label_num'].values
X_dev_tfidf = vectorizer.transform(dev_df['text']).toarray()
y_dev_nums = dev_df['label_num'].values
X_test_tfidf = vectorizer.transform(test_df['text']).toarray()
y_test_nums = test_df['label_num'].values
def to_one_hot(lbls, num_classes=2):
return np.eye(num_classes)[lbls]
y_train_oh = to_one_hot(y_train_nums, 2)
y_dev_oh = to_one_hot(y_dev_nums, 2)
y_test_oh = to_one_hot(y_test_nums, 2)
3308 828 1035
3.2 Build a [2000 -> 64 -> 2] Net with ReLU + Softmax¶
Train for 50 epochs and check the test accuracy.
spam_net_np = NeuralNetwork(
layer_sizes=[2000,64,2],
activations=['relu','softmax']
)
hist_spam_np = spam_net_np.train(
X_train_tfidf, y_train_oh,
X_dev_tfidf, y_dev_oh,
batch_size=64,
learning_rate=0.2,
epochs=50,
verbose=True,
record_every=1
)
y_test_pred_proba_np = spam_net_np.predict(X_test_tfidf)
y_test_pred_class_np = np.argmax(y_test_pred_proba_np, axis=1)
test_acc_np = np.mean(y_test_pred_class_np == y_test_nums)
print(f"\nNumPy Network Test Accuracy: {test_acc_np:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_nums, y_test_pred_class_np))
cm = confusion_matrix(y_test_nums, y_test_pred_class_np)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title('Spam Detection (NumPy NN) - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Epoch 1/50 - loss: 0.5815 - acc: 0.7101 - val_loss: 0.5329 - val_acc: 0.7101
Epoch 2/50 - loss: 0.4722 - acc: 0.7443 - val_loss: 0.4096 - val_acc: 0.7403
Epoch 3/50 - loss: 0.3412 - acc: 0.9531 - val_loss: 0.2936 - val_acc: 0.9300
Epoch 4/50 - loss: 0.2433 - acc: 0.9628 - val_loss: 0.2230 - val_acc: 0.9372
Epoch 5/50 - loss: 0.1849 - acc: 0.9680 - val_loss: 0.1820 - val_acc: 0.9589
Epoch 6/50 - loss: 0.1487 - acc: 0.9743 - val_loss: 0.1527 - val_acc: 0.9614
Epoch 7/50 - loss: 0.1242 - acc: 0.9831 - val_loss: 0.1363 - val_acc: 0.9626
Epoch 8/50 - loss: 0.1056 - acc: 0.9840 - val_loss: 0.1183 - val_acc: 0.9686
Epoch 9/50 - loss: 0.0925 - acc: 0.9876 - val_loss: 0.1064 - val_acc: 0.9698
Epoch 10/50 - loss: 0.0814 - acc: 0.9897 - val_loss: 0.0991 - val_acc: 0.9722
Epoch 11/50 - loss: 0.0720 - acc: 0.9909 - val_loss: 0.0947 - val_acc: 0.9734
Epoch 12/50 - loss: 0.0658 - acc: 0.9915 - val_loss: 0.0857 - val_acc: 0.9771
Epoch 13/50 - loss: 0.0595 - acc: 0.9921 - val_loss: 0.0867 - val_acc: 0.9722
Epoch 14/50 - loss: 0.0544 - acc: 0.9933 - val_loss: 0.0776 - val_acc: 0.9783
Epoch 15/50 - loss: 0.0510 - acc: 0.9946 - val_loss: 0.0781 - val_acc: 0.9734
Epoch 16/50 - loss: 0.0476 - acc: 0.9955 - val_loss: 0.0736 - val_acc: 0.9758
Epoch 17/50 - loss: 0.0443 - acc: 0.9961 - val_loss: 0.0701 - val_acc: 0.9771
Epoch 18/50 - loss: 0.0412 - acc: 0.9967 - val_loss: 0.0681 - val_acc: 0.9771
Epoch 19/50 - loss: 0.0384 - acc: 0.9958 - val_loss: 0.0695 - val_acc: 0.9771
Epoch 20/50 - loss: 0.0364 - acc: 0.9964 - val_loss: 0.0657 - val_acc: 0.9795
Epoch 21/50 - loss: 0.0340 - acc: 0.9967 - val_loss: 0.0655 - val_acc: 0.9783
Epoch 22/50 - loss: 0.0321 - acc: 0.9961 - val_loss: 0.0688 - val_acc: 0.9771
Epoch 23/50 - loss: 0.0310 - acc: 0.9973 - val_loss: 0.0626 - val_acc: 0.9771
Epoch 24/50 - loss: 0.0290 - acc: 0.9976 - val_loss: 0.0644 - val_acc: 0.9795
Epoch 25/50 - loss: 0.0275 - acc: 0.9973 - val_loss: 0.0616 - val_acc: 0.9771
Epoch 26/50 - loss: 0.0264 - acc: 0.9976 - val_loss: 0.0609 - val_acc: 0.9783
Epoch 27/50 - loss: 0.0250 - acc: 0.9973 - val_loss: 0.0605 - val_acc: 0.9807
Epoch 28/50 - loss: 0.0238 - acc: 0.9979 - val_loss: 0.0614 - val_acc: 0.9795
Epoch 29/50 - loss: 0.0229 - acc: 0.9976 - val_loss: 0.0597 - val_acc: 0.9795
Epoch 30/50 - loss: 0.0220 - acc: 0.9979 - val_loss: 0.0599 - val_acc: 0.9807
Epoch 31/50 - loss: 0.0209 - acc: 0.9988 - val_loss: 0.0622 - val_acc: 0.9783
Epoch 32/50 - loss: 0.0200 - acc: 0.9985 - val_loss: 0.0598 - val_acc: 0.9807
Epoch 33/50 - loss: 0.0194 - acc: 0.9985 - val_loss: 0.0593 - val_acc: 0.9807
Epoch 34/50 - loss: 0.0187 - acc: 0.9991 - val_loss: 0.0603 - val_acc: 0.9807
Epoch 35/50 - loss: 0.0177 - acc: 0.9985 - val_loss: 0.0588 - val_acc: 0.9783
Epoch 36/50 - loss: 0.0171 - acc: 0.9985 - val_loss: 0.0586 - val_acc: 0.9795
Epoch 37/50 - loss: 0.0167 - acc: 0.9991 - val_loss: 0.0592 - val_acc: 0.9807
Epoch 38/50 - loss: 0.0160 - acc: 0.9991 - val_loss: 0.0604 - val_acc: 0.9807
Epoch 39/50 - loss: 0.0154 - acc: 0.9994 - val_loss: 0.0633 - val_acc: 0.9795
Epoch 40/50 - loss: 0.0150 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 41/50 - loss: 0.0146 - acc: 0.9991 - val_loss: 0.0586 - val_acc: 0.9795
Epoch 42/50 - loss: 0.0141 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 43/50 - loss: 0.0136 - acc: 0.9991 - val_loss: 0.0588 - val_acc: 0.9783
Epoch 44/50 - loss: 0.0132 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 45/50 - loss: 0.0129 - acc: 0.9994 - val_loss: 0.0590 - val_acc: 0.9807
Epoch 46/50 - loss: 0.0126 - acc: 0.9994 - val_loss: 0.0604 - val_acc: 0.9819
Epoch 47/50 - loss: 0.0121 - acc: 0.9994 - val_loss: 0.0619 - val_acc: 0.9807
Epoch 48/50 - loss: 0.0118 - acc: 0.9994 - val_loss: 0.0635 - val_acc: 0.9807
Epoch 49/50 - loss: 0.0116 - acc: 0.9994 - val_loss: 0.0591 - val_acc: 0.9807
Epoch 50/50 - loss: 0.0114 - acc: 0.9994 - val_loss: 0.0600 - val_acc: 0.9819

NumPy Network Test Accuracy: 0.9855

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       735
           1       0.97      0.98      0.98       300

    accuracy                           0.99      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.99      0.99      0.99      1035
Quite impressive: 98.5% test accuracy! That is far above both a 50% random-guess baseline and the ~71% majority-class baseline (see the quick check below), and well ahead of the baseline model we built in the previous session.
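For reference, here is a quick check of the majority-class baseline on the test set (an illustrative snippet, not part of the original notebook): always predicting "ham" (label 0) already scores about 71%, which is what the new network has to beat.
# Majority-class baseline: predict "ham" (label 0) for every test email
print("Majority-class baseline accuracy:", np.mean(y_test_nums == 0))  # ≈ 0.71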
3.3 Plot the Training/Dev Loss¶
plt.figure(figsize=(8,4))
plt.plot(hist_spam_np['train_loss'], label='Train Loss')
if len(hist_spam_np['val_loss'])>0:
plt.plot(hist_spam_np['val_loss'], label='Dev Loss')
plt.title('Spam (NumPy) Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Nevertheless, the dev loss starts to increase after roughly 20 epochs, which suggests we are overfitting the training data past that point. To avoid this, we could use early stopping or regularization; a sketch of early stopping follows.
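As a sketch of what early stopping could look like with this class (illustrative only; es_net, patience, and wait are names introduced here, not part of the original notebook), we can train one epoch at a time and stop once the dev loss has not improved for a few epochs:
# Early-stopping sketch (illustrative): train epoch by epoch and stop when the
# dev loss has not improved for `patience` consecutive epochs.
es_net = NeuralNetwork(layer_sizes=[2000, 64, 2], activations=['relu', 'softmax'])
best_val, patience, wait = np.inf, 5, 0
best_weights, best_biases = None, None
for epoch in range(50):
    es_net.train(X_train_tfidf, y_train_oh, batch_size=64,
                 learning_rate=0.2, epochs=1, verbose=False)
    val_loss = es_net.compute_loss(y_dev_oh, es_net.predict(X_dev_tfidf))
    if val_loss < best_val:
        best_val, wait = val_loss, 0
        best_weights = [w.copy() for w in es_net.weights]   # keep best parameters
        best_biases = [b.copy() for b in es_net.biases]
    else:
        wait += 1
        if wait >= patience:
            print(f"Stopping early at epoch {epoch+1} (best dev loss {best_val:.4f})")
            break
if best_weights is not None:
    es_net.weights, es_net.biases = best_weights, best_biases  # roll back to the best epoch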
4. Different Batch Sizes and Learning Rates on Spam Data (NumPy)¶
We'll run a small grid experiment, training a [2000 -> 32 -> 2] network for 50 epochs per configuration, just to illustrate how the training loss changes across (batch_size, learning_rate) combos.
batch_sizes = [32, 128, 512]
lrs = [0.001, 0.01, 0.1]
results = {}
for bs in batch_sizes:
for lr in lrs:
key = f"BS={bs}-LR={lr}"
net_temp = NeuralNetwork(
layer_sizes=[2000,32,2],
activations=['relu','softmax']
)
hist_temp = net_temp.train(
X_train_tfidf, y_train_oh,
X_dev_tfidf, y_dev_oh,
batch_size=bs,
learning_rate=lr,
epochs=50,
verbose=False,
record_every=1
)
results[key] = hist_temp
# Plot the training loss curves
plt.figure(figsize=(15,8))
for key, hist_ in results.items():
plt.plot(hist_['train_loss'], label=key)
plt.title('Spam (NumPy) with Different Batch Sizes & LRs')
plt.xlabel('Epoch')
plt.ylabel('Train Loss')
plt.legend()
plt.show()
The loss curves differ clearly across batch sizes and learning rates. With very low learning rates the curves stay almost flat and the loss remains high. At a given learning rate, smaller batch sizes converge faster per epoch, since each epoch then performs more parameter updates (see the quick count below).
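To make the "more updates per epoch" point concrete, here is a quick count (illustrative, not part of the original notebook):
# Parameter updates per epoch for each batch size (3308 training samples)
m = X_train_tfidf.shape[0]
for bs in batch_sizes:
    print(f"batch_size={bs}: {int(np.ceil(m / bs))} updates per epoch")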
5. Vanishing and Exploding Gradients¶
We'll build deeper networks in NumPy, controlling:
- Small initialization or sigmoid activations for vanishing gradients
- Large initialization or a large learning rate for exploding gradients

We'll track the gradient norm at each batch update to see the pattern.
def gradient_norm(nn, X_batch, y_batch):
_ = nn.forward_propagation(X_batch)
grads = nn.backward_propagation(y_batch)
total_sq = 0
for dw, db in zip(grads['dW'], grads['db']):
total_sq += np.sum(dw**2) + np.sum(db**2)
return np.sqrt(total_sq)
def train_with_grad_tracking(nn, X, y, epochs=20, batch_size=32, lr=0.01):
m = X.shape[0]
grad_norms = []
losses = []
for e in range(epochs):
perm = np.random.permutation(m)
X_shuf = X[perm]
y_shuf = y[perm]
epoch_loss = 0
num_batches = int(np.ceil(m/batch_size))
for i in range(num_batches):
start = i*batch_size
end = min((i+1)*batch_size, m)
X_batch = X_shuf[start:end]
y_batch = y_shuf[start:end]
# measure gradient norm before update (arbitrary choice)
gn = gradient_norm(nn, X_batch, y_batch)
grad_norms.append(gn)
b_loss = nn.train_step(X_batch, y_batch, lr)
epoch_loss += b_loss*(end-start)
epoch_loss /= m
losses.append(epoch_loss)
return grad_norms, losses
# We'll create artificial data for demonstration
# E.g. 500 samples, dimension=20, 5-output classification
N = 500
D = 20
C = 5
X_demo = np.random.randn(N, D)
y_demo_lbls = np.random.randint(0,C,size=(N,))
y_demo_oh = np.eye(C)[y_demo_lbls]
# 5.1 Vanishing Gradient Example
class SmallInitNet(NeuralNetwork):
def initialize_parameters(self):
# Extremely small init => prone to vanishing
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
W = np.random.normal(0, 1e-2, (self.layer_sizes[i-1], self.layer_sizes[i]))
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
# Saturating activations ('tanh' and 'sigmoid') => more saturation => easier vanishing
vanish_net = SmallInitNet(
layer_sizes=[20, 64, 64, 5],
activations=['tanh','tanh','sigmoid','sigmoid']
)
gn_vanish, loss_vanish = train_with_grad_tracking(
vanish_net,
X_demo, # e.g. a random or real dataset of shape (N, 20)
y_demo_oh, # e.g. one-hot labels of shape (N, 5)
epochs=20,
batch_size=16,
lr=0.01
)
plt.figure(figsize=(10,4))
# Left: gradient norm curve
plt.subplot(1,2,1)
plt.plot(gn_vanish)
plt.title('Vanishing Gradient Norm (Deep Net + Very Small Init + Tanh and Sigmoid)')
plt.xlabel('Batch Updates')
plt.ylabel('Grad Norm')
# Right: training loss curve
plt.subplot(1,2,2)
plt.plot(loss_vanish)
plt.title('Vanishing - Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()
# 5.2 Exploding Gradient Example
class LargeInitNet(NeuralNetwork):
def initialize_parameters(self):
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
W = np.random.normal(0, 1e-1, (self.layer_sizes[i-1], self.layer_sizes[i]))
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
explode_net = LargeInitNet(
layer_sizes=[20,64,64,64,5],
activations=['relu','relu','relu','softmax']
)
gn_explode, loss_explode = train_with_grad_tracking(
explode_net,
X_demo, y_demo_oh,
epochs=20,
batch_size=8,
lr=3.0 # intentionally large
)
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(gn_explode)
plt.title('Exploding Gradient Norm (Large Init + LR=3.0)')
plt.xlabel('Batch updates')
plt.ylabel('Grad Norm')
plt.subplot(1,2,2)
plt.plot(loss_explode)
plt.title('Exploding - Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()
Observing the Norm Plots¶
- Vanishing: the gradient norm typically shrinks toward 0, leading to minimal changes in the weights (see the quick calculation below).
- Exploding: the gradient norm can skyrocket (sometimes NaN), and training becomes unstable.
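For a rough sense of why depth makes vanishing worse, here is a back-of-the-envelope calculation (illustrative, not part of the original notebook): backprop multiplies one local derivative per layer, and the sigmoid derivative never exceeds 0.25, so the scaling bound shrinks geometrically with depth (ignoring the weight matrices).
# sigmoid'(x) = s*(1-s) peaks at 0.25, so a stack of sigmoid layers damps the
# backpropagated signal by at most (0.25)**depth (weight scales aside)
for depth in [2, 5, 10, 20]:
    print(f"{depth} sigmoid layers: scaling bound <= {0.25**depth:.1e}")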
Conclusion¶
We've built a neural network with pure NumPy:
- It handles feedforward/backprop.
- We tested it on XOR.
- We applied it to spam detection using TF-IDF.
- We tried different batch sizes and learning rates.
- We demonstrated vanishing/exploding gradients by tracking gradient norms in a deeper net.
This clarifies many core concepts that frameworks like PyTorch or TensorFlow manage under the hood.