Building Neural Networks from Scratch with NumPy¶
This notebook demonstrates how to:
- Implement a feedforward neural network (forward and backward pass) using NumPy.
- Train the network on spam detection from the NotShrirang/email-spam-filter dataset, using TF-IDF for text.
- Experiment with different batch sizes and learning rates.
- Illustrate vanishing and exploding gradients in deeper networks.
No Keras or TensorFlow is used—it's all done with core Python/NumPy.
# Imports (No Keras/TensorFlow)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# For reproducibility:
np.random.seed(42)
1. Activation and Loss Functions (NumPy)¶
We'll define some basic building blocks: sigmoid, relu, softmax, etc., plus a cross-entropy loss function.
def sigmoid(x):
# clip to avoid overflow in large exponent
return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_derivative(x):
s = sigmoid(x)
return s * (1 - s)
def relu(x):
return np.maximum(0, x)
def relu_derivative(x):
return np.where(x > 0, 1, 0)
def tanh(x):
return np.tanh(x)
def tanh_derivative(x):
return 1 - np.tanh(x)**2
def softmax(x):
# numeric stability shift
shifted = x - np.max(x, axis=1, keepdims=True)
ex = np.exp(shifted)
return ex / np.sum(ex, axis=1, keepdims=True)
def categorical_cross_entropy(y_true, y_pred):
# clip predictions to avoid log(0)
y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
return -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]
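As a quick sanity check (not part of the pipeline below, and assuming the cell above has been run), we can verify that softmax rows sum to 1 and that the cross-entropy of a near-perfect prediction is close to 0:
# Sanity-check the building blocks defined above.
logits = np.array([[2.0, 1.0, 0.1],
                   [0.5, 0.5, 0.5]])
print(softmax(logits).sum(axis=1))   # each row should sum to 1.0
y_true_check = np.array([[1, 0, 0]])
print(categorical_cross_entropy(y_true_check, np.array([[0.99, 0.005, 0.005]])))  # ~0.01, good prediction
print(categorical_cross_entropy(y_true_check, np.array([[0.01, 0.49, 0.50]])))    # ~4.6, bad prediction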
2. Base Neural Network in NumPy¶
The class implements the forward and backward passes, He initialization, and a simple training loop: mini-batch gradient descent inside a .train(...) method. The equations below summarize what the code computes.
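For reference (notation mine), with $A^{[0]} = X$, $m$ samples per batch, and activation $g^{[l]}$ at layer $l$, the forward pass is

$$Z^{[l]} = A^{[l-1]} W^{[l]} + b^{[l]}, \qquad A^{[l]} = g^{[l]}\big(Z^{[l]}\big)$$

and, for a softmax output with cross-entropy loss, the backward pass computes

$$dZ^{[L]} = A^{[L]} - Y, \qquad dW^{[l]} = \tfrac{1}{m}\,(A^{[l-1]})^\top dZ^{[l]}, \qquad db^{[l]} = \tfrac{1}{m}\sum_i dZ^{[l]}_{i,:},$$

$$dZ^{[l-1]} = \big(dZ^{[l]} (W^{[l]})^\top\big) \odot g'^{[l-1]}\big(Z^{[l-1]}\big).$$

He initialization draws $W^{[l]} \sim \mathcal{N}\!\big(0,\; 2/n_{l-1}\big)$, where $n_{l-1}$ is the layer's fan-in.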
class NeuralNetwork:
def __init__(self, layer_sizes, activations):
"""
layer_sizes: list (e.g. [input_dim, hidden_dim, ..., output_dim])
activations: list of string names, e.g. ['relu','softmax'] (one per hidden/output layer)
"""
self.layer_sizes = layer_sizes
self.num_layers = len(layer_sizes)
# map from string to actual function
self.activation_map = {
'sigmoid': (sigmoid, sigmoid_derivative),
'relu': (relu, relu_derivative),
'softmax': (softmax, None),
'tanh': (tanh, tanh_derivative)
}
self.activations = []
self.activation_derivs = []
for act in activations:
f, d = self.activation_map[act]
self.activations.append(f)
self.activation_derivs.append(d)
self.initialize_parameters()
# placeholders for forward pass data
self.Z = [None]*(self.num_layers-1)
self.A = [None]*self.num_layers
def initialize_parameters(self):
# He initialization
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
scale = np.sqrt(2.0 / self.layer_sizes[i-1])
W = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) * scale
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
def forward_propagation(self, X):
self.A[0] = X
for i in range(self.num_layers-1):
self.Z[i] = np.dot(self.A[i], self.weights[i]) + self.biases[i]
self.A[i+1] = self.activations[i](self.Z[i])
return self.A[-1]
def compute_loss(self, y_true, y_pred):
return categorical_cross_entropy(y_true, y_pred)
def backward_propagation(self, y_true):
m = y_true.shape[0]
dW = [None]*(self.num_layers-1)
db = [None]*(self.num_layers-1)
# handle last layer
if self.activations[-1] == softmax:
dZ = self.A[-1] - y_true # (y_pred - y_true)
else:
# e.g. sigmoid + cross-entropy
dA = -(y_true / self.A[-1] - (1 - y_true)/(1 - self.A[-1]))
dZ = dA * self.activation_derivs[-1](self.Z[-1])
dW[-1] = np.dot(self.A[-2].T, dZ)/m
db[-1] = np.sum(dZ, axis=0, keepdims=True)/m
for l in range(self.num_layers-3, -1, -1):
dA = np.dot(dZ, self.weights[l+1].T)
dZ = dA * self.activation_derivs[l](self.Z[l])
dW[l] = np.dot(self.A[l].T, dZ)/m
db[l] = np.sum(dZ, axis=0, keepdims=True)/m
return {'dW': dW, 'db': db}
def update_parameters(self, grads, lr):
dW = grads['dW']
db = grads['db']
for i in range(self.num_layers-1):
self.weights[i] -= lr * dW[i]
self.biases[i] -= lr * db[i]
def train_step(self, X_batch, y_batch, lr):
y_pred = self.forward_propagation(X_batch)
loss = self.compute_loss(y_batch, y_pred)
grads = self.backward_propagation(y_batch)
self.update_parameters(grads, lr)
return loss
def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, learning_rate=0.01,
epochs=10, verbose=True, record_every=1):
m = X_train.shape[0]
history = {'train_loss':[], 'val_loss':[], 'train_acc':[], 'val_acc':[]}
for epoch in range(epochs):
perm = np.random.permutation(m)
X_shuf = X_train[perm]
y_shuf = y_train[perm]
epoch_loss = 0
num_batches = int(np.ceil(m/batch_size))
for i in range(num_batches):
start = i*batch_size
end = min((i+1)*batch_size, m)
X_batch = X_shuf[start:end]
y_batch = y_shuf[start:end]
b_loss = self.train_step(X_batch, y_batch, learning_rate)
epoch_loss += b_loss * (end - start)
epoch_loss /= m
# track train/val if needed
if epoch % record_every == 0 or epoch == epochs-1:
y_pred_train = self.predict(X_train)
train_acc = np.mean(np.argmax(y_pred_train, axis=1) == np.argmax(y_train, axis=1))
history['train_loss'].append(epoch_loss)
history['train_acc'].append(train_acc)
if X_val is not None and y_val is not None:
y_pred_val = self.predict(X_val)
val_loss = self.compute_loss(y_val, y_pred_val)
val_acc = np.mean(np.argmax(y_pred_val, axis=1) == np.argmax(y_val, axis=1))
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
if verbose:
print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - acc: {train_acc:.4f} - val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")
else:
if verbose:
print(f"Epoch {epoch+1}/{epochs} - loss: {epoch_loss:.4f} - acc: {train_acc:.4f}")
return history
def predict(self, X):
return self.forward_propagation(X)
Quick XOR Check¶
We confirm that the above network can learn XOR with a small architecture.
# XOR dataset: 2 inputs -> 2 classes, one-hot.
X_xor = np.array([[0,0],[0,1],[1,0],[1,1]])
y_xor = np.array([[1,0],[0,1],[0,1],[1,0]])
# define net
xor_net = NeuralNetwork(
layer_sizes=[2,4,2],
activations=['sigmoid','sigmoid']
)
hist_xor = xor_net.train(
X_xor,
y_xor,
batch_size=4, # full batch
learning_rate=0.5,
epochs=1000,
verbose=False,
record_every=100
)
# Evaluate
y_out = xor_net.predict(X_xor)
pred_cls = np.argmax(y_out, axis=1)
true_cls = np.argmax(y_xor, axis=1)
acc = np.mean(pred_cls==true_cls)
print("XOR Accuracy:", acc)
plt.plot(hist_xor['train_loss'], label='Train Loss')
plt.title('XOR Training Loss')
plt.xlabel('Checkpoints')
plt.ylabel('Loss')
plt.legend()
plt.show()
XOR Accuracy: 1.0
3. Spam Detection (NumPy)¶
We will:
- Load NotShrirang/email-spam-filter
- Split train/dev/test
- TF-IDF vectorize
- Convert labels to one-hot for a 2-output softmax
- Train a [2000 -> 64 -> 2] network for 50 epochs.
spam_data = load_dataset("NotShrirang/email-spam-filter")
df = spam_data['train'].to_pandas()
df.head()
|   | Unnamed: 0 | label | text | label_num |
|---|---|---|---|---|
| 0 | 605 | ham | Subject: enron methanol ; meter # : 988291\nth... | 0 |
| 1 | 2349 | ham | Subject: hpl nom for january 9 , 2001\n( see a... | 0 |
| 2 | 3624 | ham | Subject: neon retreat\nho ho ho , we ' re arou... | 0 |
| 3 | 4685 | spam | Subject: photoshop , windows , office . cheap ... | 1 |
| 4 | 2030 | ham | Subject: re : indian springs\nthis deal is to ... | 0 |
3.1 Split into Train/Dev/Test, TF-IDF, and One-Hot¶
train_df, test_df = train_test_split(
df, test_size=0.2, stratify=df['label_num'], random_state=42)
train_df, dev_df = train_test_split(
train_df, test_size=0.2, stratify=train_df['label_num'], random_state=42)
print(len(train_df), len(dev_df), len(test_df))
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X_train_tfidf = vectorizer.fit_transform(train_df['text']).toarray()
y_train_nums = train_df['label_num'].values
X_dev_tfidf = vectorizer.transform(dev_df['text']).toarray()
y_dev_nums = dev_df['label_num'].values
X_test_tfidf = vectorizer.transform(test_df['text']).toarray()
y_test_nums = test_df['label_num'].values
def to_one_hot(lbls, num_classes=2):
return np.eye(num_classes)[lbls]
y_train_oh = to_one_hot(y_train_nums, 2)
y_dev_oh = to_one_hot(y_dev_nums, 2)
y_test_oh = to_one_hot(y_test_nums, 2)
3308 828 1035
3.2 Build a [2000->64->2] Net with ReLU + Softmax¶
Train for 50 epochs and check the test accuracy.
spam_net_np = NeuralNetwork(
layer_sizes=[2000,64,2],
activations=['relu','softmax']
)
hist_spam_np = spam_net_np.train(
X_train_tfidf, y_train_oh,
X_dev_tfidf, y_dev_oh,
batch_size=64,
learning_rate=0.2,
epochs=50,
verbose=True,
record_every=1
)
y_test_pred_proba_np = spam_net_np.predict(X_test_tfidf)
y_test_pred_class_np = np.argmax(y_test_pred_proba_np, axis=1)
test_acc_np = np.mean(y_test_pred_class_np == y_test_nums)
print(f"\nNumPy Network Test Accuracy: {test_acc_np:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_nums, y_test_pred_class_np))
cm = confusion_matrix(y_test_nums, y_test_pred_class_np)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d')
plt.title('Spam Detection (NumPy NN) - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Epoch 1/50 - loss: 0.5815 - acc: 0.7101 - val_loss: 0.5329 - val_acc: 0.7101
Epoch 2/50 - loss: 0.4722 - acc: 0.7443 - val_loss: 0.4096 - val_acc: 0.7403
Epoch 3/50 - loss: 0.3412 - acc: 0.9531 - val_loss: 0.2936 - val_acc: 0.9300
Epoch 4/50 - loss: 0.2433 - acc: 0.9628 - val_loss: 0.2230 - val_acc: 0.9372
Epoch 5/50 - loss: 0.1849 - acc: 0.9680 - val_loss: 0.1820 - val_acc: 0.9589
Epoch 6/50 - loss: 0.1487 - acc: 0.9743 - val_loss: 0.1527 - val_acc: 0.9614
Epoch 7/50 - loss: 0.1242 - acc: 0.9831 - val_loss: 0.1363 - val_acc: 0.9626
Epoch 8/50 - loss: 0.1056 - acc: 0.9840 - val_loss: 0.1183 - val_acc: 0.9686
Epoch 9/50 - loss: 0.0925 - acc: 0.9876 - val_loss: 0.1064 - val_acc: 0.9698
Epoch 10/50 - loss: 0.0814 - acc: 0.9897 - val_loss: 0.0991 - val_acc: 0.9722
Epoch 11/50 - loss: 0.0720 - acc: 0.9909 - val_loss: 0.0947 - val_acc: 0.9734
Epoch 12/50 - loss: 0.0658 - acc: 0.9915 - val_loss: 0.0857 - val_acc: 0.9771
Epoch 13/50 - loss: 0.0595 - acc: 0.9921 - val_loss: 0.0867 - val_acc: 0.9722
Epoch 14/50 - loss: 0.0544 - acc: 0.9933 - val_loss: 0.0776 - val_acc: 0.9783
Epoch 15/50 - loss: 0.0510 - acc: 0.9946 - val_loss: 0.0781 - val_acc: 0.9734
Epoch 16/50 - loss: 0.0476 - acc: 0.9955 - val_loss: 0.0736 - val_acc: 0.9758
Epoch 17/50 - loss: 0.0443 - acc: 0.9961 - val_loss: 0.0701 - val_acc: 0.9771
Epoch 18/50 - loss: 0.0412 - acc: 0.9967 - val_loss: 0.0681 - val_acc: 0.9771
Epoch 19/50 - loss: 0.0384 - acc: 0.9958 - val_loss: 0.0695 - val_acc: 0.9771
Epoch 20/50 - loss: 0.0364 - acc: 0.9964 - val_loss: 0.0657 - val_acc: 0.9795
Epoch 21/50 - loss: 0.0340 - acc: 0.9967 - val_loss: 0.0655 - val_acc: 0.9783
Epoch 22/50 - loss: 0.0321 - acc: 0.9961 - val_loss: 0.0688 - val_acc: 0.9771
Epoch 23/50 - loss: 0.0310 - acc: 0.9973 - val_loss: 0.0626 - val_acc: 0.9771
Epoch 24/50 - loss: 0.0290 - acc: 0.9976 - val_loss: 0.0644 - val_acc: 0.9795
Epoch 25/50 - loss: 0.0275 - acc: 0.9973 - val_loss: 0.0616 - val_acc: 0.9771
Epoch 26/50 - loss: 0.0264 - acc: 0.9976 - val_loss: 0.0609 - val_acc: 0.9783
Epoch 27/50 - loss: 0.0250 - acc: 0.9973 - val_loss: 0.0605 - val_acc: 0.9807
Epoch 28/50 - loss: 0.0238 - acc: 0.9979 - val_loss: 0.0614 - val_acc: 0.9795
Epoch 29/50 - loss: 0.0229 - acc: 0.9976 - val_loss: 0.0597 - val_acc: 0.9795
Epoch 30/50 - loss: 0.0220 - acc: 0.9979 - val_loss: 0.0599 - val_acc: 0.9807
Epoch 31/50 - loss: 0.0209 - acc: 0.9988 - val_loss: 0.0622 - val_acc: 0.9783
Epoch 32/50 - loss: 0.0200 - acc: 0.9985 - val_loss: 0.0598 - val_acc: 0.9807
Epoch 33/50 - loss: 0.0194 - acc: 0.9985 - val_loss: 0.0593 - val_acc: 0.9807
Epoch 34/50 - loss: 0.0187 - acc: 0.9991 - val_loss: 0.0603 - val_acc: 0.9807
Epoch 35/50 - loss: 0.0177 - acc: 0.9985 - val_loss: 0.0588 - val_acc: 0.9783
Epoch 36/50 - loss: 0.0171 - acc: 0.9985 - val_loss: 0.0586 - val_acc: 0.9795
Epoch 37/50 - loss: 0.0167 - acc: 0.9991 - val_loss: 0.0592 - val_acc: 0.9807
Epoch 38/50 - loss: 0.0160 - acc: 0.9991 - val_loss: 0.0604 - val_acc: 0.9807
Epoch 39/50 - loss: 0.0154 - acc: 0.9994 - val_loss: 0.0633 - val_acc: 0.9795
Epoch 40/50 - loss: 0.0150 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 41/50 - loss: 0.0146 - acc: 0.9991 - val_loss: 0.0586 - val_acc: 0.9795
Epoch 42/50 - loss: 0.0141 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 43/50 - loss: 0.0136 - acc: 0.9991 - val_loss: 0.0588 - val_acc: 0.9783
Epoch 44/50 - loss: 0.0132 - acc: 0.9991 - val_loss: 0.0587 - val_acc: 0.9795
Epoch 45/50 - loss: 0.0129 - acc: 0.9994 - val_loss: 0.0590 - val_acc: 0.9807
Epoch 46/50 - loss: 0.0126 - acc: 0.9994 - val_loss: 0.0604 - val_acc: 0.9819
Epoch 47/50 - loss: 0.0121 - acc: 0.9994 - val_loss: 0.0619 - val_acc: 0.9807
Epoch 48/50 - loss: 0.0118 - acc: 0.9994 - val_loss: 0.0635 - val_acc: 0.9807
Epoch 49/50 - loss: 0.0116 - acc: 0.9994 - val_loss: 0.0591 - val_acc: 0.9807
Epoch 50/50 - loss: 0.0114 - acc: 0.9994 - val_loss: 0.0600 - val_acc: 0.9819
NumPy Network Test Accuracy: 0.9855
Classification Report:
precision recall f1-score support
0 0.99 0.99 0.99 735
1 0.97 0.98 0.98 300
accuracy 0.99 1035
macro avg 0.98 0.98 0.98 1035
weighted avg 0.99 0.99 0.99 1035
Quite impressive: 98.5% test accuracy, far above the 50% chance level and the baseline we built in the previous session.
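For context, random guessing on two classes would sit around 50%, but because the test set is imbalanced (735 ham vs. 300 spam), the stronger naive baseline is to always predict "ham", which lands around 0.71. The snippet below (reusing y_test_nums from above) computes that majority-class accuracy:
# Accuracy of always predicting the most frequent class in the test set.
majority_class = np.bincount(y_test_nums).argmax()
majority_acc = np.mean(y_test_nums == majority_class)
print(f"Majority-class baseline accuracy: {majority_acc:.4f}")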
3.3 Plot the Training/Dev Loss¶
plt.figure(figsize=(8,4))
plt.plot(hist_spam_np['train_loss'], label='Train Loss')
if len(hist_spam_np['val_loss'])>0:
plt.plot(hist_spam_np['val_loss'], label='Dev Loss')
plt.title('Spam (NumPy) Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Nevertheless, the dev loss starts to increase after roughly 20 epochs, which suggests we overfit the training data past that point. To avoid this, we could have used early stopping or regularization; a minimal early-stopping sketch follows.
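As an illustration only (not run as part of this notebook), one way to bolt early stopping onto the existing train(...) API is to train one epoch at a time and keep the weights with the best dev loss. The patience value and deepcopy checkpointing below are my own choices:
import copy

def train_with_early_stopping(net, X_tr, y_tr, X_val, y_val,
                              epochs=50, patience=5, **train_kwargs):
    """Train one epoch at a time; stop when the dev loss hasn't improved for `patience` epochs."""
    best_loss = np.inf
    best_weights = None
    wait = 0
    for epoch in range(epochs):
        net.train(X_tr, y_tr, X_val, y_val, epochs=1, verbose=False, **train_kwargs)
        val_loss = net.compute_loss(y_val, net.predict(X_val))
        if val_loss < best_loss:
            best_loss = val_loss
            best_weights = (copy.deepcopy(net.weights), copy.deepcopy(net.biases))
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                print(f"Stopping early at epoch {epoch+1} (best dev loss {best_loss:.4f})")
                break
    if best_weights is not None:
        net.weights, net.biases = best_weights  # restore the best checkpoint
    return net

# Hypothetical usage with the spam data from above:
# spam_net_es = NeuralNetwork([2000, 64, 2], ['relu', 'softmax'])
# train_with_early_stopping(spam_net_es, X_train_tfidf, y_train_oh, X_dev_tfidf, y_dev_oh,
#                           batch_size=64, learning_rate=0.2)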
4. Different Batch Sizes and Learning Rates on Spam Data (NumPy)¶
We'll define a short experiment, training for a small number of epochs, just to illustrate how the training loss changes across (batch_size, learning_rate) combos.
batch_sizes = [32, 128, 512]
lrs = [0.001, 0.01, 0.1]
results = {}
for bs in batch_sizes:
for lr in lrs:
key = f"BS={bs}-LR={lr}"
net_temp = NeuralNetwork(
layer_sizes=[2000,32,2],
activations=['relu','softmax']
)
hist_temp = net_temp.train(
X_train_tfidf, y_train_oh,
X_dev_tfidf, y_dev_oh,
batch_size=bs,
learning_rate=lr,
epochs=50,
verbose=False,
record_every=1
)
results[key] = hist_temp
# Plot the training loss curves
plt.figure(figsize=(15,8))
for key, hist_ in results.items():
plt.plot(hist_['train_loss'], label=key)
plt.title('Spam (NumPy) with Different Batch Sizes & LRs')
plt.xlabel('Epoch')
plt.ylabel('Train Loss')
plt.legend()
plt.show()
The loss curves differ noticeably across batch sizes and learning rates. With very low learning rates the curves stay almost flat and the loss remains high; at the same learning rate, smaller batch sizes converge faster, since they perform more parameter updates per epoch.
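To make the comparison concrete, this short snippet (reusing the results dictionary from the loop above) prints the final training loss and dev accuracy for each combination:
# Final metrics for each (batch size, learning rate) combination.
for key, hist_ in results.items():
    print(f"{key:>16s}  final train loss = {hist_['train_loss'][-1]:.4f}  "
          f"final dev acc = {hist_['val_acc'][-1]:.4f}")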
5. Vanishing and Exploding Gradients¶
We'll build deeper networks in NumPy, controlling:
- Small init or saturating activations (sigmoid/tanh) for vanishing gradients
- Large init or a large learning rate for exploding gradients

We'll track the gradient norm at each batch update to see the pattern.
def gradient_norm(nn, X_batch, y_batch):
_ = nn.forward_propagation(X_batch)
grads = nn.backward_propagation(y_batch)
total_sq = 0
for dw, db in zip(grads['dW'], grads['db']):
total_sq += np.sum(dw**2) + np.sum(db**2)
return np.sqrt(total_sq)
def train_with_grad_tracking(nn, X, y, epochs=20, batch_size=32, lr=0.01):
m = X.shape[0]
grad_norms = []
losses = []
for e in range(epochs):
perm = np.random.permutation(m)
X_shuf = X[perm]
y_shuf = y[perm]
epoch_loss = 0
num_batches = int(np.ceil(m/batch_size))
for i in range(num_batches):
start = i*batch_size
end = min((i+1)*batch_size, m)
X_batch = X_shuf[start:end]
y_batch = y_shuf[start:end]
# measure gradient norm before the update (arbitrary choice; note this adds an extra forward/backward pass)
gn = gradient_norm(nn, X_batch, y_batch)
grad_norms.append(gn)
b_loss = nn.train_step(X_batch, y_batch, lr)
epoch_loss += b_loss*(end-start)
epoch_loss /= m
losses.append(epoch_loss)
return grad_norms, losses
# We'll create artificial data for demonstration
# E.g. 500 samples, dimension=20, 5-output classification
N = 500
D = 20
C = 5
X_demo = np.random.randn(N, D)
y_demo_lbls = np.random.randint(0,C,size=(N,))
y_demo_oh = np.eye(C)[y_demo_lbls]
# 5.1 Vanishing Gradient Example
class SmallInitNet(NeuralNetwork):
def initialize_parameters(self):
# Extremely small init => prone to vanishing
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
W = np.random.normal(0, 1e-2, (self.layer_sizes[i-1], self.layer_sizes[i]))
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
# Saturating activations (tanh/sigmoid) => easier vanishing
vanish_net = SmallInitNet(
layer_sizes=[20, 64, 64, 5],
activations=['tanh','tanh','sigmoid','sigmoid']
)
gn_vanish, loss_vanish = train_with_grad_tracking(
vanish_net,
X_demo, # e.g. a random or real dataset of shape (N, 20)
y_demo_oh, # e.g. one-hot labels of shape (N, 5)
epochs=20,
batch_size=16,
lr=0.01
)
plt.figure(figsize=(10,4))
# Left: gradient norm curve
plt.subplot(1,2,1)
plt.plot(gn_vanish)
plt.title('Vanishing Gradient Norm (Deep Net + Very Small Init + Tanh and Sigmoid)')
plt.xlabel('Batch Updates')
plt.ylabel('Grad Norm')
# Right: training loss curve
plt.subplot(1,2,2)
plt.plot(loss_vanish)
plt.title('Vanishing - Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()
# 5.2 Exploding Gradient Example
class LargeInitNet(NeuralNetwork):
def initialize_parameters(self):
self.weights = []
self.biases = []
for i in range(1, self.num_layers):
W = np.random.normal(0, 1e-1, (self.layer_sizes[i-1], self.layer_sizes[i]))
b = np.zeros((1, self.layer_sizes[i]))
self.weights.append(W)
self.biases.append(b)
explode_net = LargeInitNet(
layer_sizes=[20,64,64,64,5],
activations=['relu','relu','relu','softmax']
)
gn_explode, loss_explode = train_with_grad_tracking(
explode_net,
X_demo, y_demo_oh,
epochs=20,
batch_size=8,
lr=3.0 # intentionally large
)
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(gn_explode)
plt.title('Exploding Gradient Norm (Large Init + LR=3.0)')
plt.xlabel('Batch updates')
plt.ylabel('Grad Norm')
plt.subplot(1,2,2)
plt.plot(loss_explode)
plt.title('Exploding - Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.tight_layout()
plt.show()
Observing the Norm Plots¶
- Vanishing: the gradient norm typically goes near 0, leading to minimal changes in weights.
- Exploding: the gradient norm can skyrocket (sometimes NaN), and training becomes unstable.
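To put rough numbers on these patterns, we can inspect the norms recorded above (gn_vanish and gn_explode from 5.1 and 5.2):
# Compare the gradient norms recorded during the two runs above.
print(f"Vanishing net: first grad norm = {gn_vanish[0]:.2e}, last grad norm = {gn_vanish[-1]:.2e}")
print(f"Exploding net: max grad norm = {np.nanmax(gn_explode):.2e}, "
      f"any NaN: {np.isnan(gn_explode).any()}")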
Conclusion¶
We've built a neural network with pure NumPy:
- It handles feedforward/backprop.
- We tested it on XOR.
- We applied it to spam detection using TF-IDF.
- We tried different batch sizes and learning rates.
- We demonstrated vanishing/exploding gradients by tracking gradient norms in a deeper net.
This clarifies many core concepts that frameworks like PyTorch or TensorFlow manage under the hood.