Lab: Compare models and assess performance (PyTorch)¶
Goals¶
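- Train two different architectures on the same split of Fashion-MNIST with the same optimizer (Adam) and epoch budget; only the learning rate differs (1e-3 for SmallCNN, 3e-4 for the fine-tuned ResNet18).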
- Report accuracy, macro-F1, confusion matrix, parameter count, and wall-clock training time.
- Plot learning curves to see whether a deeper model is learning faster or overfitting.
Prerequisites¶
pip install torch torchvision matplotlib numpy scikit-learn
In [ ]:
import time
from dataclasses import dataclass
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import FashionMNIST
# Seed torch's global RNG so repeated runs start from the same random state.
torch.manual_seed(42)

# Accelerator choice: CUDA first, then Apple MPS (only when this torch build
# exposes the backend and reports it available), otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('device:', device)

# Per-channel mean / std handed to transforms.Normalize further down.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class ToRGB:
    """Tile an image tensor three times along its first (channel) axis.

    Used after ``ToTensor`` so a single-channel grayscale image becomes a
    3-channel tensor, which is the input shape ResNet expects.
    """

    def __call__(self, t: torch.Tensor) -> torch.Tensor:
        # Repeat counts per axis: 3x along channels, height/width untouched.
        channel_copies, height_copies, width_copies = 3, 1, 1
        return t.repeat(channel_copies, height_copies, width_copies)
def _to_model_input():
    """Return fresh ToTensor -> ToRGB -> Normalize steps used by both pipelines."""
    return [
        transforms.ToTensor(),
        ToRGB(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]


# Training pipeline: random flip and padded random crop first, then the
# shared tensor conversion / normalisation steps.
train_tf = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(28, padding=4),
        *_to_model_input(),
    ]
)

# Evaluation pipeline: no augmentation, only the deterministic steps.
eval_tf = transforms.Compose(_to_model_input())
# Two views of the official training split: one with augmentation (used for
# fitting) and one with the deterministic eval transform (used for validation).
train_ds = FashionMNIST('./data', train=True, download=True, transform=train_tf)
val_ds = FashionMNIST('./data', train=True, download=True, transform=eval_tf)
test_ds = FashionMNIST('./data', train=False, download=True, transform=eval_tf)

# Hold out the last `val_n` training examples for validation.
val_n = 5000
split_at = len(train_ds) - val_n
train_idx = list(range(split_at))
val_idx = list(range(split_at, len(train_ds)))
train_subset = torch.utils.data.Subset(train_ds, train_idx)
# Validation indexes the un-augmented view: random flips/crops would make
# val_loss / val_acc noisy and not comparable with the test-set metrics.
val_subset = torch.utils.data.Subset(val_ds, val_idx)

# Only the training loader shuffles; evaluation order is kept fixed.
train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

class_names = train_ds.classes
num_classes = len(class_names)
print('Classes:', class_names)
Architectures¶
- SmallCNN — lightweight baseline (good for edge devices).
- ResNet18 (ImageNet pretrained) — replace `conv1` (and drop the max-pool) for small 28×28 inputs, reset `fc` for 10 classes, then fine-tune all layers (transfer learning).
In [ ]:
class SmallCNN(nn.Module):
    """Lightweight two-stage CNN baseline for 3x28x28 inputs.

    Two Conv-BatchNorm-ReLU-MaxPool stages (32 then 64 channels) halve the
    spatial size twice (28 -> 14 -> 7); the flattened 64*7*7 features feed a
    128-unit hidden layer with dropout and a linear classifier.

    Args:
        n_classes: Number of output logits (int). When omitted, the
            module-level ``num_classes`` is used, so ``SmallCNN()`` behaves
            exactly as before.
    """

    def __init__(self, n_classes=None):
        super().__init__()
        if n_classes is None:
            # Fall back to the dataset-derived global for existing callers.
            n_classes = num_classes
        self.net = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Flatten(),
            # 64 channels x 7 x 7 spatial positions after two 2x poolings of 28x28.
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(128, n_classes),
        )

    def forward(self, x):
        """Return raw class logits for the image batch ``x``."""
        return self.net(x)
from torchvision.models import resnet18, ResNet18_Weights
def build_resnet18_fashion() -> nn.Module:
    """Build an ImageNet-pretrained ResNet18 adapted to small inputs.

    The stem convolution is swapped for a 3x3, stride-1 layer and the max-pool
    is replaced by an identity, so the input is not downsampled by the stem.
    The classifier head is replaced by a linear layer with ``num_classes``
    outputs. The two replaced layers are newly constructed; every other layer
    keeps the loaded weights.
    """
    model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
def count_params(model: nn.Module) -> int:
    """Return the number of scalar parameters in ``model`` that require grad."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
Training utility — one place for fair comparison¶
In [ ]:
@dataclass
class RunResult:
    """Outcome of a single ``run_experiment`` call."""

    # Label used in log lines, plot legends and the metric table.
    name: str
    # Per-epoch curves keyed by 'train_loss', 'val_loss' and 'val_acc'.
    history: dict
    # Elapsed wall-clock seconds reported by run_experiment.
    seconds: float
    # Predicted class indices for the test set.
    test_pred: np.ndarray
    # Ground-truth class indices for the test set, aligned with test_pred.
    test_true: np.ndarray
def run_experiment(name: str, model: nn.Module, epochs: int = 3, lr: float = 1e-3) -> RunResult:
    """Train ``model`` with Adam, track validation metrics, then predict on test.

    Reads the module-level ``train_loader``, ``val_loader``, ``test_loader``
    and ``device``.

    Args:
        name: Label used in the per-epoch log line and stored in the result.
        model: Network to train; it is moved to ``device`` first.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to ``torch.optim.Adam``.

    Returns:
        A ``RunResult`` holding per-epoch ``train_loss`` / ``val_loss`` /
        ``val_acc`` lists, the elapsed seconds of the training loop
        (including per-epoch validation, excluding the final test-set
        inference), and the test-set predictions and labels as NumPy arrays.
    """
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    hist = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    t0 = time.perf_counter()
    for ep in range(epochs):
        # ---- training pass ----
        model.train()
        loss_sum, seen = 0.0, 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward()
            opt.step()
            # loss is a batch mean; weight by batch size so the epoch figure
            # is a true per-example average even with a short last batch.
            batch = xb.size(0)
            loss_sum += loss.item() * batch
            seen += batch
        train_loss = loss_sum / seen

        # ---- validation pass ----
        model.eval()
        val_sum, val_seen, correct = 0.0, 0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits = model(xb)
                batch = xb.size(0)
                val_sum += loss_fn(logits, yb).item() * batch
                val_seen += batch
                correct += (logits.argmax(1) == yb).sum().item()
        val_loss = val_sum / val_seen
        val_acc = correct / val_seen

        hist['train_loss'].append(train_loss)
        hist['val_loss'].append(val_loss)
        hist['val_acc'].append(val_acc)
        print(f'{name} epoch {ep+1}/{epochs} train_loss={train_loss:.4f} val_acc={val_acc:.4f}')

    # Stop the clock before test inference so the reported time reflects
    # training only and is not inflated by the test-set pass.
    sec = time.perf_counter() - t0

    # Test predictions
    preds, trues = [], []
    model.eval()
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            preds.append(model(xb).argmax(1).cpu().numpy())
            trues.append(yb.numpy())
    return RunResult(name, hist, sec, np.concatenate(preds), np.concatenate(trues))
# Report the size of each architecture using one throw-away instance apiece.
print('Parameter counts')
sc = SmallCNN()
rn = build_resnet18_fashion()
for label, net in (('SmallCNN', sc), ('ResNet18', rn)):
    print(f' {label}:', count_params(net))
Run both models (short training on CPU/GPU)¶
Increase epochs for graded assignments; three epochs keeps a classroom session under ~10 minutes on many laptops (ResNet18 will be slower).
In [ ]:
EPOCHS = 3

# (display name, model factory, extra keyword arguments for run_experiment)
_RUNS = (
    ('SmallCNN', SmallCNN, {}),
    ('ResNet18-ft', build_resnet18_fashion, {'lr': 3e-4}),
)

# Each model is built immediately before its own run, one after the other.
results = []
for run_name, make_model, extra in _RUNS:
    results.append(run_experiment(run_name, make_model(), epochs=EPOCHS, **extra))
Learning curves and metric table¶
In [ ]:
# Left panel: validation accuracy. Right panel: train and validation loss for
# each model, so a widening train/val gap (overfitting) is visible.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
acc_ax, loss_ax = ax

for r in results:
    # 1-based epoch numbers, matching the per-epoch log lines.
    epoch_no = range(1, len(r.history['val_acc']) + 1)
    acc_ax.plot(epoch_no, r.history['val_acc'], marker='o', label=r.name)
    val_line, = loss_ax.plot(
        epoch_no, r.history['val_loss'], marker='o', label=f'{r.name} (val)'
    )
    # Same colour as the model's validation curve, dashed to tell them apart.
    loss_ax.plot(
        epoch_no,
        r.history['train_loss'],
        marker='x',
        linestyle='--',
        color=val_line.get_color(),
        label=f'{r.name} (train)',
    )

acc_ax.set_title('Validation accuracy')
loss_ax.set_title('Train vs. validation loss')
for panel in (acc_ax, loss_ax):
    panel.set_xlabel('epoch')
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()
# One (name, test accuracy, macro-F1, seconds) tuple per finished run.
rows = [
    (
        r.name,
        accuracy_score(r.test_true, r.test_pred),
        f1_score(r.test_true, r.test_pred, average='macro'),
        r.seconds,
    )
    for r in results
]

# Fixed-width table: left-aligned model name, right-aligned numeric columns.
header = f'{"Model":<14} {"TestAcc":>10} {"MacroF1":>10} {"Seconds":>10}'
print(header)
for name, acc, f1, sec in rows:
    print(f'{name:<14} {acc:10.4f} {f1:10.4f} {sec:10.1f}')
Confusion matrices (test set)¶
In [ ]:
# One row-normalised confusion matrix per model, side by side.
# squeeze=False always returns a 2-D axes array, so a single result needs no
# special-casing; row 0 holds the panels.
fig, ax = plt.subplots(1, len(results), figsize=(6 * len(results), 5), squeeze=False)
ax = ax[0]

# Pin the label order so row/column i always corresponds to class_names[i].
class_ids = np.arange(len(class_names))

for a, r in zip(ax, results):
    cm = confusion_matrix(r.test_true, r.test_pred, labels=class_ids, normalize='true')
    # Shared 0..1 colour scale keeps the panels visually comparable.
    im = a.imshow(cm, cmap='Blues', vmin=0.0, vmax=1.0)
    a.set_title(r.name)
    a.set_xlabel('Predicted')
    a.set_ylabel('True')
    # Class-name ticks make confused pairs readable straight off the axes.
    a.set_xticks(class_ids)
    a.set_xticklabels(class_names, rotation=90)
    a.set_yticks(class_ids)
    a.set_yticklabels(class_names)
    plt.colorbar(im, ax=a, fraction=0.046)

plt.tight_layout()
plt.show()
Discussion prompts¶
- Which model has more parameters? Did it always win on accuracy after three epochs?
- If ResNet18 underfits, what would you try next (more epochs, higher LR, unfreeze more layers)?
- Where does the confusion matrix show systematic mistakes (mutually confused classes such as `Shirt` vs `T-shirt/top`)?
- For deployment, would you pick the lighter model even with slightly lower F1? Why?