Lab: Compare models and assess performance (PyTorch)¶

Goals¶

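  • Train two different architectures on the same split of Fashion-MNIST with the same optimizer (Adam) and the same number of epochs; the pretrained ResNet18 uses a lower learning rate (3e-4 instead of 1e-3).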
  • Report accuracy, macro-F1, confusion matrix, parameter count, and wall-clock training time.
  • Plot learning curves to see whether a deeper model is learning faster or overfitting.

Prerequisites¶

pip install torch torchvision matplotlib numpy scikit-learn

In [ ]:
import time
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import FashionMNIST

# Fixed seed for torch's default RNG.
torch.manual_seed(42)

# Backend preference: CUDA first, then Apple MPS (when this torch build exposes it), else CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('device:', device)

# Per-channel statistics passed to transforms.Normalize in both pipelines.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Tile the single grayscale channel three times so the tensor has the
# 3-channel layout that both models' first convolution expects.
class ToRGB:
    """Callable transform that repeats a tensor 3x along its leading dimension."""

    def __call__(self, t: torch.Tensor) -> torch.Tensor:
        tiled = t.repeat(3, 1, 1)
        return tiled


# Steps shared by both pipelines: convert to tensor, expand to 3 channels,
# then normalize with the ImageNet statistics.
_to_model_input = [
    transforms.ToTensor(),
    ToRGB(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
]

# Training pipeline: random flip and padded random crop, then the shared steps.
train_tf = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(28, padding=4),
        *_to_model_input,
    ]
)

# Evaluation pipeline: the shared steps only, with no random augmentation.
eval_tf = transforms.Compose(list(_to_model_input))

# Two views of the same training split: `train_ds` applies the augmenting
# train transform, `val_ds` applies the deterministic eval transform. The
# validation subset must come from `val_ds`; otherwise random flips and crops
# would perturb the validation loss and accuracy.
train_ds = FashionMNIST('./data', train=True, download=True, transform=train_tf)
val_ds = FashionMNIST('./data', train=True, download=True, transform=eval_tf)
test_ds = FashionMNIST('./data', train=False, download=True, transform=eval_tf)

# Hold out the last `val_n` training samples for validation. The index ranges
# are disjoint, so no image is used for both training and validation.
val_n = 5000
split_at = len(train_ds) - val_n
train_idx = list(range(split_at))
val_idx = list(range(split_at, len(train_ds)))
train_subset = torch.utils.data.Subset(train_ds, train_idx)
val_subset = torch.utils.data.Subset(val_ds, val_idx)

# Only the training loader shuffles; evaluation order is fixed.
train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

class_names = train_ds.classes
num_classes = len(class_names)
print('Classes:', class_names)

Architectures¶

  1. SmallCNN — lightweight baseline (good for edge devices).
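  2. ResNet18 (ImageNet-pretrained) — conv1 is replaced with a 3×3, stride-1 convolution and the initial max-pool is removed to suit small 28×28 inputs; fc is replaced with a new 10-class layer. All layers are then fine-tuned (transfer learning).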
In [ ]:
class SmallCNN(nn.Module):
    """Lightweight baseline: two conv/batch-norm/ReLU/max-pool stages and a two-layer head.

    Takes 3-channel input. The first linear layer is sized ``64 * 7 * 7``,
    which matches a 28x28 image halved twice by the two max-pool layers.
    The output has ``num_classes`` logits.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Convolutional stages; each keeps the spatial size (padding=1) and then halves it.
        for c_in, c_out in ((3, 32), (32, 64)):
            layers += [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2),
            ]
        # Classifier head on the flattened feature map.
        layers += [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for the batch ``x``."""
        logits = self.net(x)
        return logits


from torchvision.models import resnet18, ResNet18_Weights


def build_resnet18_fashion() -> nn.Module:
    """Build an ImageNet-pretrained ResNet18 adapted to small images and ``num_classes`` outputs.

    The stem convolution is replaced by a freshly initialized 3x3, stride-1
    convolution and the stem max-pool by an identity. The final fully
    connected layer is replaced by a new ``num_classes``-way linear layer.
    """
    model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
    # New stem: the replacement layers do not downsample the input.
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    # New classification head with the same input width as the original fc.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model


def count_params(model: nn.Module) -> int:
    """Return the total number of elements across ``model``'s trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

Training utility — one place for fair comparison¶

In [ ]:
@dataclass
class RunResult:
    """Outcome of one ``run_experiment`` call for a single model."""

    # Label of the run, as passed to run_experiment.
    name: str
    # Per-epoch metric lists keyed by 'train_loss', 'val_loss' and 'val_acc'.
    history: dict
    # Elapsed wall-clock seconds measured by run_experiment.
    seconds: float
    # Predicted class indices for the test set, concatenated over batches.
    test_pred: np.ndarray
    # Ground-truth class indices for the test set, in the same order as test_pred.
    test_true: np.ndarray


def run_experiment(name: str, model: nn.Module, epochs: int = 3, lr: float = 1e-3) -> RunResult:
    """Train ``model`` with Adam, validate after every epoch, then predict on the test set.

    Reads the module-level ``train_loader``, ``val_loader``, ``test_loader``
    and ``device``.

    Args:
        name: Label used in the per-epoch progress line and stored on the result.
        model: Network to train; it is moved to ``device``.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A ``RunResult`` holding the per-epoch ``train_loss``, ``val_loss`` and
        ``val_acc`` lists, the wall-clock seconds spent in the training loop
        (per-epoch validation included, final test inference excluded), and
        the test-set predictions and labels as NumPy arrays.
    """
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    hist = {'train_loss': [], 'val_loss': [], 'val_acc': []}
    t0 = time.perf_counter()
    for ep in range(epochs):
        # ---- training pass ----
        model.train()
        tl = tn = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            loss.backward()
            opt.step()
            # Weight the batch-mean loss by batch size so the epoch figure is
            # a per-sample mean even when the last batch is smaller.
            tl += loss.item() * xb.size(0)
            tn += xb.size(0)
        train_loss = tl / tn

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        vl = vn = c = tot = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits = model(xb)
                loss = loss_fn(logits, yb)
                vl += loss.item() * xb.size(0)
                vn += xb.size(0)
                c += (logits.argmax(1) == yb).sum().item()
                tot += yb.size(0)
        val_loss = vl / vn
        val_acc = c / tot
        hist['train_loss'].append(train_loss)
        hist['val_loss'].append(val_loss)
        hist['val_acc'].append(val_acc)
        print(f'{name} epoch {ep+1}/{epochs}  train_loss={train_loss:.4f}  val_acc={val_acc:.4f}')

    # Stop the clock before test inference so `seconds` reports training time
    # only and is not inflated by the evaluation pass below.
    sec = time.perf_counter() - t0

    # Test predictions
    preds, trues = [], []
    model.eval()
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            preds.append(model(xb).argmax(1).cpu().numpy())
            trues.append(yb.numpy())
    return RunResult(name, hist, sec, np.concatenate(preds), np.concatenate(trues))


# Build one instance of each architecture just to report its trainable-parameter count.
print('Parameter counts')
sc, rn = SmallCNN(), build_resnet18_fashion()
for label, net in (('SmallCNN', sc), ('ResNet18', rn)):
    print(f' {label}:', count_params(net))

Run both models (short training on CPU/GPU)¶

Increase epochs for graded assignments; three epochs keeps a classroom session under ~10 minutes on many laptops (ResNet18 will be slower).

In [ ]:
EPOCHS = 3

# (run label, model factory, extra run_experiment keyword arguments).
# Each model is constructed just before its own run.
_run_specs = (
    ('SmallCNN', SmallCNN, {}),
    ('ResNet18-ft', build_resnet18_fashion, {'lr': 3e-4}),
)

results = []
for run_label, make_model, extra_kwargs in _run_specs:
    results.append(run_experiment(run_label, make_model(), epochs=EPOCHS, **extra_kwargs))

Learning curves and metric table¶

In [ ]:
# Left panel: validation accuracy per epoch; right panel: validation loss per epoch.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panel_specs = (('val_acc', 'Validation accuracy'), ('val_loss', 'Validation loss'))
for panel, (metric_key, panel_title) in zip(ax, panel_specs):
    for r in results:
        panel.plot(r.history[metric_key], marker='o', label=r.name)
    panel.set_title(panel_title)
    panel.set_xlabel('epoch')
    panel.legend()
    panel.grid(True)
plt.tight_layout()
plt.show()

# One summary row per model: (name, test accuracy, macro-F1, seconds).
rows = [
    (
        r.name,
        accuracy_score(r.test_true, r.test_pred),
        f1_score(r.test_true, r.test_pred, average='macro'),
        r.seconds,
    )
    for r in results
]

print(f'{"Model":<14} {"TestAcc":>10} {"MacroF1":>10} {"Seconds":>10}')
for name, acc, f1, sec in rows:
    print(f'{name:<14} {acc:10.4f} {f1:10.4f} {sec:10.1f}')

Confusion matrices (test set)¶

In [ ]:
# One row-normalized confusion matrix per model, side by side.
fig, ax = plt.subplots(1, len(results), figsize=(6 * len(results), 5))
if len(results) == 1:
    ax = [ax]  # a single panel comes back as a bare Axes, not an array

# Assumes targets are the integer indices 0..num_classes-1 in the same order
# as `class_names` (the dataset's `.classes` list) — TODO confirm.
tick_pos = np.arange(num_classes)
for a, r in zip(ax, results):
    # `labels=` pins the matrix to num_classes x num_classes even if a class
    # never appears in the labels or predictions, so the ticks always line up.
    cm = confusion_matrix(r.test_true, r.test_pred, labels=tick_pos, normalize='true')
    im = a.imshow(cm, cmap='Blues')
    a.set_title(r.name)
    a.set_xlabel('Predicted')
    a.set_ylabel('True')
    # Name the classes on both axes so confusions can be read off directly.
    a.set_xticks(tick_pos)
    a.set_xticklabels(class_names, rotation=45, ha='right')
    a.set_yticks(tick_pos)
    a.set_yticklabels(class_names)
    plt.colorbar(im, ax=a, fraction=0.046)
plt.tight_layout()
plt.show()

Discussion prompts¶

  1. Which model has more parameters? Did it always win on accuracy after three epochs?
  2. If ResNet18 underfits, what would you try next (more epochs, a higher LR, or — had you frozen part of the backbone — unfreezing more layers)? Note that this lab already fine-tunes every layer.
  3. Where does the confusion matrix show systematic mistakes (for example, between visually similar classes such as Shirt vs. T-shirt/top)?
  4. For deployment, would you pick the lighter model even with slightly lower F1? Why?