Deep Learning with PyTorch - Complete Guide to Neural Networks

Master deep learning with PyTorch through practical examples, neural network architectures, and real-world AI applications for computer vision, NLP, and more.

🎯 PyTorch Fundamentals

Tensors - The Building Blocks

import torch
import numpy as np

# Creating tensors
x = torch.tensor([1, 2, 3, 4, 5])
y = torch.zeros(3, 4)
z = torch.randn(2, 3, 4)  # Random normal distribution
ones = torch.ones(3, 3)
eye = torch.eye(3)  # Identity matrix

# From NumPy
numpy_array = np.array([1, 2, 3, 4])
tensor_from_numpy = torch.from_numpy(numpy_array)

# Tensor properties
print(f"Shape: {z.shape}")
print(f"Data type: {z.dtype}")
print(f"Device: {z.device}")

# Move to GPU if available
if torch.cuda.is_available():
    z = z.cuda()  # or z.to('cuda')
    print(f"GPU tensor device: {z.device}")

# Basic operations
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])

# Element-wise operations
addition = a + b  # or torch.add(a, b)
multiplication = a * b  # or torch.mul(a, b)
matrix_mult = torch.mm(a.unsqueeze(0), b.unsqueeze(1))  # Matrix multiplication: (1, 3) @ (3, 1) -> (1, 1)

# Reshaping
reshaped = z.view(-1, 4)  # view requires a contiguous tensor
reshaped_safe = z.reshape(-1, 4)  # reshape copies if the data is non-contiguous
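
Broadcasting and an explicit device handle round out the basics. The snippet below is a small, self-contained sketch of both; the device-selection line is the common PyTorch idiom rather than anything specific to this article.

# Broadcasting: compatible shapes are expanded automatically
row = torch.tensor([[1.0, 2.0, 3.0]])   # shape (1, 3)
col = torch.tensor([[10.0], [20.0]])    # shape (2, 1)
broadcast_sum = row + col               # shape (2, 3)

# Device-agnostic idiom: pick the device once, then create or move tensors on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
w = torch.randn(3, 3, device=device)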

Automatic Differentiation with Autograd

# Enable gradient computation
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)

# Forward pass
z = x**2 + y**3
loss = z.mean()

# Backward pass
loss.backward()

# Access gradients
print(f"dx: {x.grad}")  # dz/dx = 2x = 4
print(f"dy: {y.grad}")  # dz/dy = 3y^2 = 27

# Gradient accumulation example
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

for i in range(3):
    y = (x**2).sum()
    y.backward()
    print(f"Iteration {i+1}, gradients: {x.grad}")
    # x.grad.zero_()  # Uncomment to reset; otherwise gradients accumulate across iterations

# Context managers for gradient control
x = torch.tensor([1.0, 2.0], requires_grad=True)

# Disable gradients temporarily
with torch.no_grad():
    y = x**2  # No gradient computation

# Detach from computational graph
x_detached = x.detach()  # New tensor without gradients
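
To connect autograd with training, here is a minimal hand-rolled gradient-descent step on a toy quadratic. It uses only the autograd calls shown above and is a sketch of what optimizers automate, not a pattern from the original article.

w = torch.tensor([5.0], requires_grad=True)
lr = 0.1

for step in range(3):
    loss = (w - 2.0) ** 2      # minimum at w = 2
    loss.backward()            # populates w.grad
    with torch.no_grad():      # parameter update outside the graph
        w -= lr * w.grad
    w.grad.zero_()             # reset for the next iteration
    print(f"step {step+1}: w = {w.item():.3f}, loss = {loss.item():.3f}")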

🏗️ Building Neural Networks

Using torch.nn Module

import torch.nn as nn
import torch.nn.functional as F

# Simple feedforward network
class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Create model instance
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
print(model)

# Model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

# Using Sequential for simpler models
sequential_model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 10)
)
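
A quick forward pass with a dummy batch confirms that the class-based and Sequential definitions produce the same output shape (the batch below is purely illustrative).

dummy_batch = torch.randn(16, 784)
print(model(dummy_batch).shape)             # torch.Size([16, 10])
print(sequential_model(dummy_batch).shape)  # torch.Size([16, 10])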

Convolutional Neural Networks

class CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CNN, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        
        # Pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # Batch normalization
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 3 * 3, 512)  # Assuming 28x28 input
        self.fc2 = nn.Linear(512, num_classes)
        
        # Dropout
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        # First conv block
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        
        # Second conv block
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        
        # Third conv block
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        
        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

# Create CNN model
cnn_model = CNN(num_classes=10)

# Example input (batch_size, channels, height, width)
dummy_input = torch.randn(32, 1, 28, 28)
output = cnn_model(dummy_input)
print(f"Output shape: {output.shape}")  # Should be (32, 10)

🎓 Training Deep Learning Models

Complete Training Loop

import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Prepare data (example with dummy data)
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
X_val = torch.randn(200, 784)
y_val = torch.randint(0, 10, (200,))

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize model, loss function, and optimizer
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Training function
def train_model(model, train_loader, val_loader, num_epochs=20):
    train_losses = []
    val_losses = []
    val_accuracies = []
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        
        for batch_X, batch_y in train_loader:
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
                
                _, predicted = torch.max(outputs, 1)  # .data is unnecessary inside torch.no_grad()
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        # Calculate averages
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        
        # Store metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        
        # Update learning rate
        scheduler.step()
        
        # Print progress
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')
        print('-' * 50)
    
    return train_losses, val_losses, val_accuracies

# Train the model
train_losses, val_losses, val_accuracies = train_model(model, train_loader, val_loader)
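
The returned metric lists make it easy to spot overfitting at a glance; below is a minimal plotting sketch, assuming matplotlib is installed.

import matplotlib.pyplot as plt

plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='val loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()  # or plt.savefig('loss_curves.png')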

Advanced Training Techniques

# Early stopping implementation
import copy

class EarlyStopping:
    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None
        
    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1
            
        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False
    
    def save_checkpoint(self, model):
        # deepcopy so the stored weights are not mutated by later optimizer steps
        self.best_weights = copy.deepcopy(model.state_dict())
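
# Minimal usage sketch (assumed wiring, not from the original loop): evaluate once
# per epoch and stop when EarlyStopping reports no further improvement.
def fit_with_early_stopping(model, train_loader, val_loader, optimizer, criterion, num_epochs=50):
    stopper = EarlyStopping(patience=5)
    for epoch in range(num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(batch_X), batch_y)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                val_loss += criterion(model(batch_X), batch_y).item()
        val_loss /= len(val_loader)

        if stopper(val_loss, model):
            print(f'Early stopping at epoch {epoch + 1}')
            break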

# Gradient clipping for training stability
def train_with_grad_clipping(model, train_loader, optimizer, criterion, max_norm=1.0):
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Mixed precision training (for faster training on modern GPUs)
from torch.cuda.amp import autocast, GradScaler

def train_with_mixed_precision(model, train_loader, optimizer, criterion):
    scaler = GradScaler()
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        
        # Forward pass with autocast
        with autocast():
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
        
        # Backward pass with scaled gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)
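
Newer PyTorch releases (roughly 2.3 and later) expose the same functionality through the device-agnostic torch.amp namespace. The variant below is a sketch of that spelling and, like the function above, assumes the model and batches already live on the GPU.

from torch import amp

def train_with_amp(model, train_loader, optimizer, criterion, device='cuda'):
    scaler = amp.GradScaler(device)
    model.train()
    total_loss = 0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Autocast picks reduced precision where it is safe to do so
        with amp.autocast(device_type=device):
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()

    return total_loss / len(train_loader)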

🖼️ Computer Vision with PyTorch

Working with Real Image Data

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

# Data preprocessing and augmentation
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)

# ResNet-inspired architecture
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
    
    def forward(self, x):
        residual = x
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(residual)
        out = F.relu(out)
        return out

class SimpleResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)
        
    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride))
        for _ in range(1, num_blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)
    
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Create and train the model
model = SimpleResNet(num_classes=10)
if torch.cuda.is_available():
    model = model.cuda()
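
The earlier training loop kept everything on the CPU; with CIFAR-10 the batches must be moved to the same device as the model. The epoch function below is a minimal sketch of that pattern, and the SGD hyperparameters are only illustrative defaults.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

def train_one_epoch(model, loader):
    model.train()
    running_loss = 0.0
    for images, labels in loader:
        # Keep data on the same device as the model
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)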

Transfer Learning

# Using pre-trained models
import torchvision.models as models

# Load pre-trained ResNet50 (older torchvision used pretrained=True)
pretrained_resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Freeze all parameters
for param in pretrained_resnet.parameters():
    param.requires_grad = False

# Replace the final layer
num_features = pretrained_resnet.fc.in_features
pretrained_resnet.fc = nn.Linear(num_features, 10)  # 10 classes for CIFAR-10

# Only the final layer parameters will be updated
optimizer = optim.Adam(pretrained_resnet.fc.parameters(), lr=0.001)

# Fine-tuning: unfreeze the last few parameter tensors (e.g., the new head's weight and bias)
def unfreeze_layers(model, num_layers=2):
    for param in list(model.parameters())[-num_layers:]:
        param.requires_grad = True

# Unfreeze the final layer's weight and bias for fine-tuning
unfreeze_layers(pretrained_resnet, num_layers=2)

# Create optimizer for unfrozen parameters
optimizer = optim.Adam(filter(lambda p: p.requires_grad, pretrained_resnet.parameters()), lr=0.0001)
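
When deeper layers are unfrozen as well, a common refinement is to give the pretrained backbone a smaller learning rate than the freshly initialized head via optimizer parameter groups. The sketch below unfreezes ResNet's last residual stage (layer4); the learning rates are illustrative, not taken from the article.

# Unfreeze the last residual stage in addition to the head
for param in pretrained_resnet.layer4.parameters():
    param.requires_grad = True

optimizer = optim.Adam([
    {'params': pretrained_resnet.layer4.parameters(), 'lr': 1e-5},  # gentle updates for pretrained weights
    {'params': pretrained_resnet.fc.parameters(), 'lr': 1e-3},      # larger steps for the new head
])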

🔤 Natural Language Processing

Text Processing and RNNs

# Simple text classification with LSTM
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=2):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        # x shape: (batch_size, sequence_length)
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        
        # LSTM output
        lstm_out, (hidden, cell) = self.lstm(embedded)
        
        # Use the last hidden state
        last_hidden = hidden[-1]  # (batch_size, hidden_dim)
        
        # Apply dropout and final layer
        output = self.dropout(last_hidden)
        output = self.fc(output)
        
        return output

# Attention mechanism implementation
class AttentionLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(AttentionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_len, hidden_dim * 2)
        
        # Attention weights
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)  # (batch_size, seq_len, 1)
        
        # Weighted sum
        attended = torch.sum(attention_weights * lstm_out, dim=1)  # (batch_size, hidden_dim * 2)
        
        output = self.fc(attended)
        return output

# Text preprocessing example
import re
from collections import Counter

def build_vocab(texts, max_vocab_size=10000):
    # Tokenize and build vocabulary
    word_counts = Counter()
    for text in texts:
        words = re.findall(r'\b\w+\b', text.lower())
        word_counts.update(words)
    
    # Create vocabulary with special tokens for padding and unknown words
    vocab = {'<pad>': 0, '<unk>': 1}
    for word, count in word_counts.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    
    return vocab

def text_to_indices(text, vocab, max_length=100):
    words = re.findall(r'\b\w+\b', text.lower())
    indices = [vocab.get(word, vocab['<unk>']) for word in words]
    
    # Pad or truncate
    if len(indices) < max_length:
        indices.extend([vocab['<pad>']] * (max_length - len(indices)))
    else:
        indices = indices[:max_length]
    
    return indices
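
A short end-to-end sketch ties these helpers to the TextClassifier defined earlier; the toy texts and labels are made up purely for illustration.

texts = ["the movie was great", "terrible plot and acting", "an absolute delight"]
labels = torch.tensor([1, 0, 1])

vocab = build_vocab(texts)
encoded = torch.tensor([text_to_indices(t, vocab, max_length=10) for t in texts])

clf = TextClassifier(vocab_size=len(vocab), embedding_dim=64, hidden_dim=128, num_classes=2)
logits = clf(encoded)                      # shape (3, 2)
loss = nn.CrossEntropyLoss()(logits, labels)
print(logits.shape, loss.item())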

🚀 Advanced Topics

Generative Adversarial Networks (GANs)

# Simple GAN implementation
class Generator(nn.Module):
    def __init__(self, nz=100, ngf=64, nc=3):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # Input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
        )
    
    def forward(self, input):
        return self.main(input)

class Discriminator(nn.Module):
    def __init__(self, nc=3, ndf=64):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )
    
    def forward(self, input):
        return self.main(input).view(-1, 1).squeeze(1)

# GAN training loop
def train_gan(generator, discriminator, dataloader, num_epochs=100, lr=0.0002):
    criterion = nn.BCELoss()
    
    optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    
    fixed_noise = torch.randn(64, 100, 1, 1)  # held constant to visually track generator progress between epochs
    
    for epoch in range(num_epochs):
        for i, data in enumerate(dataloader):
            # Update Discriminator
            discriminator.zero_grad()
            real_data = data[0]
            batch_size = real_data.size(0)
            label = torch.full((batch_size,), 1., dtype=torch.float)
            
            output = discriminator(real_data)
            errD_real = criterion(output, label)
            errD_real.backward()
            
            # Train with fake
            noise = torch.randn(batch_size, 100, 1, 1)
            fake = generator(noise)
            label.fill_(0.)
            output = discriminator(fake.detach())
            errD_fake = criterion(output, label)
            errD_fake.backward()
            optimizer_D.step()
            
            # Update Generator
            generator.zero_grad()
            label.fill_(1.)
            output = discriminator(fake)
            errG = criterion(output, label)
            errG.backward()
            optimizer_G.step()
            
            if i % 50 == 0:
                print(f'Epoch [{epoch}/{num_epochs}] Batch [{i}/{len(dataloader)}]')
                print(f'Loss_D: {errD_real.item() + errD_fake.item():.4f}, Loss_G: {errG.item():.4f}')
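
Before calling train_gan, DCGAN-style networks are conventionally initialized from narrow normal distributions (the recipe from the DCGAN paper); the helper below applies that initialization and is a setup sketch rather than part of the original article.

def weights_init(m):
    # Conv and BatchNorm layers get N(0, 0.02) / N(1, 0.02) initialization
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

generator = Generator()
discriminator = Discriminator()
generator.apply(weights_init)
discriminator.apply(weights_init)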

Model Deployment

# Save and load models
# Save model state dict
torch.save(model.state_dict(), 'model_weights.pth')

# Save entire model
torch.save(model, 'complete_model.pth')

# Load model
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

# TorchScript for production deployment
# Tracing method
example_input = torch.randn(1, 784)
traced_model = torch.jit.trace(model, example_input)
traced_model.save('traced_model.pt')

# Scripting method (handles control flow better)
scripted_model = torch.jit.script(model)
scripted_model.save('scripted_model.pt')

# Load TorchScript model
loaded_model = torch.jit.load('traced_model.pt')

# ONNX export for cross-platform deployment
import torch.onnx

dummy_input = torch.randn(1, 784)
torch.onnx.export(model, dummy_input, "model.onnx", 
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}})
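
Once exported, the ONNX graph can run without PyTorch at all; the snippet below is a sketch using onnxruntime and assumes it has been installed separately (pip install onnxruntime).

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("model.onnx")
sample = np.random.randn(1, 784).astype(np.float32)
outputs = session.run(None, {"input": sample})  # feed by the exported input name
print(outputs[0].shape)  # (1, 10)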

🎯 Conclusion

PyTorch provides a powerful and flexible framework for deep learning. From basic neural networks to complex architectures like GANs and Transformers, PyTorch's dynamic computation graph and intuitive API make it an excellent choice for both research and production.

The key to mastering PyTorch is practice. Start with simple projects, gradually work your way up to more complex architectures, and always keep the documentation handy. The PyTorch community is vibrant and helpful, so don't hesitate to engage and learn from others.

🚀 Next Steps

  • Build a complete image classification project
  • Experiment with different architectures (ResNet, VGG, DenseNet)
  • Try transfer learning on a custom dataset
  • Implement a simple GAN or VAE
  • Explore PyTorch Lightning for cleaner code
  • Learn about model optimization and quantization
  • Deploy a model using TorchServe or ONNX