🎯 PyTorch Fundamentals
Tensors - The Building Blocks
import torch
import numpy as np
# Creating tensors
x = torch.tensor([1, 2, 3, 4, 5])
y = torch.zeros(3, 4)
z = torch.randn(2, 3, 4) # Random normal distribution
ones = torch.ones(3, 3)
eye = torch.eye(3) # Identity matrix
# From NumPy
numpy_array = np.array([1, 2, 3, 4])
tensor_from_numpy = torch.from_numpy(numpy_array)
# Tensor properties
print(f"Shape: {z.shape}")
print(f"Data type: {z.dtype}")
print(f"Device: {z.device}")
# Move to GPU if available
if torch.cuda.is_available():
    z = z.cuda()  # or z.to('cuda')
    print(f"GPU tensor device: {z.device}")
# Basic operations
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])
# Element-wise operations
addition = a + b # or torch.add(a, b)
multiplication = a * b # or torch.mul(a, b)
matrix_mult = torch.mm(a.unsqueeze(0), b.unsqueeze(1))  # (1, 3) @ (3, 1) matrix product -> shape (1, 1)
# Reshaping
reshaped = z.view(-1, 4) # Reshape to (-1, 4)
reshaped_safe = z.reshape(-1, 4) # Alternative that handles contiguity
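If the same script needs to run on machines with and without a GPU, a common pattern is to pick the device once and create tensors directly on it. A minimal sketch (the variable names here are our own, not part of any API):
# Device-agnostic setup (illustrative sketch)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

w = torch.randn(3, 4, device=device)                          # created directly on the chosen device
v = torch.arange(12, dtype=torch.float32).reshape(3, 4).to(device)  # or moved after creation
print(w.device, v.device)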
Automatic Differentiation with Autograd
# Enable gradient computation
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
# Forward pass
z = x**2 + y**3
loss = z.mean()
# Backward pass
loss.backward()
# Access gradients
print(f"dx: {x.grad}") # dz/dx = 2x = 4
print(f"dy: {y.grad}") # dz/dy = 3y^2 = 27
# Gradient accumulation example
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
for i in range(3):
    y = (x**2).sum()
    y.backward()
    print(f"Iteration {i+1}, gradients: {x.grad}")
    # x.grad.zero_()  # Uncomment to reset gradients; otherwise they accumulate across iterations
# Context managers for gradient control
x = torch.tensor([1.0, 2.0], requires_grad=True)
# Disable gradients temporarily
with torch.no_grad():
    y = x**2  # No gradient computation
# Detach from computational graph
x_detached = x.detach() # New tensor without gradients
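If you want gradients without accumulating them into .grad at all, torch.autograd.grad returns them directly. A minimal sketch reusing the values from the earlier example:
# Functional gradient computation (illustrative sketch; nothing is written to .grad)
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
z = (x**2 + y**3).sum()
dz_dx, dz_dy = torch.autograd.grad(z, (x, y))
print(dz_dx, dz_dy)  # tensor([4.]), tensor([27.])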
🏗️ Building Neural Networks
Using torch.nn Module
import torch.nn as nn
import torch.nn.functional as F
# Simple feedforward network
class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
# Create model instance
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
print(model)
# Model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
# Using Sequential for simpler models
sequential_model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 10)
)
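As a quick, illustrative sanity check, both the class-based and the Sequential model map a (batch, 784) input to (batch, 10) logits:
# Sanity check: same input/output contract for both models (illustrative)
dummy = torch.randn(16, 784)
print(model(dummy).shape)             # torch.Size([16, 10])
print(sequential_model(dummy).shape)  # torch.Size([16, 10])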
Convolutional Neural Networks
class CNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CNN, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        # Pooling layer
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Batch normalization
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 3 * 3, 512)  # Assuming 28x28 input
        self.fc2 = nn.Linear(512, num_classes)
        # Dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # First conv block
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        # Second conv block
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        # Third conv block
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
# Create CNN model
cnn_model = CNN(num_classes=10)
# Example input (batch_size, channels, height, width)
dummy_input = torch.randn(32, 1, 28, 28)
output = cnn_model(dummy_input)
print(f"Output shape: {output.shape}") # Should be (32, 10)
🎓 Training Deep Learning Models
Complete Training Loop
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Prepare data (example with dummy data)
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
X_val = torch.randn(200, 784)
y_val = torch.randint(0, 10, (200,))
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
# Initialize model, loss function, and optimizer
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
# Training function
def train_model(model, train_loader, val_loader, num_epochs=20):
    train_losses = []
    val_losses = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            # Zero gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()

        # Calculate averages
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total

        # Store metrics
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        # Update learning rate
        scheduler.step()

        # Print progress
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')
        print('-' * 50)

    return train_losses, val_losses, val_accuracies
# Train the model
train_losses, val_losses, val_accuracies = train_model(model, train_loader, val_loader)
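To inspect the returned metrics, here is a small optional plotting sketch (it assumes matplotlib is installed; it is not required for training):
# Optional: visualize the training curves (assumes matplotlib is available)
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='val loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()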
Advanced Training Techniques
# Early stopping implementation
class EarlyStopping:
    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights:
                    model.load_state_dict(self.best_weights)
                return True
        return False

    def save_checkpoint(self, model):
        # Clone the tensors so the snapshot is not mutated by later optimizer steps
        self.best_weights = {k: v.detach().clone() for k, v in model.state_dict().items()}
# Gradient clipping for training stability
def train_with_grad_clipping(model, train_loader, optimizer, criterion, max_norm=1.0):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)
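Here is an illustrative sketch of how the two pieces above can be wired together with the earlier data loaders; the patience value and epoch count are placeholders, not tuned settings:
# Illustrative wiring of EarlyStopping with the gradient-clipping training step
early_stopping = EarlyStopping(patience=5)
for epoch in range(100):
    train_loss = train_with_grad_clipping(model, train_loader, optimizer, criterion)

    # Validation loss, computed as in train_model()
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            val_loss += criterion(model(batch_X), batch_y).item()
    val_loss /= len(val_loader)

    if early_stopping(val_loss, model):
        # Best weights have already been restored inside EarlyStopping
        print(f"Stopping early at epoch {epoch+1} (best val loss: {early_stopping.best_loss:.4f})")
        break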
# Mixed precision training (for faster training on modern GPUs)
from torch.cuda.amp import autocast, GradScaler  # newer PyTorch releases also expose torch.amp.autocast / torch.amp.GradScaler

def train_with_mixed_precision(model, train_loader, optimizer, criterion):
    scaler = GradScaler()
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        # Forward pass with autocast (eligible ops run in float16)
        with autocast():
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
        # Backward pass with scaled gradients to avoid float16 underflow
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(train_loader)
🖼️ Computer Vision with PyTorch
Working with Real Image Data
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
# Data preprocessing and augmentation
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform_test)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)
# ResNet-inspired architecture
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = x
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(residual)
        out = F.relu(out)
        return out
class SimpleResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(SimpleResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(256, num_classes)

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride))
        for _ in range(1, num_blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
# Create and train the model
model = SimpleResNet(num_classes=10)
if torch.cuda.is_available():
    model = model.cuda()
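Before training, a quick illustrative shape check with a CIFAR-10-sized batch (3x32x32 images) confirms the network emits one logit per class:
# Sanity-check the architecture with a CIFAR-10-shaped batch (illustrative)
dummy_batch = torch.randn(8, 3, 32, 32)
if torch.cuda.is_available():
    dummy_batch = dummy_batch.cuda()
print(model(dummy_batch).shape)  # torch.Size([8, 10])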
Transfer Learning
# Using pre-trained models
import torchvision.models as models
# Load pre-trained ResNet50
pretrained_resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)  # pretrained=True in older torchvision versions
# Freeze all parameters
for param in pretrained_resnet.parameters():
    param.requires_grad = False
# Replace the final layer
num_features = pretrained_resnet.fc.in_features
pretrained_resnet.fc = nn.Linear(num_features, 10) # 10 classes for CIFAR-10
# Only the final layer parameters will be updated
optimizer = optim.Adam(pretrained_resnet.fc.parameters(), lr=0.001)
# Fine-tuning: Unfreeze last few layers
def unfreeze_layers(model, num_layers=2):
    # Unfreeze the last `num_layers` parameter tensors
    # (with num_layers=2 on ResNet, that is the final fc layer's weight and bias)
    params = list(model.named_parameters())
    for name, param in params[-num_layers:]:
        param.requires_grad = True
# Unfreeze the final layer's parameters for fine-tuning
unfreeze_layers(pretrained_resnet, num_layers=2)
# Create optimizer for unfrozen parameters
optimizer = optim.Adam(filter(lambda p: p.requires_grad, pretrained_resnet.parameters()), lr=0.0001)
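Another common fine-tuning pattern, sketched here with illustrative (untuned) learning rates, is to give the pre-trained backbone a much smaller learning rate than the new head via optimizer parameter groups:
# Discriminative learning rates via parameter groups (illustrative values)
backbone_params = [p for name, p in pretrained_resnet.named_parameters() if not name.startswith('fc')]
head_params = pretrained_resnet.fc.parameters()
optimizer = optim.Adam([
    {'params': backbone_params, 'lr': 1e-5},  # frozen params simply receive no gradients
    {'params': head_params, 'lr': 1e-3},
])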
🔤 Natural Language Processing
Text Processing and RNNs
# Simple text classification with LSTM
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=2):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # x shape: (batch_size, sequence_length)
        embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        # LSTM output
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use the last hidden state
        last_hidden = hidden[-1]  # (batch_size, hidden_dim)
        # Apply dropout and final layer
        output = self.dropout(last_hidden)
        output = self.fc(output)
        return output
# Attention mechanism implementation
class AttentionLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(AttentionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_len, hidden_dim * 2)
        # Attention weights
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)  # (batch_size, seq_len, 1)
        # Weighted sum
        attended = torch.sum(attention_weights * lstm_out, dim=1)  # (batch_size, hidden_dim * 2)
        output = self.fc(attended)
        return output
# Text preprocessing example
import re
from collections import Counter
def build_vocab(texts, max_vocab_size=10000):
    # Tokenize and build vocabulary
    word_counts = Counter()
    for text in texts:
        words = re.findall(r'\b\w+\b', text.lower())
        word_counts.update(words)
    # Create vocabulary with special padding and unknown tokens
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, count in word_counts.most_common(max_vocab_size - 2):
        vocab[word] = len(vocab)
    return vocab

def text_to_indices(text, vocab, max_length=100):
    words = re.findall(r'\b\w+\b', text.lower())
    indices = [vocab.get(word, vocab['<UNK>']) for word in words]
    # Pad or truncate to a fixed length
    if len(indices) < max_length:
        indices.extend([vocab['<PAD>']] * (max_length - len(indices)))
    else:
        indices = indices[:max_length]
    return indices
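A small end-to-end sketch tying these pieces together; the toy texts and labels below are made up purely for illustration:
# Illustrative usage with a tiny toy corpus
texts = ["the movie was great", "terrible plot and bad acting", "great acting, great plot"]
labels = torch.tensor([1, 0, 1])

vocab = build_vocab(texts)
X = torch.tensor([text_to_indices(t, vocab, max_length=10) for t in texts])  # (3, 10)

clf = TextClassifier(vocab_size=len(vocab), embedding_dim=32, hidden_dim=64, num_classes=2)
logits = clf(X)
print(logits.shape)  # torch.Size([3, 2])
loss = nn.CrossEntropyLoss()(logits, labels)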
🚀 Advanced Topics
Generative Adversarial Networks (GANs)
# Simple GAN implementation
class Generator(nn.Module):
    def __init__(self, nz=100, ngf=64, nc=3):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            # Input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
        )

    def forward(self, input):
        return self.main(input)
class Discriminator(nn.Module):
    def __init__(self, nc=3, ndf=64):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
            nn.Sigmoid()
        )

    def forward(self, input):
        return self.main(input).view(-1, 1).squeeze(1)
# GAN training loop
def train_gan(generator, discriminator, dataloader, num_epochs=100, lr=0.0002):
    criterion = nn.BCELoss()
    optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
    optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    # Fixed noise can be reused between epochs to visually track generator progress
    fixed_noise = torch.randn(64, 100, 1, 1)

    for epoch in range(num_epochs):
        for i, data in enumerate(dataloader):
            # Update Discriminator: train with real images
            discriminator.zero_grad()
            real_data = data[0]
            batch_size = real_data.size(0)
            label = torch.full((batch_size,), 1., dtype=torch.float)
            output = discriminator(real_data)
            errD_real = criterion(output, label)
            errD_real.backward()

            # Train with fake images (detach so no generator gradients are computed here)
            noise = torch.randn(batch_size, 100, 1, 1)
            fake = generator(noise)
            label.fill_(0.)
            output = discriminator(fake.detach())
            errD_fake = criterion(output, label)
            errD_fake.backward()
            optimizer_D.step()

            # Update Generator: try to make the discriminator label fakes as real
            generator.zero_grad()
            label.fill_(1.)
            output = discriminator(fake)
            errG = criterion(output, label)
            errG.backward()
            optimizer_G.step()

            if i % 50 == 0:
                print(f'Epoch [{epoch}/{num_epochs}] Batch [{i}/{len(dataloader)}]')
                print(f'Loss_D: {errD_real.item() + errD_fake.item():.4f}, Loss_G: {errG.item():.4f}')
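After training, samples can be drawn from the generator for inspection. The sketch below is illustrative: it assumes a dataloader of 64x64 RGB images was used for training (the call is left commented out) and creates its own noise tensor:
# Illustrative sampling from a trained generator
netG = Generator()
netD = Discriminator()
# train_gan(netG, netD, dataloader)  # assumes a DataLoader of 64x64 RGB images

netG.eval()
with torch.no_grad():
    noise = torch.randn(16, 100, 1, 1)
    samples = netG(noise)  # (16, 3, 64, 64), values in [-1, 1] because of the final Tanh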
Model Deployment
# Save and load models
# Save model state dict
torch.save(model.state_dict(), 'model_weights.pth')
# Save the entire model (requires the class definition to be importable when loading)
torch.save(model, 'complete_model.pth')
# Load model
model = SimpleNet(input_size=784, hidden_size=256, num_classes=10)
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()
# TorchScript for production deployment
# Tracing method
example_input = torch.randn(1, 784)
traced_model = torch.jit.trace(model, example_input)
traced_model.save('traced_model.pt')
# Scripting method (handles control flow better)
scripted_model = torch.jit.script(model)
scripted_model.save('scripted_model.pt')
# Load TorchScript model
loaded_model = torch.jit.load('traced_model.pt')
# ONNX export for cross-platform deployment
import torch.onnx
dummy_input = torch.randn(1, 784)
torch.onnx.export(model, dummy_input, "model.onnx",
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}})
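To verify the exported graph outside PyTorch, here is a hedged sketch using onnxruntime (assumes the onnxruntime package is installed; the 'input' key matches the input_names given above):
# Run the exported ONNX model with onnxruntime (assumes 'pip install onnxruntime')
import onnxruntime as ort
import numpy as np

session = ort.InferenceSession("model.onnx")
inputs = {"input": np.random.randn(1, 784).astype(np.float32)}
outputs = session.run(None, inputs)
print(outputs[0].shape)  # (1, 10)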
🎯 Conclusion
PyTorch provides a powerful and flexible framework for deep learning. Whether you are building basic neural networks or complex architectures like GANs and Transformers, its dynamic computation graph and intuitive API make it an excellent choice for both research and production.
The key to mastering PyTorch is practice. Start with simple projects, gradually work your way up to more complex architectures, and always keep the documentation handy. The PyTorch community is vibrant and helpful, so don't hesitate to engage and learn from others.