Slide 1: Introduction to Adam Optimizer
Adam (Adaptive Moment Estimation) combines the advantages of two other optimizers, RMSprop and momentum. It maintains exponentially decaying averages of both past gradients and past squared gradients, and uses them to adapt the learning rate for each parameter individually.
The update rules are:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)g_t$$
$$v_t = \beta_2 v_{t-1} + (1-\beta_2)g_t^2$$
$$\hat{m}_t = \frac{m_t}{1-\beta_1^t}$$
$$\hat{v}_t = \frac{v_t}{1-\beta_2^t}$$
$$\theta_{t+1} = \theta_t - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon}\hat{m}_t$$

where $g_t$ is the gradient at step $t$, $m_t$ and $v_t$ are the first- and second-moment estimates, $\beta_1$ and $\beta_2$ are their decay rates, $\eta$ is the learning rate, and $\epsilon$ guards against division by zero.
Slide 2: Basic Adam Implementation
The core implementation of the Adam optimizer maintains moving averages of both the gradients and their squares, applying bias correction to compensate for the zero initialization of these estimates during early iterations.
import numpy as np

class Adam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def initialize(self, params_shape):
        self.m = np.zeros(params_shape)
        self.v = np.zeros(params_shape)

    def update(self, params, grads):
        if self.m is None:
            self.initialize(params.shape)
        self.t += 1
        # Update biased first moment estimate
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        # Update biased second moment estimate
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(grads)
        # Compute bias-corrected first moment estimate
        m_hat = self.m / (1 - self.beta1**self.t)
        # Compute bias-corrected second moment estimate
        v_hat = self.v / (1 - self.beta2**self.t)
        # Update parameters
        params -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return params
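As a quick smoke test (a sketch of my own, not part of the original slides), minimizing f(x) = x², whose gradient is 2x, should drive the parameter toward zero:

# Minimal usage check for the Adam class above
opt = Adam(learning_rate=0.1)
x = np.array([5.0])
for _ in range(200):
    x = opt.update(x, 2 * x)
print(x)  # expected to end up near 0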
Slide 3: Linear Regression with Adam
A practical example applies Adam to linear regression, minimizing the mean squared error loss on a simple synthetic dataset.
import numpy as np
from sklearn.datasets import make_regression

# Generate synthetic data
X, y = make_regression(n_samples=1000, n_features=1, noise=0.1, random_state=42)
y = y.reshape(-1, 1)

# Initialize parameters
w = np.random.randn(1, 1)
b = np.zeros(1)

# Create Adam optimizer instances
adam_w = Adam(learning_rate=0.01)
adam_b = Adam(learning_rate=0.01)

# Training loop
for epoch in range(100):
    # Forward pass
    y_pred = np.dot(X, w) + b
    # Compute gradients
    dw = np.dot(X.T, (y_pred - y)) / len(X)
    db = np.sum(y_pred - y) / len(X)
    # Update parameters using Adam
    w = adam_w.update(w, dw)
    b = adam_b.update(b, db)
    if epoch % 10 == 0:
        loss = np.mean(np.square(y_pred - y))
        print(f'Epoch {epoch}, Loss: {loss:.6f}')
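As an optional check (my own addition, assuming the loop above has run), the learned parameters can be compared with the closed-form least-squares fit; with enough epochs the two should agree closely:

# Compare Adam's solution against ordinary least squares (illustrative)
X_aug = np.hstack([X, np.ones((len(X), 1))])  # append a bias column
w_ols, b_ols = np.linalg.lstsq(X_aug, y, rcond=None)[0]
print(f'Adam: w={w[0, 0]:.4f}, b={b[0]:.4f}')
print(f'OLS:  w={w_ols[0]:.4f}, b={b_ols[0]:.4f}')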
Slide 4: Neural Network with Adam
A small two-layer network shows the common pattern of giving each parameter tensor its own Adam instance, so every weight matrix and bias vector carries independent moment estimates.
import numpy as np
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))
        # Initialize Adam optimizers for each parameter
        self.adam_W1 = Adam()
        self.adam_b1 = Adam()
        self.adam_W2 = Adam()
        self.adam_b2 = Adam()

    def forward(self, X):
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = np.maximum(0, self.z1)      # ReLU activation
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = 1 / (1 + np.exp(-self.z2))  # Sigmoid activation
        return self.a2

    def backward(self, X, y, output):
        m = X.shape[0]
        dz2 = output - y
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * (self.z1 > 0)             # ReLU derivative
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        # Update parameters using Adam
        self.W1 = self.adam_W1.update(self.W1, dW1)
        self.b1 = self.adam_b1.update(self.b1, db1)
        self.W2 = self.adam_W2.update(self.W2, dW2)
        self.b2 = self.adam_b2.update(self.b2, db2)
Slide 5: Training Neural Network with Adam
The training process shows how Adam adapts the learning rate of each parameter individually, which typically yields faster convergence than plain gradient descent on problems like this one.
# Generate synthetic classification data
X = np.random.randn(1000, 10)
y = (np.sum(X, axis=1) > 0).astype(float).reshape(-1, 1)

# Create and train neural network
nn = NeuralNetwork(input_size=10, hidden_size=5, output_size=1)

# Training loop
for epoch in range(1000):
    # Forward pass
    output = nn.forward(X)
    # Binary cross-entropy loss (small offset avoids log(0))
    loss = -np.mean(y * np.log(output + 1e-8) + (1 - y) * np.log(1 - output + 1e-8))
    # Backward pass and parameter update
    nn.backward(X, y, output)
    if epoch % 100 == 0:
        accuracy = np.mean((output > 0.5) == y)
        print(f'Epoch {epoch}, Loss: {loss:.6f}, Accuracy: {accuracy:.4f}')
Slide 6: Adam with Weight Decay (AdamW)
AdamW modifies Adam by decoupling weight decay from the gradient-based update: the weights are shrunk by a direct multiplicative factor rather than having an L2 penalty folded into the gradients, which typically improves generalization.
class AdamW:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.m = None
        self.v = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)
        self.t += 1
        # Weight decay term, applied directly to the weights
        params = params * (1 - self.learning_rate * self.weight_decay)
        # Update moment estimates
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(grads)
        # Bias correction
        m_hat = self.m / (1 - self.beta1**self.t)
        v_hat = self.v / (1 - self.beta2**self.t)
        # Update parameters
        params -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        return params
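To see the decoupling in isolation (a minimal sketch using the classes defined above): with zero gradients, AdamW still shrinks the weights, while plain Adam leaves them untouched.

# Zero gradients: AdamW decays weights, Adam does not (illustrative)
w_adam, w_adamw = np.ones(3), np.ones(3)
opt_a, opt_w = Adam(), AdamW(weight_decay=0.1)
for _ in range(100):
    w_adam = opt_a.update(w_adam, np.zeros(3))
    w_adamw = opt_w.update(w_adamw, np.zeros(3))
print(w_adam)   # unchanged: 1.0
print(w_adamw)  # shrunk by (1 - lr * wd)^100, roughly 0.99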
Slide 7: Implementing AMSGrad Variant
AMSGrad addresses potential convergence failures of Adam by maintaining the running maximum of the second-moment estimate and using that maximum in the denominator of the update, so the effective per-parameter learning rate never increases.
class AMSGrad:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.v_hat = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)
            self.v_hat = np.zeros_like(params)
        self.t += 1
        # Update moment estimates
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(grads)
        # Keep the running maximum of the second moment
        self.v_hat = np.maximum(self.v_hat, self.v)
        # Bias-correct the first moment (AMSGrad uses v_hat uncorrected)
        m_hat = self.m / (1 - self.beta1**self.t)
        # Update parameters using the maximum
        params -= self.learning_rate * m_hat / (np.sqrt(self.v_hat) + self.epsilon)
        return params
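A brief illustration (my own sketch): because v_hat can only grow, a single large gradient permanently caps the step size, unlike in Adam where the denominator can shrink again.

# v_hat is monotonically non-decreasing (illustrative)
opt = AMSGrad(learning_rate=0.01)
p = np.array([1.0])
for g in [5.0, 0.1, 0.1, 0.1]:   # one large gradient, then small ones
    p = opt.update(p, np.array([g]))
    print(float(opt.v_hat[0]))    # stays at the peak set by the large gradient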
Slide 8: Comparison of Optimizers
Comparing the Adam variants on the Rosenbrock function, a classic non-convex benchmark, illustrates their relative convergence behavior.
import numpy as np
import matplotlib.pyplot as plt

# Rosenbrock function: a classic ill-conditioned test problem
def rosenbrock(x, y):
    return (1 - x)**2 + 100 * (y - x**2)**2

def rosenbrock_grad(x, y):
    dx = 2 * (x - 1) - 400 * x * (y - x**2)
    dy = 200 * (y - x**2)
    return np.array([dx, dy])

# Initialize optimizers; each trajectory gets its own instance
# so the moment estimates are not shared between parameters
optimizers = {
    'adam': Adam(learning_rate=0.001),
    'adamw': AdamW(learning_rate=0.001),
    'amsgrad': AMSGrad(learning_rate=0.001)
}

# Training history
history = {name: {'x': [], 'y': [], 'loss': []} for name in optimizers}

# Optimization loop: each optimizer follows its own trajectory from (1.5, 1.5)
for name, opt in optimizers.items():
    params = np.array([1.5, 1.5])
    for step in range(1000):
        grads = rosenbrock_grad(params[0], params[1])
        params = opt.update(params, grads)
        history[name]['x'].append(float(params[0]))
        history[name]['y'].append(float(params[1]))
        history[name]['loss'].append(float(rosenbrock(params[0], params[1])))
# Plot results
plt.figure(figsize=(12, 6))
for opt in ['adam', 'adamw', 'amsgrad']:
    plt.plot(history[opt]['loss'], label=opt.upper())
plt.yscale('log')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.title('Optimizer Comparison on Rosenbrock Function')
plt.show()
Slide 9: Rectified Adam (RAdam)
RAdam addresses the high variance of the adaptive learning rate in the early stages of training. It applies a rectification term derived from an estimate of that variance, and falls back to an un-adapted momentum update until enough gradients have been observed.
class RAdam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.v = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
            self.v = np.zeros_like(params)
        self.t += 1
        # Update momentum and variance
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(grads)
        # Bias correction
        m_hat = self.m / (1 - self.beta1**self.t)
        v_hat = self.v / (1 - self.beta2**self.t)
        # Compute the rectification term (length of the approximated SMA)
        rho_inf = 2 / (1 - self.beta2) - 1
        rho_t = rho_inf - 2 * self.t * self.beta2**self.t / (1 - self.beta2**self.t)
        if rho_t > 4:
            # Variance is tractable: apply the rectified adaptive step
            r_t = np.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf /
                          ((rho_inf - 4) * (rho_inf - 2) * rho_t))
            params -= self.learning_rate * r_t * m_hat / (np.sqrt(v_hat) + self.epsilon)
        else:
            # Fall back to an un-adapted SGD-with-momentum step
            params -= self.learning_rate * m_hat
        return params
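A short check (my own sketch) of when rectification activates: with beta2 = 0.999, rho_t stays at or below the threshold of 4 for the first four steps, so RAdam begins with un-adapted updates.

# Inspect when rho_t crosses the threshold of 4 (illustrative)
beta2 = 0.999
rho_inf = 2 / (1 - beta2) - 1
for t in range(1, 7):
    rho_t = rho_inf - 2 * t * beta2**t / (1 - beta2**t)
    print(t, round(rho_t, 3), 'adaptive' if rho_t > 4 else 'sgd')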
Slide 10: Convolutional Neural Network with RAdam
This implementation sketches how RAdam can drive the parameters of a convolutional layer, where the rectified updates help during the initial phase in which adaptive learning rates tend to be unstable.
import numpy as np

class ConvLayer:
    def __init__(self, input_shape, kernel_size, num_filters):
        self.input_shape = input_shape  # (channels, height, width)
        self.kernel_size = kernel_size
        self.num_filters = num_filters
        # Initialize kernels and bias
        self.kernels = np.random.randn(
            num_filters,
            input_shape[0],
            kernel_size,
            kernel_size) * 0.01
        self.bias = np.zeros(num_filters)
        # Initialize RAdam optimizers
        self.kernel_optimizer = RAdam()
        self.bias_optimizer = RAdam()

    def forward(self, inputs):
        self.inputs = inputs
        batch_size = inputs.shape[0]
        # Calculate output dimensions (valid convolution, stride 1)
        output_height = self.input_shape[1] - self.kernel_size + 1
        output_width = self.input_shape[2] - self.kernel_size + 1
        # Initialize output
        self.output = np.zeros((batch_size, self.num_filters, output_height, output_width))
        # Convolution operation
        for i in range(output_height):
            for j in range(output_width):
                input_slice = inputs[:, :, i:i+self.kernel_size, j:j+self.kernel_size]
                for k in range(self.num_filters):
                    self.output[:, k, i, j] = np.sum(
                        input_slice * self.kernels[k], axis=(1, 2, 3)) + self.bias[k]
        return self.output

    def backward(self, grad_output):
        grad_kernels = np.zeros_like(self.kernels)
        grad_input = np.zeros_like(self.inputs)
        grad_bias = np.sum(grad_output, axis=(0, 2, 3))
        output_height, output_width = grad_output.shape[2:]
        # Accumulate kernel and input gradients patch by patch
        for i in range(output_height):
            for j in range(output_width):
                input_slice = self.inputs[:, :, i:i+self.kernel_size, j:j+self.kernel_size]
                for k in range(self.num_filters):
                    g = grad_output[:, k, i, j][:, None, None, None]
                    grad_kernels[k] += np.sum(input_slice * g, axis=0)
                    grad_input[:, :, i:i+self.kernel_size, j:j+self.kernel_size] += g * self.kernels[k]
        # Update parameters using RAdam
        self.kernels = self.kernel_optimizer.update(self.kernels, grad_kernels)
        self.bias = self.bias_optimizer.update(self.bias, grad_bias)
        return grad_input
Slide 11: AdaBelief Optimizer
AdaBelief adapts the step size according to the "belief" in the observed gradient: it tracks the squared deviation of the gradient from its running mean instead of the raw second moment, which its authors report improves training stability and generalization over standard Adam variants.
class AdaBelief:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None
        self.s = None
        self.t = 0

    def update(self, params, grads):
        if self.m is None:
            self.m = np.zeros_like(params)
            self.s = np.zeros_like(params)
        self.t += 1
        # Update first moment
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        # Update second moment based on belief: deviation of the gradient from its EMA
        grad_residual = grads - self.m
        self.s = self.beta2 * self.s + (1 - self.beta2) * np.square(grad_residual)
        # Bias correction
        m_hat = self.m / (1 - self.beta1**self.t)
        s_hat = self.s / (1 - self.beta2**self.t)
        # Update parameters (epsilon outside the sqrt, as in the other variants)
        params -= self.learning_rate * m_hat / (np.sqrt(s_hat) + self.epsilon)
        return params
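A small contrast (my own sketch, using the classes above): on a constant gradient, the gradient barely deviates from its running mean, so AdaBelief's denominator stays small and it takes larger steps than Adam.

# Constant gradient: AdaBelief should step further than Adam (illustrative)
p_adam, p_ab = np.array([10.0]), np.array([10.0])
adam, adabelief = Adam(), AdaBelief()
for _ in range(50):
    p_adam = adam.update(p_adam, np.array([1.0]))
    p_ab = adabelief.update(p_ab, np.array([1.0]))
print(p_adam, p_ab)  # p_ab is expected to have moved further from 10.0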
Slide 12: Complex Optimization Example
This example compares the Adam-based optimizers on a challenging non-convex problem: minimizing the negative density of a mixture of Gaussians.
import numpy as np
from scipy.stats import multivariate_normal

class OptimizationProblem:
    def __init__(self):
        # Create a mixture of Gaussians with four symmetric modes
        self.means = np.array([[-2, -2], [2, 2], [-2, 2], [2, -2]])
        self.covs = np.array([[[1, 0.5], [0.5, 1]]] * 4)
        self.weights = np.array([0.25, 0.25, 0.25, 0.25])

    def objective(self, x):
        result = 0
        for mean, cov, weight in zip(self.means, self.covs, self.weights):
            result += weight * multivariate_normal.pdf(x, mean=mean, cov=cov)
        return -result  # Negative because we want to minimize

    def gradient(self, x):
        # d/dx of -N(x; mu, cov) is N(x; mu, cov) * cov^{-1} (x - mu)
        grad = np.zeros_like(x)
        for mean, cov, weight in zip(self.means, self.covs, self.weights):
            diff = x - mean
            grad += weight * multivariate_normal.pdf(x, mean=mean, cov=cov) * \
                np.linalg.solve(cov, diff)
        return grad
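Before trusting the analytic gradient, a quick finite-difference check (my own addition, not in the original example) is cheap insurance:

# Compare analytic and central-difference gradients at a test point
problem = OptimizationProblem()
x0 = np.array([0.5, -1.0])
eps = 1e-6
numeric = np.array([
    (problem.objective(x0 + eps * e) - problem.objective(x0 - eps * e)) / (2 * eps)
    for e in np.eye(2)])
print(problem.gradient(x0))  # should closely match the numeric estimate
print(numeric)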
# Initialize optimizers
optimizers = {
    'Adam': Adam(learning_rate=0.01),
    'AdamW': AdamW(learning_rate=0.01),
    'RAdam': RAdam(learning_rate=0.01),
    'AdaBelief': AdaBelief(learning_rate=0.01)
}

# Training loop
iterations = 1000
x_init = np.array([3.0, 3.0])
results = {name: {'trajectory': [], 'loss': []} for name in optimizers}

for name, optimizer in optimizers.items():
    x = x_init.copy()
    problem = OptimizationProblem()
    for i in range(iterations):
        grad = problem.gradient(x)
        loss = problem.objective(x)
        results[name]['trajectory'].append(x.copy())
        results[name]['loss'].append(loss)
        x = optimizer.update(x, grad)
Slide 13: Visualization of Optimizer Performance
A detailed visualization comparing convergence rates, stability, and final optimization results for different Adam variants on the mixture of Gaussians problem.
import matplotlib.pyplot as plt

def plot_optimization_results(results):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    # Plot loss curves (the objective is negative, so use a linear scale)
    for name, result in results.items():
        losses = np.array(result['loss'])
        ax1.plot(losses, label=name)
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss (negative mixture density)')
    ax1.set_title('Convergence Comparison')
    ax1.legend()
    # Plot trajectories over the objective's contours
    x = np.linspace(-4, 4, 100)
    y = np.linspace(-4, 4, 100)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros_like(X)
    problem = OptimizationProblem()
    for i in range(len(x)):
        for j in range(len(y)):
            Z[i, j] = problem.objective([X[i, j], Y[i, j]])
    ax2.contour(X, Y, Z, levels=20)
    for name, result in results.items():
        trajectory = np.array(result['trajectory'])
        ax2.plot(trajectory[:, 0], trajectory[:, 1],
                 label=f'{name} path', marker='.')
    ax2.set_xlabel('x')
    ax2.set_ylabel('y')
    ax2.set_title('Optimization Trajectories')
    ax2.legend()
    plt.tight_layout()
    plt.show()

# Plot results
plot_optimization_results(results)
Slide 14: Additional Resources
- "Adam: A Method for Stochastic Optimization" - https://arxiv.org/abs/1412.6980
- "Decoupled Weight Decay Regularization" - https://arxiv.org/abs/1711.05101
- "On the Variance of the Adaptive Learning Rate and Beyond" - https://arxiv.org/abs/1908.03265
- "AdaBelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients" - https://arxiv.org/abs/2010.07468
- "AMSGrad: On the Convergence of Adam and Beyond" - https://openreview.net/forum?id=ryQu7f-RZ