Lecture 30. L1 and L2 Regularization

This commit is contained in:
judsonupchurch 2025-01-27 19:39:18 +00:00
parent 95127b5eb4
commit a82b17fabe
3 changed files with 505 additions and 2 deletions


@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -464,6 +464,272 @@
"## Data Leakage\n",
"While K-Fold is good for getting hyper-parameters with limited data, it can have data leakage if not correctly setup. For example, with timeseries data, it may get access to future information and train off of that."
]
},
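{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of the leakage point above (this assumes scikit-learn, which is not used elsewhere in this notebook): `TimeSeriesSplit` only validates on samples that come after its training fold, while a shuffled `KFold` can place future samples into the training fold."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import KFold, TimeSeriesSplit\n",
"\n",
"t = np.arange(20)  # stand-in for 20 time-ordered samples\n",
"\n",
"for train_idx, val_idx in TimeSeriesSplit(n_splits=4).split(t):\n",
"    # every validation index comes after every training index, so no future leakage\n",
"    assert val_idx.min() > train_idx.max()\n",
"\n",
"for train_idx, val_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(t):\n",
"    # with shuffling, 'future' samples can land in the training fold\n",
"    print('train max index:', train_idx.max(), '| val min index:', val_idx.min())"
]
},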
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# L1/L2 Regularization\n",
"## How it Works\n",
"When a network is overfitting the data, it typically has larger weights and biases. By punishing the neural network for larger weights and biases, we can try to reduce the chances of overfitting.\n",
"\n",
"We add the L1 or L2 loss to the data loss. L1 is the summation of the absolute value of all the weights. L2 is the summation of the weights squared. L2 is typically preferred due to the smoother gradient and still allowing small weights and biases.\n",
"\n",
"## Backward Pass\n",
"### L1\n",
"$\\frac{\\delta L}{\\delta w} = \\lambda \\text{ if } w \\gt 0, \\text{ else } -\\lambda$\n",
"\n",
"### L2\n",
"$\\frac{\\delta L}{\\delta w} = 2\\lambda w$"
]
},
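{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick numeric check of the formulas above, as a minimal sketch: it evaluates both penalties and their gradients for a hand-picked 2x2 weight matrix, using the same strength (5e-4) that the training cell below uses for L2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"lam = 5e-4  # regularization strength (lambda)\n",
"w = np.array([[0.6, -0.2],\n",
"              [0.0,  1.5]])\n",
"\n",
"# Penalties added on top of the data loss\n",
"l1_penalty = lam * np.sum(np.abs(w))  # lambda * sum(|w|) = 5e-4 * 2.3\n",
"l2_penalty = lam * np.sum(w * w)      # lambda * sum(w^2) = 5e-4 * 2.65\n",
"\n",
"# Gradients with respect to the weights\n",
"dl1 = lam * np.sign(w)  # +lambda or -lambda per element (np.sign gives 0 at w == 0; the layer below uses +1 there)\n",
"dl2 = 2 * lam * w       # 2 * lambda * w\n",
"\n",
"print(l1_penalty, l2_penalty)\n",
"print(dl1)\n",
"print(dl2)"
]
},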
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"class Layer_Dense:\n",
" def __init__(self, n_inputs, n_neurons,\n",
" weight_regularizer_l1=0, weight_regularizer_l2=0,\n",
" bias_regularizer_l1=0, bias_regularizer_l2=0):\n",
" # Initialize the weights and biases\n",
" self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)\n",
" self.biases = np.zeros((1, n_neurons))\n",
" # Set the regularization strength\n",
" self.weight_regularizer_l1 = weight_regularizer_l1\n",
" self.weight_regularizer_l2 = weight_regularizer_l2\n",
" self.bias_regularizer_l1 = bias_regularizer_l1\n",
" self.bias_regularizer_l2 = bias_regularizer_l2\n",
"\n",
" def forward(self, inputs):\n",
" # Calculate the output values from inputs, weights, and biases\n",
" self.inputs = inputs\n",
" self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed\n",
" \n",
" def backward(self, dvalues):\n",
" '''Calculated the gradient of the loss with respect to the weights and biases of this layer.\n",
" dvalues is equiavelent to a transposed dl_dZ. It is the gradient \n",
" of the loss with respect to the outputs of this layer.'''\n",
" # Gradients based on parameters\n",
" self.dweights = np.dot(self.inputs.T, dvalues)\n",
" self.dbiases = np.sum(dvalues, axis=0, keepdims=True)\n",
" self.dinputs = np.dot(dvalues, self.weights.T)\n",
"\n",
" # Now we look at the gradients on regularization\n",
" # L1\n",
" if self.weight_regularizer_l1 > 0:\n",
" dL1 = np.ones_like(self.weights)\n",
" dL1[self.weights < 0] = -1\n",
" self.dweights += self.weight_regularizer_l1 * dL1\n",
" if self.bias_regularizer_l1 > 0:\n",
" dL1 = np.ones_like(self.biases)\n",
" dL1[self.biases < 0] = -1\n",
" self.dbiases += self.bias_regularizer_l1 * dL1\n",
"\n",
" # L2\n",
" if self.weight_regularizer_l2 > 0:\n",
" self.dweights += 2 * self.weight_regularizer_l2 * self.weights\n",
" if self.bias_regularizer_l2 > 0:\n",
" self.dbiases += 2 * self.bias_regularizer_l2 * self.biases\n",
"\n",
"# Base class for Loss functions\n",
"class Loss:\n",
" '''Calculates the data and regularization losses given\n",
" model output and ground truth values'''\n",
" def regularization_loss(self, layer):\n",
" regularization_loss = 0\n",
"\n",
" # L1 regularization\n",
" if layer.weight_regularizer_l1 > 0:\n",
" regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))\n",
" if layer.bias_regularizer_l1 > 0:\n",
" regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))\n",
"\n",
" # L2 regularization\n",
" if layer.weight_regularizer_l2 > 0:\n",
" regularization_loss += layer.weight_regularizer_l1 * np.sum(layer.weights * layer.weights)\n",
" if layer.bias_regularizer_l2 > 0:\n",
" regularization_loss += layer.bias_regularizer_l1 * np.sum(layer.biases * layer.biases)\n",
"\n",
" return regularization_loss\n",
"\n",
" def calculate(self, output, y):\n",
" sample_losses = self.forward(output, y)\n",
" data_loss = np.average(sample_losses)\n",
" return data_loss\n",
"\n",
"class Loss_CategoricalCrossEntropy(Loss):\n",
" def forward(self, y_pred, y_true):\n",
" '''y_pred is the neural network output\n",
" y_true is the ideal output of the neural network'''\n",
" samples = len(y_pred)\n",
" # Bound the predicted values \n",
" y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)\n",
" \n",
" if len(y_true.shape) == 1: # Categorically labeled\n",
" correct_confidences = y_pred_clipped[range(samples), y_true]\n",
" elif len(y_true.shape) == 2: # One hot encoded\n",
" correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)\n",
"\n",
" # Calculate the losses\n",
" negative_log_likelihoods = -np.log(correct_confidences)\n",
" return negative_log_likelihoods\n",
" \n",
" def backward(self, dvalues, y_true):\n",
" samples = len(dvalues)\n",
"\n",
" # Number of lables in each sample\n",
" labels = len(dvalues[0])\n",
"\n",
" # if the labels are sparse, turn them into a one-hot vector\n",
" if len(y_true.shape) == 1:\n",
" y_true = np.eye(labels)[y_true]\n",
"\n",
" # Calculate the gradient then normalize\n",
" self.dinputs = -y_true / dvalues\n",
" self.dinputs = self.dinputs / samples\n",
"\n",
"class Activation_Softmax_Loss_CategoricalCrossentropy():\n",
" def __init__(self):\n",
" self.activation = Activation_Softmax()\n",
" self.loss = Loss_CategoricalCrossEntropy()\n",
"\n",
" def forward(self, inputs, y_true):\n",
" self.activation.forward(inputs)\n",
" self.output = self.activation.output\n",
" return self.loss.calculate(self.output, y_true)\n",
" \n",
" def backward(self, dvalues, y_true):\n",
" samples = len(dvalues)\n",
"\n",
" # if the samples are one-hot encoded, turn them into discrete values\n",
" if len(y_true.shape) == 2:\n",
" y_true = np.argmax(y_true, axis=1)\n",
" \n",
" # Copy so we can safely modify\n",
" self.dinputs = dvalues.copy()\n",
" \n",
" # Calculate and normalize gradient \n",
" self.dinputs[range(samples), y_true] -= 1\n",
" self.dinputs = self.dinputs / samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Testing Regularization"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch: 10000, acc: 0.913, loss: 0.210, lr: 0.019980019980019983\n",
"validation, acc: 0.820, loss: 0.694\n"
]
}
],
"source": [
"# Create dataset\n",
"X, y = spiral_data(samples=100, classes=3)\n",
"\n",
"# Create Dense layer with 2 input features and 64 output values\n",
"dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)\n",
"\n",
"# Create ReLU activation (to be used with Dense layer)\n",
"activation1 = Activation_ReLU()\n",
"\n",
"# Create second Dense layer with 64 input features (as we take output\n",
"# of previous layer here) and 3 output values (output values)\n",
"dense2 = Layer_Dense(64, 3)\n",
"\n",
"# Create Softmax classifier's combined loss and activation\n",
"loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()\n",
"\n",
"# Create optimizer\n",
"optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-7)\n",
"\n",
"# Train in loop\n",
"for epoch in range(10001):\n",
" # Perform a forward pass of our training data through this layer\n",
" dense1.forward(X)\n",
" \n",
" # Perform a forward pass through activation function\n",
" # takes the output of first dense layer here\n",
" activation1.forward(dense1.output)\n",
" \n",
" # Perform a forward pass through second Dense layer\n",
" # takes outputs of activation function of first layer as inputs\n",
" dense2.forward(activation1.output)\n",
" \n",
" # Perform a forward pass through the activation/loss function\n",
" # takes the output of second dense layer here and returns loss\n",
" data_loss = loss_activation.forward(dense2.output, y)\n",
"\n",
" regularization_loss = (\n",
" loss_activation.loss.regularization_loss(dense1) +\n",
" loss_activation.loss.regularization_loss(dense2)\n",
" )\n",
"\n",
" loss = data_loss + regularization_loss\n",
" \n",
" # Calculate accuracy from output of activation2 and targets\n",
" # calculate values along first axis\n",
" predictions = np.argmax(loss_activation.output, axis=1)\n",
" if len(y.shape) == 2:\n",
" y = np.argmax(y, axis=1)\n",
" accuracy = np.mean(predictions == y)\n",
" \n",
" # Backward pass\n",
" loss_activation.backward(loss_activation.output, y)\n",
" dense2.backward(loss_activation.dinputs)\n",
" activation1.backward(dense2.dinputs)\n",
" dense1.backward(activation1.dinputs)\n",
" \n",
" # Update weights and biases\n",
" optimizer.pre_update_params()\n",
" optimizer.update_params(dense1)\n",
" optimizer.update_params(dense2)\n",
" optimizer.post_update_params()\n",
"\n",
"# After Training\n",
"print(f'epoch: {epoch}, ' +\n",
" f'acc: {accuracy:.3f}, ' +\n",
" f'loss: {loss:.3f}, ' +\n",
" f'lr: {optimizer.current_learning_rate}')\n",
"\n",
"\n",
"# With the weights and biases now optimized based on the training data, lets validate it\n",
"# Create test dataset\n",
"X_test, y_test = spiral_data(samples=100, classes=3)\n",
"# Perform a forward pass of our testing data through this layer\n",
"dense1.forward(X_test)\n",
"# Perform a forward pass through activation function\n",
"# takes the output of first dense layer here\n",
"activation1.forward(dense1.output)\n",
"# Perform a forward pass through second Dense layer\n",
"# takes outputs of activation function of first layer as inputs\n",
"dense2.forward(activation1.output)\n",
"# Perform a forward pass through the activation/loss function\n",
"# takes the output of second dense layer here and returns loss\n",
"loss = loss_activation.forward(dense2.output, y_test)\n",
"# Calculate accuracy from output of activation2 and targets\n",
"# calculate values along first axis\n",
"predictions = np.argmax(loss_activation.output, axis=1)\n",
"if len(y_test.shape) == 2:\n",
" y_test = np.argmax(y_test, axis=1)\n",
"accuracy = np.mean(predictions == y_test)\n",
"print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')"
]
}
],
"metadata": {

Binary file not shown.


@ -397,4 +397,241 @@ print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')
# ## Data Leakage
# While K-Fold is good for getting hyper-parameters with limited data, it can have data leakage if not correctly setup. For example, with timeseries data, it may get access to future information and train off of that.
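# %% [markdown]
# As a minimal sketch of the leakage point above (this assumes scikit-learn, which is not used elsewhere in this script): TimeSeriesSplit only validates on samples that come after its training fold, while a shuffled KFold can place future samples into the training fold.
# %%
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
t = np.arange(20)  # stand-in for 20 time-ordered samples
for train_idx, val_idx in TimeSeriesSplit(n_splits=4).split(t):
    # every validation index comes after every training index, so no future leakage
    assert val_idx.min() > train_idx.max()
for train_idx, val_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(t):
    # with shuffling, 'future' samples can land in the training fold
    print('train max index:', train_idx.max(), '| val min index:', val_idx.min())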
# %% [markdown]
# # L1/L2 Regularization
# ## How it Works
# When a network overfits the training data, its weights and biases tend to grow large. By penalizing the network for large weights and biases, we can reduce the chance of overfitting.
#
# We add the L1 or L2 penalty, scaled by a strength $\lambda$, to the data loss. The L1 penalty is the sum of the absolute values of the weights (and biases); the L2 penalty is the sum of their squares. L2 is typically preferred: its gradient is smooth, and since squaring punishes large parameters far more than small ones, small weights and biases are left mostly untouched, whereas L1 tends to drive weights to exactly zero.
#
# ## Backward Pass
# ### L1
# $\frac{\partial L}{\partial w} = \lambda \text{ if } w \gt 0, \text{ else } -\lambda$
#
# ### L2
# $\frac{\partial L}{\partial w} = 2\lambda w$
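# %% [markdown]
# A quick numeric check of the formulas above, as a minimal sketch: it evaluates both penalties and their gradients for a hand-picked 2x2 weight matrix, using the same strength (5e-4) that the training cell below uses for L2.
# %%
import numpy as np
lam = 5e-4  # regularization strength (lambda)
w = np.array([[0.6, -0.2],
              [0.0,  1.5]])
# Penalties added on top of the data loss
l1_penalty = lam * np.sum(np.abs(w))  # lambda * sum(|w|) = 5e-4 * 2.3
l2_penalty = lam * np.sum(w * w)      # lambda * sum(w^2) = 5e-4 * 2.65
# Gradients with respect to the weights
dl1 = lam * np.sign(w)  # +lambda or -lambda per element (np.sign gives 0 at w == 0; the layer below uses +1 there)
dl2 = 2 * lam * w       # 2 * lambda * w
print(l1_penalty, l2_penalty)
print(dl1)
print(dl2)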
# %%
import numpy as np
class Layer_Dense:
def __init__(self, n_inputs, n_neurons,
weight_regularizer_l1=0, weight_regularizer_l2=0,
bias_regularizer_l1=0, bias_regularizer_l2=0):
# Initialize the weights and biases
self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
self.biases = np.zeros((1, n_neurons))
# Set the regularization strength
self.weight_regularizer_l1 = weight_regularizer_l1
self.weight_regularizer_l2 = weight_regularizer_l2
self.bias_regularizer_l1 = bias_regularizer_l1
self.bias_regularizer_l2 = bias_regularizer_l2
def forward(self, inputs):
# Calculate the output values from inputs, weights, and biases
self.inputs = inputs
self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed
def backward(self, dvalues):
'''Calculates the gradient of the loss with respect to the weights and biases of this layer.
dvalues is equivalent to a transposed dl_dZ. It is the gradient
of the loss with respect to the outputs of this layer.'''
# Gradients based on parameters
self.dweights = np.dot(self.inputs.T, dvalues)
self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
self.dinputs = np.dot(dvalues, self.weights.T)
# Now we look at the gradients on regularization
# L1
if self.weight_regularizer_l1 > 0:
dL1 = np.ones_like(self.weights)
dL1[self.weights < 0] = -1
self.dweights += self.weight_regularizer_l1 * dL1
if self.bias_regularizer_l1 > 0:
dL1 = np.ones_like(self.biases)
dL1[self.biases < 0] = -1
self.dbiases += self.bias_regularizer_l1 * dL1
# L2
if self.weight_regularizer_l2 > 0:
self.dweights += 2 * self.weight_regularizer_l2 * self.weights
if self.bias_regularizer_l2 > 0:
self.dbiases += 2 * self.bias_regularizer_l2 * self.biases
# Base class for Loss functions
class Loss:
'''Calculates the data and regularization losses given
model output and ground truth values'''
def regularization_loss(self, layer):
regularization_loss = 0
# L1 regularization
if layer.weight_regularizer_l1 > 0:
regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))
if layer.bias_regularizer_l1 > 0:
regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))
# L2 regularization
if layer.weight_regularizer_l2 > 0:
regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)
if layer.bias_regularizer_l2 > 0:
regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)
return regularization_loss
def calculate(self, output, y):
sample_losses = self.forward(output, y)
data_loss = np.average(sample_losses)
return data_loss
class Loss_CategoricalCrossEntropy(Loss):
def forward(self, y_pred, y_true):
'''y_pred is the neural network output
y_true is the ideal output of the neural network'''
samples = len(y_pred)
# Bound the predicted values
y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
if len(y_true.shape) == 1: # Categorically labeled
correct_confidences = y_pred_clipped[range(samples), y_true]
elif len(y_true.shape) == 2: # One hot encoded
correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
# Calculate the losses
negative_log_likelihoods = -np.log(correct_confidences)
return negative_log_likelihoods
def backward(self, dvalues, y_true):
samples = len(dvalues)
# Number of labels in each sample
labels = len(dvalues[0])
# if the labels are sparse, turn them into a one-hot vector
if len(y_true.shape) == 1:
y_true = np.eye(labels)[y_true]
# Calculate the gradient then normalize
self.dinputs = -y_true / dvalues
self.dinputs = self.dinputs / samples
class Activation_Softmax_Loss_CategoricalCrossentropy():
def __init__(self):
self.activation = Activation_Softmax()
self.loss = Loss_CategoricalCrossEntropy()
def forward(self, inputs, y_true):
self.activation.forward(inputs)
self.output = self.activation.output
return self.loss.calculate(self.output, y_true)
def backward(self, dvalues, y_true):
samples = len(dvalues)
# if the samples are one-hot encoded, turn them into discrete values
if len(y_true.shape) == 2:
y_true = np.argmax(y_true, axis=1)
# Copy so we can safely modify
self.dinputs = dvalues.copy()
# Calculate and normalize gradient
self.dinputs[range(samples), y_true] -= 1
self.dinputs = self.dinputs / samples
# %% [markdown]
# ## Testing Regularization
# %%
# Create dataset
X, y = spiral_data(samples=100, classes=3)
# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)
# Create ReLU activation (to be used with Dense layer)
activation1 = Activation_ReLU()
# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)
# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Create optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-7)
# Train in loop
for epoch in range(10001):
# Perform a forward pass of our training data through this layer
dense1.forward(X)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
data_loss = loss_activation.forward(dense2.output, y)
regularization_loss = (
loss_activation.loss.regularization_loss(dense1) +
loss_activation.loss.regularization_loss(dense2)
)
loss = data_loss + regularization_loss
# Calculate accuracy from the softmax output and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y.shape) == 2:
y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)
# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)
# Update weights and biases
optimizer.pre_update_params()
optimizer.update_params(dense1)
optimizer.update_params(dense2)
optimizer.post_update_params()
# After Training
print(f'epoch: {epoch}, ' +
f'acc: {accuracy:.3f}, ' +
f'loss: {loss:.3f}, ' +
f'lr: {optimizer.current_learning_rate}')
# With the weights and biases now optimized based on the training data, let's validate it
# Create test dataset
X_test, y_test = spiral_data(samples=100, classes=3)
# Perform a forward pass of our testing data through this layer
dense1.forward(X_test)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y_test)
# Calculate accuracy from the softmax output and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')