Lecture 30. L1 and L2 Regularization

This commit is contained in:
judsonupchurch 2025-01-27 19:39:18 +00:00
parent 95127b5eb4
commit a82b17fabe
3 changed files with 505 additions and 2 deletions


@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -464,6 +464,272 @@
"## Data Leakage\n",
"While K-Fold is good for getting hyper-parameters with limited data, it can have data leakage if not correctly setup. For example, with timeseries data, it may get access to future information and train off of that."
]
},
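{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a minimal sketch of the leakage point above (this assumes scikit-learn, which is not used elsewhere in this notebook): `TimeSeriesSplit` only validates on samples that come after its training fold, while a shuffled `KFold` can place future samples into the training fold."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import KFold, TimeSeriesSplit\n",
"\n",
"t = np.arange(20)  # stand-in for 20 time-ordered samples\n",
"\n",
"for train_idx, val_idx in TimeSeriesSplit(n_splits=4).split(t):\n",
"    # every validation index comes after every training index, so no future leakage\n",
"    assert val_idx.min() > train_idx.max()\n",
"\n",
"for train_idx, val_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(t):\n",
"    # with shuffling, 'future' samples can land in the training fold\n",
"    print('train max index:', train_idx.max(), '| val min index:', val_idx.min())"
]
},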
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# L1/L2 Regularization\n",
"## How it Works\n",
"When a network is overfitting the data, it typically has larger weights and biases. By punishing the neural network for larger weights and biases, we can try to reduce the chances of overfitting.\n",
"\n",
"We add the L1 or L2 loss to the data loss. L1 is the summation of the absolute value of all the weights. L2 is the summation of the weights squared. L2 is typically preferred due to the smoother gradient and still allowing small weights and biases.\n",
"\n",
"## Backward Pass\n",
"### L1\n",
"$\\frac{\\delta L}{\\delta w} = \\lambda \\text{ if } w \\gt 0, \\text{ else } -\\lambda$\n",
"\n",
"### L2\n",
"$\\frac{\\delta L}{\\delta w} = 2\\lambda w$"
]
},
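{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick numeric check of the formulas above, as a minimal sketch: it evaluates both penalties and their gradients for a hand-picked 2x2 weight matrix, using the same strength (5e-4) that the training cell below uses for L2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"lam = 5e-4  # regularization strength (lambda)\n",
"w = np.array([[0.6, -0.2],\n",
"              [0.0,  1.5]])\n",
"\n",
"# Penalties added on top of the data loss\n",
"l1_penalty = lam * np.sum(np.abs(w))  # lambda * sum(|w|) = 5e-4 * 2.3\n",
"l2_penalty = lam * np.sum(w * w)      # lambda * sum(w^2) = 5e-4 * 2.65\n",
"\n",
"# Gradients with respect to the weights\n",
"dl1 = lam * np.sign(w)  # +lambda or -lambda per element (np.sign gives 0 at w == 0; the layer below uses +1 there)\n",
"dl2 = 2 * lam * w       # 2 * lambda * w\n",
"\n",
"print(l1_penalty, l2_penalty)\n",
"print(dl1)\n",
"print(dl2)"
]
},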
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"class Layer_Dense:\n",
" def __init__(self, n_inputs, n_neurons,\n",
" weight_regularizer_l1=0, weight_regularizer_l2=0,\n",
" bias_regularizer_l1=0, bias_regularizer_l2=0):\n",
" # Initialize the weights and biases\n",
" self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)\n",
" self.biases = np.zeros((1, n_neurons))\n",
" # Set the regularization strength\n",
" self.weight_regularizer_l1 = weight_regularizer_l1\n",
" self.weight_regularizer_l2 = weight_regularizer_l2\n",
" self.bias_regularizer_l1 = bias_regularizer_l1\n",
" self.bias_regularizer_l2 = bias_regularizer_l2\n",
"\n",
" def forward(self, inputs):\n",
" # Calculate the output values from inputs, weights, and biases\n",
" self.inputs = inputs\n",
" self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed\n",
" \n",
" def backward(self, dvalues):\n",
" '''Calculated the gradient of the loss with respect to the weights and biases of this layer.\n",
" dvalues is equiavelent to a transposed dl_dZ. It is the gradient \n",
" of the loss with respect to the outputs of this layer.'''\n",
" # Gradients based on parameters\n",
" self.dweights = np.dot(self.inputs.T, dvalues)\n",
" self.dbiases = np.sum(dvalues, axis=0, keepdims=True)\n",
" self.dinputs = np.dot(dvalues, self.weights.T)\n",
"\n",
" # Now we look at the gradients on regularization\n",
" # L1\n",
" if self.weight_regularizer_l1 > 0:\n",
" dL1 = np.ones_like(self.weights)\n",
" dL1[self.weights < 0] = -1\n",
" self.dweights += self.weight_regularizer_l1 * dL1\n",
" if self.bias_regularizer_l1 > 0:\n",
" dL1 = np.ones_like(self.biases)\n",
" dL1[self.biases < 0] = -1\n",
" self.dbiases += self.bias_regularizer_l1 * dL1\n",
"\n",
" # L2\n",
" if self.weight_regularizer_l2 > 0:\n",
" self.dweights += 2 * self.weight_regularizer_l2 * self.weights\n",
" if self.bias_regularizer_l2 > 0:\n",
" self.dbiases += 2 * self.bias_regularizer_l2 * self.biases\n",
"\n",
"# Base class for Loss functions\n",
"class Loss:\n",
" '''Calculates the data and regularization losses given\n",
" model output and ground truth values'''\n",
" def regularization_loss(self, layer):\n",
" regularization_loss = 0\n",
"\n",
" # L1 regularization\n",
" if layer.weight_regularizer_l1 > 0:\n",
" regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))\n",
" if layer.bias_regularizer_l1 > 0:\n",
" regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))\n",
"\n",
" # L2 regularization\n",
" if layer.weight_regularizer_l2 > 0:\n",
" regularization_loss += layer.weight_regularizer_l1 * np.sum(layer.weights * layer.weights)\n",
" if layer.bias_regularizer_l2 > 0:\n",
" regularization_loss += layer.bias_regularizer_l1 * np.sum(layer.biases * layer.biases)\n",
"\n",
" return regularization_loss\n",
"\n",
" def calculate(self, output, y):\n",
" sample_losses = self.forward(output, y)\n",
" data_loss = np.average(sample_losses)\n",
" return data_loss\n",
"\n",
"class Loss_CategoricalCrossEntropy(Loss):\n",
" def forward(self, y_pred, y_true):\n",
" '''y_pred is the neural network output\n",
" y_true is the ideal output of the neural network'''\n",
" samples = len(y_pred)\n",
" # Bound the predicted values \n",
" y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)\n",
" \n",
" if len(y_true.shape) == 1: # Categorically labeled\n",
" correct_confidences = y_pred_clipped[range(samples), y_true]\n",
" elif len(y_true.shape) == 2: # One hot encoded\n",
" correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)\n",
"\n",
" # Calculate the losses\n",
" negative_log_likelihoods = -np.log(correct_confidences)\n",
" return negative_log_likelihoods\n",
" \n",
" def backward(self, dvalues, y_true):\n",
" samples = len(dvalues)\n",
"\n",
" # Number of lables in each sample\n",
" labels = len(dvalues[0])\n",
"\n",
" # if the labels are sparse, turn them into a one-hot vector\n",
" if len(y_true.shape) == 1:\n",
" y_true = np.eye(labels)[y_true]\n",
"\n",
" # Calculate the gradient then normalize\n",
" self.dinputs = -y_true / dvalues\n",
" self.dinputs = self.dinputs / samples\n",
"\n",
"class Activation_Softmax_Loss_CategoricalCrossentropy():\n",
" def __init__(self):\n",
" self.activation = Activation_Softmax()\n",
" self.loss = Loss_CategoricalCrossEntropy()\n",
"\n",
" def forward(self, inputs, y_true):\n",
" self.activation.forward(inputs)\n",
" self.output = self.activation.output\n",
" return self.loss.calculate(self.output, y_true)\n",
" \n",
" def backward(self, dvalues, y_true):\n",
" samples = len(dvalues)\n",
"\n",
" # if the samples are one-hot encoded, turn them into discrete values\n",
" if len(y_true.shape) == 2:\n",
" y_true = np.argmax(y_true, axis=1)\n",
" \n",
" # Copy so we can safely modify\n",
" self.dinputs = dvalues.copy()\n",
" \n",
" # Calculate and normalize gradient \n",
" self.dinputs[range(samples), y_true] -= 1\n",
" self.dinputs = self.dinputs / samples"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Testing Regularization"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"epoch: 10000, acc: 0.913, loss: 0.210, lr: 0.019980019980019983\n",
"validation, acc: 0.820, loss: 0.694\n"
]
}
],
"source": [
"# Create dataset\n",
"X, y = spiral_data(samples=100, classes=3)\n",
"\n",
"# Create Dense layer with 2 input features and 64 output values\n",
"dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)\n",
"\n",
"# Create ReLU activation (to be used with Dense layer)\n",
"activation1 = Activation_ReLU()\n",
"\n",
"# Create second Dense layer with 64 input features (as we take output\n",
"# of previous layer here) and 3 output values (output values)\n",
"dense2 = Layer_Dense(64, 3)\n",
"\n",
"# Create Softmax classifier's combined loss and activation\n",
"loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()\n",
"\n",
"# Create optimizer\n",
"optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-7)\n",
"\n",
"# Train in loop\n",
"for epoch in range(10001):\n",
" # Perform a forward pass of our training data through this layer\n",
" dense1.forward(X)\n",
" \n",
" # Perform a forward pass through activation function\n",
" # takes the output of first dense layer here\n",
" activation1.forward(dense1.output)\n",
" \n",
" # Perform a forward pass through second Dense layer\n",
" # takes outputs of activation function of first layer as inputs\n",
" dense2.forward(activation1.output)\n",
" \n",
" # Perform a forward pass through the activation/loss function\n",
" # takes the output of second dense layer here and returns loss\n",
" data_loss = loss_activation.forward(dense2.output, y)\n",
"\n",
" regularization_loss = (\n",
" loss_activation.loss.regularization_loss(dense1) +\n",
" loss_activation.loss.regularization_loss(dense2)\n",
" )\n",
"\n",
" loss = data_loss + regularization_loss\n",
" \n",
" # Calculate accuracy from output of activation2 and targets\n",
" # calculate values along first axis\n",
" predictions = np.argmax(loss_activation.output, axis=1)\n",
" if len(y.shape) == 2:\n",
" y = np.argmax(y, axis=1)\n",
" accuracy = np.mean(predictions == y)\n",
" \n",
" # Backward pass\n",
" loss_activation.backward(loss_activation.output, y)\n",
" dense2.backward(loss_activation.dinputs)\n",
" activation1.backward(dense2.dinputs)\n",
" dense1.backward(activation1.dinputs)\n",
" \n",
" # Update weights and biases\n",
" optimizer.pre_update_params()\n",
" optimizer.update_params(dense1)\n",
" optimizer.update_params(dense2)\n",
" optimizer.post_update_params()\n",
"\n",
"# After Training\n",
"print(f'epoch: {epoch}, ' +\n",
" f'acc: {accuracy:.3f}, ' +\n",
" f'loss: {loss:.3f}, ' +\n",
" f'lr: {optimizer.current_learning_rate}')\n",
"\n",
"\n",
"# With the weights and biases now optimized based on the training data, lets validate it\n",
"# Create test dataset\n",
"X_test, y_test = spiral_data(samples=100, classes=3)\n",
"# Perform a forward pass of our testing data through this layer\n",
"dense1.forward(X_test)\n",
"# Perform a forward pass through activation function\n",
"# takes the output of first dense layer here\n",
"activation1.forward(dense1.output)\n",
"# Perform a forward pass through second Dense layer\n",
"# takes outputs of activation function of first layer as inputs\n",
"dense2.forward(activation1.output)\n",
"# Perform a forward pass through the activation/loss function\n",
"# takes the output of second dense layer here and returns loss\n",
"loss = loss_activation.forward(dense2.output, y_test)\n",
"# Calculate accuracy from output of activation2 and targets\n",
"# calculate values along first axis\n",
"predictions = np.argmax(loss_activation.output, axis=1)\n",
"if len(y_test.shape) == 2:\n",
" y_test = np.argmax(y_test, axis=1)\n",
"accuracy = np.mean(predictions == y_test)\n",
"print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')"
]
}
],
"metadata": {

Binary file not shown.


@ -397,4 +397,241 @@ print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')
# ## Data Leakage
# While K-Fold is good for getting hyper-parameters with limited data, it can have data leakage if not correctly setup. For example, with timeseries data, it may get access to future information and train off of that.
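# %% [markdown]
# As a minimal sketch of the leakage point above (this assumes scikit-learn, which is not used elsewhere in this script): TimeSeriesSplit only validates on samples that come after its training fold, while a shuffled KFold can place future samples into the training fold.
# %%
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
t = np.arange(20)  # stand-in for 20 time-ordered samples
for train_idx, val_idx in TimeSeriesSplit(n_splits=4).split(t):
    # every validation index comes after every training index, so no future leakage
    assert val_idx.min() > train_idx.max()
for train_idx, val_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(t):
    # with shuffling, 'future' samples can land in the training fold
    print('train max index:', train_idx.max(), '| val min index:', val_idx.min())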
# %% [markdown]
# # L1/L2 Regularization
# ## How it Works
# When a network overfits the training data, its weights and biases tend to grow large. By penalizing the network for large weights and biases, we can reduce the chance of overfitting.
#
# We add the L1 or L2 penalty, scaled by a strength $\lambda$, to the data loss. The L1 penalty is the sum of the absolute values of the weights (and biases); the L2 penalty is the sum of their squares. L2 is typically preferred: its gradient is smooth, and since squaring punishes large parameters far more than small ones, small weights and biases are left mostly untouched, whereas L1 tends to drive weights to exactly zero.
#
# ## Backward Pass
# ### L1
# $\frac{\partial L}{\partial w} = \lambda \text{ if } w \gt 0, \text{ else } -\lambda$
#
# ### L2
# $\frac{\partial L}{\partial w} = 2\lambda w$
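# %% [markdown]
# A quick numeric check of the formulas above, as a minimal sketch: it evaluates both penalties and their gradients for a hand-picked 2x2 weight matrix, using the same strength (5e-4) that the training cell below uses for L2.
# %%
import numpy as np
lam = 5e-4  # regularization strength (lambda)
w = np.array([[0.6, -0.2],
              [0.0,  1.5]])
# Penalties added on top of the data loss
l1_penalty = lam * np.sum(np.abs(w))  # lambda * sum(|w|) = 5e-4 * 2.3
l2_penalty = lam * np.sum(w * w)      # lambda * sum(w^2) = 5e-4 * 2.65
# Gradients with respect to the weights
dl1 = lam * np.sign(w)  # +lambda or -lambda per element (np.sign gives 0 at w == 0; the layer below uses +1 there)
dl2 = 2 * lam * w       # 2 * lambda * w
print(l1_penalty, l2_penalty)
print(dl1)
print(dl2)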
# %%
import numpy as np
class Layer_Dense:
def __init__(self, n_inputs, n_neurons,
weight_regularizer_l1=0, weight_regularizer_l2=0,
bias_regularizer_l1=0, bias_regularizer_l2=0):
# Initialize the weights and biases
self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
self.biases = np.zeros((1, n_neurons))
# Set the regularization strength
self.weight_regularizer_l1 = weight_regularizer_l1
self.weight_regularizer_l2 = weight_regularizer_l2
self.bias_regularizer_l1 = bias_regularizer_l1
self.bias_regularizer_l2 = bias_regularizer_l2
def forward(self, inputs):
# Calculate the output values from inputs, weights, and biases
self.inputs = inputs
self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed
def backward(self, dvalues):
'''Calculates the gradient of the loss with respect to the weights and biases of this layer.
dvalues is equivalent to a transposed dl_dZ. It is the gradient
of the loss with respect to the outputs of this layer.'''
# Gradients based on parameters
self.dweights = np.dot(self.inputs.T, dvalues)
self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
self.dinputs = np.dot(dvalues, self.weights.T)
# Now we look at the gradients on regularization
# L1
if self.weight_regularizer_l1 > 0:
dL1 = np.ones_like(self.weights)
dL1[self.weights < 0] = -1
self.dweights += self.weight_regularizer_l1 * dL1
if self.bias_regularizer_l1 > 0:
dL1 = np.ones_like(self.biases)
dL1[self.biases < 0] = -1
self.dbiases += self.bias_regularizer_l1 * dL1
# L2
if self.weight_regularizer_l2 > 0:
self.dweights += 2 * self.weight_regularizer_l2 * self.weights
if self.bias_regularizer_l2 > 0:
self.dbiases += 2 * self.bias_regularizer_l2 * self.biases
# Base class for Loss functions
class Loss:
'''Calculates the data and regularization losses given
model output and ground truth values'''
def regularization_loss(self, layer):
regularization_loss = 0
# L1 regularization
if layer.weight_regularizer_l1 > 0:
regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))
if layer.bias_regularizer_l1 > 0:
regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))
# L2 regularization
if layer.weight_regularizer_l2 > 0:
regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)
if layer.bias_regularizer_l2 > 0:
regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)
return regularization_loss
def calculate(self, output, y):
sample_losses = self.forward(output, y)
data_loss = np.average(sample_losses)
return data_loss
class Loss_CategoricalCrossEntropy(Loss):
def forward(self, y_pred, y_true):
'''y_pred is the neural network output
y_true is the ideal output of the neural network'''
samples = len(y_pred)
# Bound the predicted values
y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
if len(y_true.shape) == 1: # Categorically labeled
correct_confidences = y_pred_clipped[range(samples), y_true]
elif len(y_true.shape) == 2: # One hot encoded
correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
# Calculate the losses
negative_log_likelihoods = -np.log(correct_confidences)
return negative_log_likelihoods
def backward(self, dvalues, y_true):
samples = len(dvalues)
# Number of labels in each sample
labels = len(dvalues[0])
# if the labels are sparse, turn them into a one-hot vector
if len(y_true.shape) == 1:
y_true = np.eye(labels)[y_true]
# Calculate the gradient then normalize
self.dinputs = -y_true / dvalues
self.dinputs = self.dinputs / samples
class Activation_Softmax_Loss_CategoricalCrossentropy():
def __init__(self):
self.activation = Activation_Softmax()
self.loss = Loss_CategoricalCrossEntropy()
def forward(self, inputs, y_true):
self.activation.forward(inputs)
self.output = self.activation.output
return self.loss.calculate(self.output, y_true)
def backward(self, dvalues, y_true):
samples = len(dvalues)
# if the samples are one-hot encoded, turn them into discrete values
if len(y_true.shape) == 2:
y_true = np.argmax(y_true, axis=1)
# Copy so we can safely modify
self.dinputs = dvalues.copy()
# Calculate and normalize gradient
self.dinputs[range(samples), y_true] -= 1
self.dinputs = self.dinputs / samples
# %% [markdown]
# ## Testing Regularization
# %%
# Create dataset
X, y = spiral_data(samples=100, classes=3)
# Create Dense layer with 2 input features and 64 output values
dense1 = Layer_Dense(2, 64, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)
# Create ReLU activation (to be used with Dense layer)
activation1 = Activation_ReLU()
# Create second Dense layer with 64 input features (as we take output
# of previous layer here) and 3 output values (one per class)
dense2 = Layer_Dense(64, 3)
# Create Softmax classifier's combined loss and activation
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# Create optimizer
optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-7)
# Train in loop
for epoch in range(10001):
# Perform a forward pass of our training data through this layer
dense1.forward(X)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
data_loss = loss_activation.forward(dense2.output, y)
regularization_loss = (
loss_activation.loss.regularization_loss(dense1) +
loss_activation.loss.regularization_loss(dense2)
)
loss = data_loss + regularization_loss
# Calculate accuracy from the softmax output and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y.shape) == 2:
y = np.argmax(y, axis=1)
accuracy = np.mean(predictions == y)
# Backward pass
loss_activation.backward(loss_activation.output, y)
dense2.backward(loss_activation.dinputs)
activation1.backward(dense2.dinputs)
dense1.backward(activation1.dinputs)
# Update weights and biases
optimizer.pre_update_params()
optimizer.update_params(dense1)
optimizer.update_params(dense2)
optimizer.post_update_params()
# After Training
print(f'epoch: {epoch}, ' +
f'acc: {accuracy:.3f}, ' +
f'loss: {loss:.3f}, ' +
f'lr: {optimizer.current_learning_rate}')
# With the weights and biases now optimized based on the training data, let's validate it
# Create test dataset
X_test, y_test = spiral_data(samples=100, classes=3)
# Perform a forward pass of our testing data through this layer
dense1.forward(X_test)
# Perform a forward pass through activation function
# takes the output of first dense layer here
activation1.forward(dense1.output)
# Perform a forward pass through second Dense layer
# takes outputs of activation function of first layer as inputs
dense2.forward(activation1.output)
# Perform a forward pass through the activation/loss function
# takes the output of second dense layer here and returns loss
loss = loss_activation.forward(dense2.output, y_test)
# Calculate accuracy from the softmax output and targets
# calculate values along first axis
predictions = np.argmax(loss_activation.output, axis=1)
if len(y_test.shape) == 2:
y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')