From 23fca634fb8441090f9f6a27097cade4cbc3e1c4 Mon Sep 17 00:00:00 2001 From: judsonupchurch Date: Tue, 31 Dec 2024 01:29:07 +0000 Subject: [PATCH] Change of Notation after Lecture 15-17 --- README.md | 4 +- lecture13_15/handout_13.ipynb | 122 -------------- .../handout_13.ipynb | 0 {lecture13_15 => lecture13_17}/notes_13.ipynb | 149 ++++++++++++++++-- 4 files changed, 137 insertions(+), 138 deletions(-) delete mode 100644 lecture13_15/handout_13.ipynb rename lecture16_17/handout_16.ipynb => lecture13_17/handout_13.ipynb (100%) rename {lecture13_15 => lecture13_17}/notes_13.ipynb (76%) diff --git a/README.md b/README.md index c0b8577..7fc2082 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,7 @@ Lectures 7-11 use same handout. Lecture 12 uses same handout. -Lectures 13-15 use same handout. - -Lectures 16-17 use same handout. +Lectures 13-17 use same handout. Lecture 18 uses same handout. diff --git a/lecture13_15/handout_13.ipynb b/lecture13_15/handout_13.ipynb deleted file mode 100644 index 699a258..0000000 --- a/lecture13_15/handout_13.ipynb +++ /dev/null @@ -1,122 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration 0, Loss: 466.56000000000006\n", - "Iteration 20, Loss: 5.32959636083938\n", - "Iteration 40, Loss: 0.41191523404899866\n", - "Iteration 60, Loss: 0.031836212079467595\n", - "Iteration 80, Loss: 0.002460565465389601\n", - "Iteration 100, Loss: 0.000190172825660145\n", - "Iteration 120, Loss: 1.4698126966451542e-05\n", - "Iteration 140, Loss: 1.1359926717815175e-06\n", - "Iteration 160, Loss: 8.779889800154524e-08\n", - "Iteration 180, Loss: 6.7858241357822796e-09\n", - "Final weights:\n", - " [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n", - " [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n", - " [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n", - "Final biases:\n", - " [-0.00698895 -0.04024714 -0.06451539]\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "# Initial inputs\n", - "inputs = np.array([1, 2, 3, 4])\n", - "\n", - "# Initial weights and biases\n", - "weights = np.array([\n", - " [0.1, 0.2, 0.3, 0.4],\n", - " [0.5, 0.6, 0.7, 0.8],\n", - " [0.9, 1.0, 1.1, 1.2]\n", - "])\n", - "\n", - "biases = np.array([0.1, 0.2, 0.3])\n", - "\n", - "# Learning rate\n", - "learning_rate = 0.001\n", - "\n", - "# ReLU activation function and its derivative\n", - "def relu(x):\n", - " return np.maximum(0, x)\n", - "\n", - "def relu_derivative(x):\n", - " return np.where(x > 0, 1, 0)\n", - "\n", - "# Training loop\n", - "for iteration in range(200):\n", - " # Forward pass\n", - " z = np.dot(weights, inputs) + biases\n", - " a = relu(z)\n", - " y = np.sum(a)\n", - "\n", - " # Calculate loss\n", - " loss = y ** 2\n", - "\n", - " # Backward pass\n", - " # Gradient of loss with respect to output y\n", - " dL_dy = 2 * y\n", - "\n", - " # Gradient of y with respect to a\n", - " dy_da = np.ones_like(a)\n", - "\n", - " # Gradient of loss with respect to a\n", - " dL_da = dL_dy * dy_da\n", - "\n", - " # Gradient of a with respect to z (ReLU derivative)\n", - " da_dz = relu_derivative(z)\n", - "\n", - " # Gradient of loss with respect to z\n", - " dL_dz = dL_da * da_dz\n", - "\n", - " # Gradient of z with respect to weights and biases\n", - " dL_dW = np.outer(dL_dz, inputs)\n", - " dL_db = dL_dz\n", - "\n", - " # Update weights and biases\n", - " weights -= learning_rate * dL_dW\n", - " biases -= learning_rate * dL_db\n", - "\n", - " # 
Print the loss every 20 iterations\n",
-    "    if iteration % 20 == 0:\n",
-    "        print(f\"Iteration {iteration}, Loss: {loss}\")\n",
-    "\n",
-    "# Final weights and biases\n",
-    "print(\"Final weights:\\n\", weights)\n",
-    "print(\"Final biases:\\n\", biases)\n",
-    "\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/lecture16_17/handout_16.ipynb b/lecture13_17/handout_13.ipynb
similarity index 100%
rename from lecture16_17/handout_16.ipynb
rename to lecture13_17/handout_13.ipynb
diff --git a/lecture13_15/notes_13.ipynb b/lecture13_17/notes_13.ipynb
similarity index 76%
rename from lecture13_15/notes_13.ipynb
rename to lecture13_17/notes_13.ipynb
index b16ed07..578e1d2 100644
--- a/lecture13_15/notes_13.ipynb
+++ b/lecture13_17/notes_13.ipynb
@@ -366,12 +366,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Backpropagation of a Layer"
+    "# Backpropagation of a Layer\n",
+    "This is the same procedure as for a single neuron, but now using matrices to keep track of every neuron in the layer.\n",
+    "\n",
+    "If there are multiple input arrays (batches), one can take the total loss to be the sum of the losses from each batch. The gradient of the total loss with respect to a weight or bias is then the sum over batches of the gradients of each batch's loss with respect to that weight or bias, each evaluated with that batch's input (a short sketch of this appears below).\n",
+    "\n",
+    "In this example, the partial derivative of the loss with respect to a given weight or bias is the same across all neurons of the layer for that batch, i.e., the weight gradient matrix repeats the same column vector for each of the N neurons, and the bias gradient matrix is similarly a single row of N copies of the same value.\n",
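+    "\n",
+    "A minimal sketch of the batch case (not part of the lecture code; `X_batch`, its second row, and the helper names are made up for illustration). Summing each batch's gradient contribution gives the same result as a single matrix product over the batch dimension:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "X_batch = np.array([[1., 2., 3., 4.],   # batch 1 (same input as the code below)\n",
+    "                    [2., 1., 0., 1.]])  # batch 2 (made-up values)\n",
+    "W = np.array([[0.1, 0.2, 0.3, 0.4],\n",
+    "              [0.5, 0.6, 0.7, 0.8],\n",
+    "              [0.9, 1.0, 1.1, 1.2]])\n",
+    "B = np.array([0.1, 0.2, 0.3])\n",
+    "\n",
+    "Z = X_batch @ W.T + B                   # shape (batches, neurons)\n",
+    "A = np.maximum(0, Z)                    # ReLU\n",
+    "y = A.sum(axis=1)                       # one output per batch\n",
+    "total_loss = np.sum(y**2)               # total loss = sum of per-batch losses\n",
+    "\n",
+    "# Gradient of the total loss with respect to each neuron output\n",
+    "dL_dZ = (2 * y)[:, None] * np.where(Z > 0, 1, 0)\n",
+    "\n",
+    "# Summing the per-batch outer products ...\n",
+    "dL_dW_loop = sum(np.outer(dL_dZ[j], X_batch[j]) for j in range(len(X_batch)))\n",
+    "# ... matches a single matrix product over the whole batch dimension.\n",
+    "dL_dW = dL_dZ.T @ X_batch\n",
+    "assert np.allclose(dL_dW_loop, dL_dW)\n",
+    "\n",
+    "dL_dB = dL_dZ.sum(axis=0)               # bias gradient: sum over batches\n",
+    "```"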
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -379,18 +384,18 @@
      "output_type": "stream",
      "text": [
       "Iteration 0, Loss: 466.56000000000006\n",
-      "Iteration 20, Loss: 5.32959636083938\n",
-      "Iteration 40, Loss: 0.41191523404899866\n",
-      "Iteration 60, Loss: 0.031836212079467595\n",
-      "Iteration 80, Loss: 0.002460565465389601\n",
-      "Iteration 100, Loss: 0.000190172825660145\n",
-      "Iteration 120, Loss: 1.4698126966451542e-05\n",
-      "Iteration 140, Loss: 1.1359926717815175e-06\n",
-      "Iteration 160, Loss: 8.779889800154524e-08\n",
-      "Iteration 180, Loss: 6.7858241357822796e-09\n",
+      "Iteration 20, Loss: 5.329595763793193\n",
+      "Iteration 40, Loss: 0.41191524253483786\n",
+      "Iteration 60, Loss: 0.03183621475376345\n",
+      "Iteration 80, Loss: 0.002460565405431671\n",
+      "Iteration 100, Loss: 0.0001901729121621426\n",
+      "Iteration 120, Loss: 1.4698120139337557e-05\n",
+      "Iteration 140, Loss: 1.1359948840900371e-06\n",
+      "Iteration 160, Loss: 8.779778427447647e-08\n",
+      "Iteration 180, Loss: 6.785903626216421e-09\n",
       "Final weights:\n",
-      " [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
-      " [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
+      " [[-0.00698895 -0.0139779  -0.02096685 -0.0279558 ]\n",
+      " [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
       " [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
       "Final biases:\n",
       " [-0.00698895 -0.04024714 -0.06451539]\n"
@@ -463,6 +468,124 @@
     "print(\"Final weights:\\n\", weights)\n",
     "print(\"Final biases:\\n\", biases)\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Change of Notation\n",
+    "The previous notation is clunky and long. From here forward, we will use the following notation for a layer with $n$ inputs and $i$ neurons. The neuron layer is followed by an activation layer, whose output is fed into a final value $y$ with a computed loss $l$. There can be $j$ batches of data.\n",
+    "\n",
+    "$\\vec{X_j} = \\begin{bmatrix} x_{1j} & x_{2j} & \\cdots & x_{nj} \\end{bmatrix}$ -> Row vector for the layer inputs for the $j$th batch of data.\n",
+    "\n",
+    "$\\overline{\\overline{W}} = \\begin{bmatrix} \\vec{w_{1}} \\\\ \\vec{w_{2}} \\\\ \\vdots \\\\ \\vec{w_{i}} \\end{bmatrix} = \\begin{bmatrix} w_{11} & w_{12} & \\cdots & w_{1n} \\\\ w_{21} & w_{22} & \\cdots & w_{2n} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ w_{i1} & w_{i2} & \\cdots & w_{in}\\end{bmatrix}$ -> Matrix of weight values.\n",
+    "\n",
+    "$\\vec{B} = \\begin{bmatrix} b_1 & b_2 & \\cdots & b_i \\end{bmatrix}$ -> Row vector for the neuron biases.\n",
+    "\n",
+    "$\\vec{Z_j} = \\begin{bmatrix} z_{1j} & z_{2j} & \\cdots & z_{ij} \\end{bmatrix}$ -> Row vector for the neuron outputs for the $j$th batch of data.\n",
+    "\n",
+    "$\\vec{A_j} = \\begin{bmatrix} a_{1j} & a_{2j} & \\cdots & a_{ij} \\end{bmatrix}$ -> Row vector for the activation layer outputs for the $j$th batch of data.\n",
+    "\n",
+    "$y_j$ -> Final output for the $j$th batch of data if this is the final layer (could be a summation, a probability, etc.).\n",
+    "\n",
+    "$l_j$ -> Loss for the $j$th batch of data.\n",
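+    "\n",
+    "With this notation, the forward pass of the layer (as implemented in the code below, for a single batch $j$) can be written compactly as\n",
+    "\n",
+    "$\\vec{Z_j} = \\vec{X_j} \\, \\overline{\\overline{W}}^T + \\vec{B}, \\qquad \\vec{A_j} = f(\\vec{Z_j}),$\n",
+    "\n",
+    "where $f$ is the activation function applied elementwise, so each neuron output is $z_{ij} = \\vec{w_{i}} \\cdot \\vec{X_j} + b_i$."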
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Iteration 0, Loss: 466.56000000000006\n",
+      "Iteration 20, Loss: 5.329595763793193\n",
+      "Iteration 40, Loss: 0.41191524253483786\n",
+      "Iteration 60, Loss: 0.03183621475376345\n",
+      "Iteration 80, Loss: 0.002460565405431671\n",
+      "Iteration 100, Loss: 0.0001901729121621426\n",
+      "Iteration 120, Loss: 1.4698120139337557e-05\n",
+      "Iteration 140, Loss: 1.1359948840900371e-06\n",
+      "Iteration 160, Loss: 8.779778427447647e-08\n",
+      "Iteration 180, Loss: 6.785903626216421e-09\n",
+      "Final weights:\n",
+      " [[-0.00698895 -0.0139779  -0.02096685 -0.0279558 ]\n",
+      " [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
+      " [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
+      "Final biases:\n",
+      " [-0.00698895 -0.04024714 -0.06451539]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Code changed to match new notation\n",
+    "import numpy as np\n",
+    "\n",
+    "# Initial inputs\n",
+    "X = np.array([1, 2, 3, 4])\n",
+    "\n",
+    "# Initial weights and biases\n",
+    "W = np.array([\n",
+    "    [0.1, 0.2, 0.3, 0.4],\n",
+    "    [0.5, 0.6, 0.7, 0.8],\n",
+    "    [0.9, 1.0, 1.1, 1.2]\n",
+    "])\n",
+    "\n",
+    "B = np.array([0.1, 0.2, 0.3])\n",
+    "\n",
+    "learning_rate = 0.001\n",
+    "\n",
+    "# Add the derivative function to the ReLU class\n",
+    "class Activation_ReLU:\n",
+    "    def forward(self, inputs):\n",
+    "        return np.maximum(0, inputs)\n",
+    "    \n",
+    "    def derivative(self, inputs):\n",
+    "        return np.where(inputs > 0, 1, 0)\n",
+    "    \n",
+    "relu = Activation_ReLU()\n",
+    "\n",
+    "num_iterations = 200\n",
+    "\n",
+    "# Training loop\n",
+    "# A single layer of 3 neurons, each with 4 inputs\n",
+    "# The neuron layer is then fed into a ReLU activation layer\n",
+    "for iteration in range(num_iterations):\n",
+    "    # Forward pass\n",
+    "    Z = np.dot(W, X) + B\n",
+    "    A = relu.forward(Z)\n",
+    "    \n",
+    "    # Calculate the squared loss, taking the desired output (the sum) to be 0; a trivial target, but it keeps the example simple\n",
+    "    y = np.sum(A)\n",
+    "    l = y**2\n",
+    "\n",
+    "    # Backward pass (lowercase l denotes the loss, matching the notation above)\n",
+    "    dl_dy = 2 * y\n",
+    "    dy_dA = np.ones_like(A)\n",
+    "    dA_dZ = relu.derivative(Z)\n",
+    "\n",
+    "    dl_dZ = dl_dy * dy_dA * dA_dZ\n",
+    "\n",
+    "    # Get the gradient of the loss with respect to the weights and biases\n",
+    "    # dl_dW = np.outer(dl_dZ, X)\n",
+    "    dl_dW = X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)\n",
+    "    dl_dB = dl_dZ\n",
+    "\n",
+    "    # Update the weights and biases\n",
+    "    # Remove the .T if using dl_dW = np.outer(dl_dZ, X)\n",
+    "    W -= learning_rate * dl_dW.T\n",
+    "    B -= learning_rate * dl_dB\n",
+    "\n",
+    "    # Print the loss every 20 iterations\n",
+    "    if iteration % 20 == 0:\n",
+    "        print(f\"Iteration {iteration}, Loss: {l}\")\n",
+    "\n",
+    "# Final weights and biases\n",
+    "print(\"Final weights:\\n\", W)\n",
+    "print(\"Final biases:\\n\", B)\n"
+   ]
+  }
 ],
 "metadata": {