Change of Notation after Lecture 15-17
parent 12afb03d1f
commit 23fca634fb
@@ -8,9 +8,7 @@ Lectures 7-11 use same handout.
Lecture 12 uses same handout.
Lectures 13-15 use same handout.
Lectures 16-17 use same handout.
Lectures 13-17 use same handout.
Lecture 18 uses same handout.
@@ -1,122 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n",
"Iteration 40, Loss: 0.41191523404899866\n",
"Iteration 60, Loss: 0.031836212079467595\n",
"Iteration 80, Loss: 0.002460565465389601\n",
"Iteration 100, Loss: 0.000190172825660145\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"inputs = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"weights = np.array([\n",
"    [0.1, 0.2, 0.3, 0.4],\n",
"    [0.5, 0.6, 0.7, 0.8],\n",
"    [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"biases = np.array([0.1, 0.2, 0.3])\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.001\n",
"\n",
"# ReLU activation function and its derivative\n",
"def relu(x):\n",
"    return np.maximum(0, x)\n",
"\n",
"def relu_derivative(x):\n",
"    return np.where(x > 0, 1, 0)\n",
"\n",
"# Training loop\n",
"for iteration in range(200):\n",
"    # Forward pass\n",
"    z = np.dot(weights, inputs) + biases\n",
"    a = relu(z)\n",
"    y = np.sum(a)\n",
"\n",
"    # Calculate loss\n",
"    loss = y ** 2\n",
"\n",
"    # Backward pass\n",
"    # Gradient of loss with respect to output y\n",
"    dL_dy = 2 * y\n",
"\n",
"    # Gradient of y with respect to a\n",
"    dy_da = np.ones_like(a)\n",
"\n",
"    # Gradient of loss with respect to a\n",
"    dL_da = dL_dy * dy_da\n",
"\n",
"    # Gradient of a with respect to z (ReLU derivative)\n",
"    da_dz = relu_derivative(z)\n",
"\n",
"    # Gradient of loss with respect to z\n",
"    dL_dz = dL_da * da_dz\n",
"\n",
"    # Gradient of loss with respect to weights and biases\n",
"    dL_dW = np.outer(dL_dz, inputs)\n",
"    dL_db = dL_dz\n",
"\n",
"    # Update weights and biases\n",
"    weights -= learning_rate * dL_dW\n",
"    biases -= learning_rate * dL_db\n",
"\n",
"    # Print the loss every 20 iterations\n",
"    if iteration % 20 == 0:\n",
"        print(f\"Iteration {iteration}, Loss: {loss}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -366,12 +366,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Backpropagation of a Layer"
"# Backpropagation of a Layer\n",
"Same thing as a single neuron, but now using matrices to keep track of each neuron in the layer.\n",
"\n",
"If there are multiple input arrays (batches), the sum of the per-batch losses can be taken as the total loss; the gradient of the total loss with respect to a weight or bias is then the sum of the gradients of each batch's loss with respect to that weight or bias, evaluated at that batch's input (see the sketch just after this cell).\n",
"\n",
"In this example, where the final output is a plain sum of the activations, the partial derivative of the loss with respect to a specific weight or bias is the same across all neurons of the layer for a given batch; i.e., the weight gradient matrix repeats the same gradient vector for each of the N neurons, and the bias gradient is a single row of N copies of the same value."
]
},
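A minimal sketch of the batch accumulation described in the cell above, assuming a hypothetical second input batch alongside the notebook's original inputs, the same weights, biases, ReLU, and sum-then-square loss, and the same row-per-neuron weight layout as the code cell below. It is an illustration of the idea, not part of the committed notebook:

import numpy as np

# Same layer as in the cells here: 3 neurons, 4 inputs, one row of W per neuron.
W = np.array([[0.1, 0.2, 0.3, 0.4],
              [0.5, 0.6, 0.7, 0.8],
              [0.9, 1.0, 1.1, 1.2]])
B = np.array([0.1, 0.2, 0.3])

# Batch 1 is the notebook's input; batch 2 is a made-up illustrative batch.
X_batches = [np.array([1.0, 2.0, 3.0, 4.0]),
             np.array([0.5, 1.0, 1.5, 2.0])]

dL_dW_total = np.zeros_like(W)  # accumulated weight gradient
dL_dB_total = np.zeros_like(B)  # accumulated bias gradient
total_loss = 0.0

for X in X_batches:
    Z = W @ X + B                      # pre-activations for this batch
    A = np.maximum(0.0, Z)             # ReLU
    y = A.sum()                        # final output for this batch
    total_loss += y ** 2               # total loss = sum of per-batch losses
    dL_dZ = 2.0 * y * (Z > 0)          # per-batch gradient w.r.t. Z
    dL_dW_total += np.outer(dL_dZ, X)  # sum of per-batch weight gradients
    dL_dB_total += dL_dZ               # sum of per-batch bias gradients

print("Total loss:", total_loss)
print("Summed weight gradient:\n", dL_dW_total)
print("Summed bias gradient:\n", dL_dB_total)

Dividing the sums by the number of batches gives the mean-loss variant; either way, each batch's gradient is computed independently and the results are combined before the weight update.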
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -379,18 +384,18 @@
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n",
"Iteration 40, Loss: 0.41191523404899866\n",
"Iteration 60, Loss: 0.031836212079467595\n",
"Iteration 80, Loss: 0.002460565465389601\n",
"Iteration 100, Loss: 0.000190172825660145\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n",
"Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
" [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
@@ -463,6 +468,124 @@
"print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change of Notation\n",
"The previous notation is clunky and long. From here forward, we will use the following notation for a layer with $n$ inputs and $i$ neurons. The neuron layer is followed by an activation layer, which is then fed into a final value $y$ with a computed loss $l$. Batches of data are indexed by $j$.\n",
"\n",
"$\\vec{X_j} = \\begin{bmatrix} x_{1j} & x_{2j} & \\cdots & x_{nj} \\end{bmatrix}$ -> Row vector of the layer inputs for the $j$-th batch of data.\n",
"\n",
"$\\overline{\\overline{W}} = \\begin{bmatrix} \\vec{w_{1}} \\\\ \\vec{w_{2}} \\\\ \\vdots \\\\ \\vec{w_{i}} \\end{bmatrix} = \\begin{bmatrix} w_{11} & w_{12} & \\cdots & w_{1n} \\\\ w_{21} & w_{22} & \\cdots & w_{2n} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ w_{i1} & w_{i2} & \\cdots & w_{in}\\end{bmatrix}$ -> Matrix of weight values.\n",
"\n",
"$\\vec{B} = \\begin{bmatrix} b_1 & b_2 & \\cdots & b_i \\end{bmatrix}$ -> Row vector of the neuron biases.\n",
"\n",
"$\\vec{Z_j} = \\begin{bmatrix} z_{1j} & z_{2j} & \\cdots & z_{ij} \\end{bmatrix}$ -> Row vector of the neuron outputs for the $j$-th batch of data.\n",
"\n",
"$\\vec{A_j} = \\begin{bmatrix} a_{1j} & a_{2j} & \\cdots & a_{ij} \\end{bmatrix}$ -> Row vector of the activation layer outputs for the $j$-th batch of data.\n",
"\n",
"$y_j$ -> Final layer output for the $j$-th batch of data if the layer is the final layer (could be a summation, a probability, etc.).\n",
"\n",
"$l_j$ -> Loss for the $j$-th batch of data."
]
},
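As a quick worked summary in this notation, assuming (as in the code cells here) that the final output is the plain sum of the activations and the loss is the square of that sum, the forward and backward passes for batch $j$ are:

$\vec{Z_j} = \vec{X_j}\,\overline{\overline{W}}^{T} + \vec{B}, \qquad \vec{A_j} = \mathrm{ReLU}(\vec{Z_j}), \qquad y_j = \sum_{k=1}^{i} a_{kj}, \qquad l_j = y_j^2$

$\frac{\partial l_j}{\partial \vec{Z_j}} = 2\,y_j\,\mathrm{ReLU}'(\vec{Z_j}), \qquad \frac{\partial l_j}{\partial \overline{\overline{W}}} = \left(\frac{\partial l_j}{\partial \vec{Z_j}}\right)^{T} \vec{X_j}, \qquad \frac{\partial l_j}{\partial \vec{B}} = \frac{\partial l_j}{\partial \vec{Z_j}}$

Here $\mathrm{ReLU}'$ is applied elementwise, and $\partial l_j / \partial \overline{\overline{W}}$ is an $i \times n$ matrix matching $\overline{\overline{W}}$; the code cell below builds its transpose (X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)), which is why the update step applies dL_dW.T.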
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"# Code changed to match new notation\n",
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"X = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"W = np.array([\n",
"    [0.1, 0.2, 0.3, 0.4],\n",
"    [0.5, 0.6, 0.7, 0.8],\n",
"    [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"B = np.array([0.1, 0.2, 0.3])\n",
"\n",
"learning_rate = 0.001\n",
"\n",
"# Add the derivative function to the ReLU class\n",
"class Activation_ReLU:\n",
"    def forward(self, inputs):\n",
"        return np.maximum(0, inputs)\n",
"    \n",
"    def derivative(self, inputs):\n",
"        return np.where(inputs > 0, 1, 0)\n",
"    \n",
"relu = Activation_ReLU()\n",
"\n",
"num_iterations = 200\n",
"\n",
"# Training loop\n",
"# A single layer of 3 neurons, each with 4 inputs\n",
"# The neuron layer is then fed into a ReLU activation layer\n",
"for iteration in range(num_iterations):\n",
"    # Forward pass\n",
"    Z = np.dot(W, X) + B\n",
"    A = relu.forward(Z)\n",
"    \n",
"    # Calculate the squared loss, assuming the desired output is a sum of 0 (trivial, but just an example)\n",
"    y = np.sum(A)\n",
"    l = y**2\n",
"\n",
"    # Backward pass\n",
"    dL_dy = 2 * y\n",
"    dy_dA = np.ones_like(A)\n",
"    dA_dZ = relu.derivative(Z)\n",
"\n",
"    dl_dZ = dL_dy * dy_dA * dA_dZ\n",
"\n",
"    # Get the gradient of the loss with respect to the weights and biases\n",
"    # dL_dW = np.outer(dl_dZ, X)\n",
"    dL_dW = X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)\n",
"    dL_dB = dl_dZ\n",
"\n",
"    # Update the weights and biases\n",
"    # Remove the .T if using dL_dW = np.outer(dl_dZ, X)\n",
"    W -= learning_rate * dL_dW.T\n",
"    B -= learning_rate * dL_dB\n",
"\n",
"    # Print the loss every 20 iterations\n",
"    if iteration % 20 == 0:\n",
"        print(f\"Iteration {iteration}, Loss: {l}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", W)\n",
"print(\"Final biases:\\n\", B)\n"
]
}
],
"metadata": {