Change of Notation after Lectures 15-17

This commit is contained in:
judsonupchurch 2024-12-31 01:29:07 +00:00
parent 12afb03d1f
commit 23fca634fb
4 changed files with 137 additions and 138 deletions


@ -8,9 +8,7 @@ Lectures 7-11 use same handout.
Lecture 12 uses same handout.
-Lectures 13-15 use same handout.
-Lectures 16-17 use same handout.
+Lectures 13-17 use same handout.
Lecture 18 uses same handout.


@ -1,122 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n",
"Iteration 40, Loss: 0.41191523404899866\n",
"Iteration 60, Loss: 0.031836212079467595\n",
"Iteration 80, Loss: 0.002460565465389601\n",
"Iteration 100, Loss: 0.000190172825660145\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"inputs = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"weights = np.array([\n",
" [0.1, 0.2, 0.3, 0.4],\n",
" [0.5, 0.6, 0.7, 0.8],\n",
" [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"biases = np.array([0.1, 0.2, 0.3])\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.001\n",
"\n",
"# ReLU activation function and its derivative\n",
"def relu(x):\n",
" return np.maximum(0, x)\n",
"\n",
"def relu_derivative(x):\n",
" return np.where(x > 0, 1, 0)\n",
"\n",
"# Training loop\n",
"for iteration in range(200):\n",
" # Forward pass\n",
" z = np.dot(weights, inputs) + biases\n",
" a = relu(z)\n",
" y = np.sum(a)\n",
"\n",
" # Calculate loss\n",
" loss = y ** 2\n",
"\n",
" # Backward pass\n",
" # Gradient of loss with respect to output y\n",
" dL_dy = 2 * y\n",
"\n",
" # Gradient of y with respect to a\n",
" dy_da = np.ones_like(a)\n",
"\n",
" # Gradient of loss with respect to a\n",
" dL_da = dL_dy * dy_da\n",
"\n",
" # Gradient of a with respect to z (ReLU derivative)\n",
" da_dz = relu_derivative(z)\n",
"\n",
" # Gradient of loss with respect to z\n",
" dL_dz = dL_da * da_dz\n",
"\n",
" # Gradient of z with respect to weights and biases\n",
" dL_dW = np.outer(dL_dz, inputs)\n",
" dL_db = dL_dz\n",
"\n",
" # Update weights and biases\n",
" weights -= learning_rate * dL_dW\n",
" biases -= learning_rate * dL_db\n",
"\n",
" # Print the loss every 20 iterations\n",
" if iteration % 20 == 0:\n",
" print(f\"Iteration {iteration}, Loss: {loss}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -366,12 +366,17 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Backpropagation of a Layer" "# Backpropagation of a Layer\n",
"Same thing as a single neuron, but now using matrices to keep track of each neuron in the layer.\n",
"\n",
"If there are multiple input arrays (batches), one can take the summation of the loss from each batch as a total loss, and therefore the gradient of the total loss with respect to a weight or bias is the summation of the gradients of each batch's loss with respect to the weight or bias given that batch's input.\n",
"\n",
"In general, the partial derivative of the loss with respect to a specific weight or bias remains the same across all neurons of that layer for that batch. ie, the weight gradient matrix has the same column vector for N number of neurons. The bias gradient matrix is similar but is a single row of N elements for the same value."
] ]
}, },
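The batched case described above is not exercised by the notebook's code, which uses a single input vector. Below is a minimal sketch of the summation idea, assuming the same 3-neuron, 4-input layer and a hypothetical two-row batch matrix `X_batch` (one row per batch; the name is illustrative, not from the notebook):

```python
import numpy as np

# Hypothetical batch: one row per input array (2 batches of 4 inputs)
X_batch = np.array([[1.0, 2.0, 3.0, 4.0],
                    [2.0, 1.0, 0.0, 1.0]])
W = np.array([[0.1, 0.2, 0.3, 0.4],
              [0.5, 0.6, 0.7, 0.8],
              [0.9, 1.0, 1.1, 1.2]])
B = np.array([0.1, 0.2, 0.3])

# Accumulate the gradient of the total loss as the sum of per-batch gradients
dW_total = np.zeros_like(W)
dB_total = np.zeros_like(B)
total_loss = 0.0
for x in X_batch:
    z = W @ x + B              # neuron outputs for this batch
    a = np.maximum(0.0, z)     # ReLU activations
    y = a.sum()                # summed final output
    total_loss += y ** 2
    dl_dz = 2.0 * y * (z > 0)  # chain rule through the sum and the ReLU
    dW_total += np.outer(dl_dz, x)
    dB_total += dl_dz
```

A single update step would then use `dW_total` and `dB_total` exactly as the per-batch gradients are used in the training loop below.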
{
"cell_type": "code",
-"execution_count": 13,
+"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -379,18 +384,18 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Iteration 0, Loss: 466.56000000000006\n", "Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n", "Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191523404899866\n", "Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.031836212079467595\n", "Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565465389601\n", "Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.000190172825660145\n", "Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n", "Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n", "Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n", "Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n", "Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n", "Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n", " [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n", " [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n", " [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n", "Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n" " [-0.00698895 -0.04024714 -0.06451539]\n"
@ -463,6 +468,124 @@
"print(\"Final weights:\\n\", weights)\n", "print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n" "print(\"Final biases:\\n\", biases)\n"
] ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change of Notation\n",
"The previous notation is clunky and long. From here forward, we will use the following notation for a layer with $n$ inputs and $i$ neurons. The neruon layer has is followed by an activation layer and then fed into a final value $y$ with a computed loss $l$. There can be $j$ batches of data.\n",
"\n",
"$\\vec{X_j} = \\begin{bmatrix} x_{1j} & x_{2j} & \\cdots & x_{nj} \\end{bmatrix}$ -> Row vector for the layer inputs for the $j$ batch of data.\n",
"\n",
"$\\overline{\\overline{W}} = \\begin{bmatrix} \\vec{w_{1}} \\\\ \\vec{w_{2}} \\\\ \\vdots \\\\ \\vec{w_{i}} \\end{bmatrix} = \\begin{bmatrix} w_{11} & w_{12} & \\cdots & w_{1n} \\\\ w_{21} & w_{22} & \\cdots & w_{2n} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ w_{i1} & w_{i2} & \\cdots & w_{in}\\end{bmatrix}$ -> Matrix of weight values.\n",
"\n",
"$\\vec{B} = \\begin{bmatrix} b_1 & b_2 & \\cdots & b_i \\end{bmatrix}$ -> Row vector for the neuron biases\n",
"\n",
"$\\vec{Z_j} = \\begin{bmatrix} z_{1j} & z_{2j} & \\cdots & z_{ij} \\end{bmatrix}$ -> Row vector for the neuron outputs for the $j$ batch of data.\n",
"\n",
"$\\vec{A_j} = \\begin{bmatrix} a_{1j} & a_{2j} & \\cdots & a_{ij} \\end{bmatrix}$ -> Row vector for the activation later outputs for the $j$ batch of data.\n",
"\n",
"$y_j$ -> Final layer output for the $j$ batch of data if the layer is the final layer (could be summation, probability, etc).\n",
"\n",
"$l_j$ -> Loss for the $j$ batch of data."
]
},
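Putting the definitions together for the running example (assuming, as in the code below, a ReLU activation layer and a summed output with a squared loss), the forward pass in the new notation reads:

$$\vec{Z_j} = \vec{X_j}\,\overline{\overline{W}}^{\,T} + \vec{B}, \qquad \vec{A_j} = \mathrm{ReLU}(\vec{Z_j}), \qquad y_j = \sum_{k=1}^{i} a_{kj}, \qquad l_j = y_j^2$$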
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"# Code changed to match new notation\n",
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"X = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"W = np.array([\n",
" [0.1, 0.2, 0.3, 0.4],\n",
" [0.5, 0.6, 0.7, 0.8],\n",
" [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"B = np.array([0.1, 0.2, 0.3])\n",
"\n",
"learning_rate = 0.001\n",
"\n",
"# Add the derivative function to the ReLU class\n",
"class Activation_ReLU:\n",
" def forward(self, inputs):\n",
" return np.maximum(0, inputs)\n",
" \n",
" def derivative(self, inputs):\n",
" return np.where(inputs > 0, 1, 0)\n",
" \n",
"relu = Activation_ReLU()\n",
"\n",
"num_iterations = 200\n",
"\n",
"# Training loop\n",
"# A single layer of 3 neurons, each with 4 inputs\n",
"# The neuron layer is then fed into a ReLU activation layer\n",
"for iteration in range(num_iterations):\n",
" # Forward pass\n",
" Z = np.dot(W, X) + B\n",
" A = relu.forward(Z)\n",
" \n",
" # Calculate the squared loss assuming the desired output is a sum of 0. Trivial but just an example\n",
" y = np.sum(A)\n",
" l = y**2\n",
"\n",
" # Backward pass\n",
" dL_dy = 2 * y\n",
" dy_dA = np.ones_like(A)\n",
" dA_dZ = relu.derivative(Z)\n",
"\n",
" dl_dZ = dL_dy * dy_dA * dA_dZ\n",
"\n",
" # Get the gradient of the Loss with respect to the weights and biases\n",
" # dL_dW = np.outer(dl_dz, X)\n",
" dL_dW = X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)\n",
" dL_dB = dl_dZ\n",
"\n",
" # Update the weights and biases\n",
" # Remove the .T if using dL_dW = np.outer(dl_dz, X)\n",
" W -= learning_rate * dL_dW.T\n",
" B -= learning_rate * dL_dB\n",
"\n",
" # Print the loss every 20 iterations\n",
" if iteration % 20 == 0:\n",
" print(f\"Iteration {iteration}, Loss: {l}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", W)\n",
"print(\"Final biases:\\n\", B)\n"
]
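A note on the design choice in this cell: `dl_dW` is assembled with one column per neuron, shape (inputs × neurons), which is why the update applies `.T`; the commented-out `np.outer` form would produce the (neurons × inputs) layout directly and need no transpose. The per-column structure is what the earlier markdown means by the weight gradient matrix having the same column vector for each active neuron.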
}
],
"metadata": {