Change of Notation after Lecture 15-17
parent 12afb03d1f
commit 23fca634fb
@@ -8,9 +8,7 @@ Lectures 7-11 use same handout.
Lecture 12 uses same handout.
Lectures 13-15 use same handout.
Lectures 16-17 use same handout.
Lectures 13-17 use same handout.
Lecture 18 uses same handout.
@@ -1,122 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n",
"Iteration 40, Loss: 0.41191523404899866\n",
"Iteration 60, Loss: 0.031836212079467595\n",
"Iteration 80, Loss: 0.002460565465389601\n",
"Iteration 100, Loss: 0.000190172825660145\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"inputs = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"weights = np.array([\n",
"    [0.1, 0.2, 0.3, 0.4],\n",
"    [0.5, 0.6, 0.7, 0.8],\n",
"    [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"biases = np.array([0.1, 0.2, 0.3])\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.001\n",
"\n",
"# ReLU activation function and its derivative\n",
"def relu(x):\n",
"    return np.maximum(0, x)\n",
"\n",
"def relu_derivative(x):\n",
"    return np.where(x > 0, 1, 0)\n",
"\n",
"# Training loop\n",
"for iteration in range(200):\n",
"    # Forward pass\n",
"    z = np.dot(weights, inputs) + biases\n",
"    a = relu(z)\n",
"    y = np.sum(a)\n",
"\n",
"    # Calculate loss\n",
"    loss = y ** 2\n",
"\n",
"    # Backward pass\n",
"    # Gradient of loss with respect to output y\n",
"    dL_dy = 2 * y\n",
"\n",
"    # Gradient of y with respect to a\n",
"    dy_da = np.ones_like(a)\n",
"\n",
"    # Gradient of loss with respect to a\n",
"    dL_da = dL_dy * dy_da\n",
"\n",
"    # Gradient of a with respect to z (ReLU derivative)\n",
"    da_dz = relu_derivative(z)\n",
"\n",
"    # Gradient of loss with respect to z\n",
"    dL_dz = dL_da * da_dz\n",
"\n",
"    # Gradient of loss with respect to weights and biases\n",
"    dL_dW = np.outer(dL_dz, inputs)\n",
"    dL_db = dL_dz\n",
"\n",
"    # Update weights and biases\n",
"    weights -= learning_rate * dL_dW\n",
"    biases -= learning_rate * dL_db\n",
"\n",
"    # Print the loss every 20 iterations\n",
"    if iteration % 20 == 0:\n",
"        print(f\"Iteration {iteration}, Loss: {loss}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -366,12 +366,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Backpropagation of a Layer"
"# Backpropagation of a Layer\n",
"Same thing as a single neuron, but now using matrices to keep track of each neuron in the layer.\n",
"\n",
"If there are multiple input arrays (batches), the sum of the per-batch losses can be taken as the total loss; the gradient of the total loss with respect to a weight or bias is then the sum of the gradients of each batch's loss with respect to that weight or bias, evaluated at that batch's input (see the sketch just after this cell).\n",
"\n",
"In this example, where the final output is a plain sum of the activations, the partial derivative of the loss with respect to a specific weight or bias is the same across all neurons of the layer for a given batch; i.e., the weight gradient matrix repeats the same gradient vector for each of the N neurons, and the bias gradient is a single row of N copies of the same value."
]
},
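A minimal sketch of the batch accumulation described in the cell above, assuming a hypothetical second input batch alongside the notebook's original inputs, the same weights, biases, ReLU, and sum-then-square loss, and the same row-per-neuron weight layout as the code cell below. It is an illustration of the idea, not part of the committed notebook:

import numpy as np

# Same layer as in the cells here: 3 neurons, 4 inputs, one row of W per neuron.
W = np.array([[0.1, 0.2, 0.3, 0.4],
              [0.5, 0.6, 0.7, 0.8],
              [0.9, 1.0, 1.1, 1.2]])
B = np.array([0.1, 0.2, 0.3])

# Batch 1 is the notebook's input; batch 2 is a made-up illustrative batch.
X_batches = [np.array([1.0, 2.0, 3.0, 4.0]),
             np.array([0.5, 1.0, 1.5, 2.0])]

dL_dW_total = np.zeros_like(W)  # accumulated weight gradient
dL_dB_total = np.zeros_like(B)  # accumulated bias gradient
total_loss = 0.0

for X in X_batches:
    Z = W @ X + B                      # pre-activations for this batch
    A = np.maximum(0.0, Z)             # ReLU
    y = A.sum()                        # final output for this batch
    total_loss += y ** 2               # total loss = sum of per-batch losses
    dL_dZ = 2.0 * y * (Z > 0)          # per-batch gradient w.r.t. Z
    dL_dW_total += np.outer(dL_dZ, X)  # sum of per-batch weight gradients
    dL_dB_total += dL_dZ               # sum of per-batch bias gradients

print("Total loss:", total_loss)
print("Summed weight gradient:\n", dL_dW_total)
print("Summed bias gradient:\n", dL_dB_total)

Dividing the sums by the number of batches gives the mean-loss variant; either way, each batch's gradient is computed independently and the results are combined before the weight update.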
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -379,18 +384,18 @@
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.32959636083938\n",
"Iteration 40, Loss: 0.41191523404899866\n",
"Iteration 60, Loss: 0.031836212079467595\n",
"Iteration 80, Loss: 0.002460565465389601\n",
"Iteration 100, Loss: 0.000190172825660145\n",
"Iteration 120, Loss: 1.4698126966451542e-05\n",
"Iteration 140, Loss: 1.1359926717815175e-06\n",
"Iteration 160, Loss: 8.779889800154524e-08\n",
"Iteration 180, Loss: 6.7858241357822796e-09\n",
"Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.01397789 -0.02096684 -0.02795579]\n",
" [ 0.25975286 0.11950572 -0.02074143 -0.16098857]\n",
" [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
@@ -463,6 +468,124 @@
"print(\"Final weights:\\n\", weights)\n",
"print(\"Final biases:\\n\", biases)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Change of Notation\n",
"The previous notation is clunky and long. From here forward, we will use the following notation for a layer with $n$ inputs and $i$ neurons. The neuron layer is followed by an activation layer, which is then fed into a final value $y$ with a computed loss $l$. Batches of data are indexed by $j$.\n",
"\n",
"$\\vec{X_j} = \\begin{bmatrix} x_{1j} & x_{2j} & \\cdots & x_{nj} \\end{bmatrix}$ -> Row vector of the layer inputs for the $j$-th batch of data.\n",
"\n",
"$\\overline{\\overline{W}} = \\begin{bmatrix} \\vec{w_{1}} \\\\ \\vec{w_{2}} \\\\ \\vdots \\\\ \\vec{w_{i}} \\end{bmatrix} = \\begin{bmatrix} w_{11} & w_{12} & \\cdots & w_{1n} \\\\ w_{21} & w_{22} & \\cdots & w_{2n} \\\\ \\vdots & \\vdots & \\ddots & \\vdots \\\\ w_{i1} & w_{i2} & \\cdots & w_{in}\\end{bmatrix}$ -> Matrix of weight values.\n",
"\n",
"$\\vec{B} = \\begin{bmatrix} b_1 & b_2 & \\cdots & b_i \\end{bmatrix}$ -> Row vector of the neuron biases.\n",
"\n",
"$\\vec{Z_j} = \\begin{bmatrix} z_{1j} & z_{2j} & \\cdots & z_{ij} \\end{bmatrix}$ -> Row vector of the neuron outputs for the $j$-th batch of data.\n",
"\n",
"$\\vec{A_j} = \\begin{bmatrix} a_{1j} & a_{2j} & \\cdots & a_{ij} \\end{bmatrix}$ -> Row vector of the activation layer outputs for the $j$-th batch of data.\n",
"\n",
"$y_j$ -> Final layer output for the $j$-th batch of data if the layer is the final layer (could be a summation, a probability, etc.).\n",
"\n",
"$l_j$ -> Loss for the $j$-th batch of data."
]
},
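As a quick worked summary in this notation, assuming (as in the code cells here) that the final output is the plain sum of the activations and the loss is the square of that sum, the forward and backward passes for batch $j$ are:

$\vec{Z_j} = \vec{X_j}\,\overline{\overline{W}}^{T} + \vec{B}, \qquad \vec{A_j} = \mathrm{ReLU}(\vec{Z_j}), \qquad y_j = \sum_{k=1}^{i} a_{kj}, \qquad l_j = y_j^2$

$\frac{\partial l_j}{\partial \vec{Z_j}} = 2\,y_j\,\mathrm{ReLU}'(\vec{Z_j}), \qquad \frac{\partial l_j}{\partial \overline{\overline{W}}} = \left(\frac{\partial l_j}{\partial \vec{Z_j}}\right)^{T} \vec{X_j}, \qquad \frac{\partial l_j}{\partial \vec{B}} = \frac{\partial l_j}{\partial \vec{Z_j}}$

Here $\mathrm{ReLU}'$ is applied elementwise, and $\partial l_j / \partial \overline{\overline{W}}$ is an $i \times n$ matrix matching $\overline{\overline{W}}$; the code cell below builds its transpose (X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)), which is why the update step applies dL_dW.T.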
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration 0, Loss: 466.56000000000006\n",
"Iteration 20, Loss: 5.329595763793193\n",
"Iteration 40, Loss: 0.41191524253483786\n",
"Iteration 60, Loss: 0.03183621475376345\n",
"Iteration 80, Loss: 0.002460565405431671\n",
"Iteration 100, Loss: 0.0001901729121621426\n",
"Iteration 120, Loss: 1.4698120139337557e-05\n",
"Iteration 140, Loss: 1.1359948840900371e-06\n",
"Iteration 160, Loss: 8.779778427447647e-08\n",
"Iteration 180, Loss: 6.785903626216421e-09\n",
"Final weights:\n",
" [[-0.00698895 -0.0139779 -0.02096685 -0.0279558 ]\n",
" [ 0.25975286 0.11950571 -0.02074143 -0.16098857]\n",
" [ 0.53548461 0.27096922 0.00645383 -0.25806156]]\n",
"Final biases:\n",
" [-0.00698895 -0.04024714 -0.06451539]\n"
]
}
],
"source": [
"# Code changed to match new notation\n",
"import numpy as np\n",
"\n",
"# Initial inputs\n",
"X = np.array([1, 2, 3, 4])\n",
"\n",
"# Initial weights and biases\n",
"W = np.array([\n",
"    [0.1, 0.2, 0.3, 0.4],\n",
"    [0.5, 0.6, 0.7, 0.8],\n",
"    [0.9, 1.0, 1.1, 1.2]\n",
"])\n",
"\n",
"B = np.array([0.1, 0.2, 0.3])\n",
"\n",
"learning_rate = 0.001\n",
"\n",
"# Add the derivative function to the ReLU class\n",
"class Activation_ReLU:\n",
"    def forward(self, inputs):\n",
"        return np.maximum(0, inputs)\n",
"    \n",
"    def derivative(self, inputs):\n",
"        return np.where(inputs > 0, 1, 0)\n",
"    \n",
"relu = Activation_ReLU()\n",
"\n",
"num_iterations = 200\n",
"\n",
"# Training loop\n",
"# A single layer of 3 neurons, each with 4 inputs\n",
"# The neuron layer is then fed into a ReLU activation layer\n",
"for iteration in range(num_iterations):\n",
"    # Forward pass\n",
"    Z = np.dot(W, X) + B\n",
"    A = relu.forward(Z)\n",
"    \n",
"    # Calculate the squared loss, assuming the desired output is a sum of 0 (trivial, but just an example)\n",
"    y = np.sum(A)\n",
"    l = y**2\n",
"\n",
"    # Backward pass\n",
"    dL_dy = 2 * y\n",
"    dy_dA = np.ones_like(A)\n",
"    dA_dZ = relu.derivative(Z)\n",
"\n",
"    dl_dZ = dL_dy * dy_dA * dA_dZ\n",
"\n",
"    # Get the gradient of the loss with respect to the weights and biases\n",
"    # dL_dW = np.outer(dl_dZ, X)\n",
"    dL_dW = X.reshape(-1, 1) @ dl_dZ.reshape(1, -1)\n",
"    dL_dB = dl_dZ\n",
"\n",
"    # Update the weights and biases\n",
"    # Remove the .T if using dL_dW = np.outer(dl_dZ, X)\n",
"    W -= learning_rate * dL_dW.T\n",
"    B -= learning_rate * dL_dB\n",
"\n",
"    # Print the loss every 20 iterations\n",
"    if iteration % 20 == 0:\n",
"        print(f\"Iteration {iteration}, Loss: {l}\")\n",
"\n",
"# Final weights and biases\n",
"print(\"Final weights:\\n\", W)\n",
"print(\"Final biases:\\n\", B)\n"
]
}
],
"metadata": {