Lecture 28. Overfitting and generalization
This commit is contained in: parent 4a22ddc3b5, commit a95ddb61b8
@@ -786,114 +786,120 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch: 0, acc: 0.327, loss: 1.099, lr: 0.02\n",
|
||||
"epoch: 100, acc: 0.523, loss: 0.941, lr: 0.01998021958261321\n",
|
||||
"epoch: 200, acc: 0.517, loss: 0.854, lr: 0.019960279044701046\n",
|
||||
"epoch: 300, acc: 0.630, loss: 0.753, lr: 0.019940378268975763\n",
|
||||
"epoch: 400, acc: 0.693, loss: 0.699, lr: 0.01992051713662487\n",
|
||||
"epoch: 500, acc: 0.693, loss: 0.658, lr: 0.01990069552930875\n",
|
||||
"epoch: 600, acc: 0.707, loss: 0.632, lr: 0.019880913329158343\n",
|
||||
"epoch: 700, acc: 0.707, loss: 0.612, lr: 0.019861170418772778\n",
|
||||
"epoch: 800, acc: 0.720, loss: 0.572, lr: 0.019841466681217078\n",
|
||||
"epoch: 900, acc: 0.737, loss: 0.558, lr: 0.01982180200001982\n",
|
||||
"epoch: 1000, acc: 0.730, loss: 0.548, lr: 0.019802176259170884\n",
|
||||
"epoch: 1100, acc: 0.730, loss: 0.540, lr: 0.01978258934311912\n",
|
||||
"epoch: 1200, acc: 0.733, loss: 0.529, lr: 0.01976304113677013\n",
|
||||
"epoch: 1300, acc: 0.733, loss: 0.521, lr: 0.019743531525483964\n",
|
||||
"epoch: 1400, acc: 0.750, loss: 0.517, lr: 0.01972406039507293\n",
|
||||
"epoch: 1500, acc: 0.750, loss: 0.518, lr: 0.019704627631799327\n",
|
||||
"epoch: 1600, acc: 0.743, loss: 0.508, lr: 0.019685233122373254\n",
|
||||
"epoch: 1700, acc: 0.723, loss: 0.505, lr: 0.019665876753950384\n",
|
||||
"epoch: 1800, acc: 0.740, loss: 0.502, lr: 0.01964655841412981\n",
|
||||
"epoch: 1900, acc: 0.740, loss: 0.498, lr: 0.019627277990951823\n",
|
||||
"epoch: 2000, acc: 0.730, loss: 0.493, lr: 0.019608035372895814\n",
|
||||
"epoch: 2100, acc: 0.753, loss: 0.491, lr: 0.01958883044887805\n",
|
||||
"epoch: 2200, acc: 0.760, loss: 0.488, lr: 0.019569663108249594\n",
|
||||
"epoch: 2300, acc: 0.757, loss: 0.487, lr: 0.01955053324079414\n",
|
||||
"epoch: 2400, acc: 0.743, loss: 0.482, lr: 0.019531440736725945\n",
|
||||
"epoch: 2500, acc: 0.743, loss: 0.479, lr: 0.019512385486687673\n",
|
||||
"epoch: 2600, acc: 0.760, loss: 0.479, lr: 0.019493367381748363\n",
|
||||
"epoch: 2700, acc: 0.797, loss: 0.462, lr: 0.019474386313401298\n",
|
||||
"epoch: 2800, acc: 0.787, loss: 0.439, lr: 0.019455442173562\n",
|
||||
"epoch: 2900, acc: 0.790, loss: 0.436, lr: 0.019436534854566128\n",
|
||||
"epoch: 3000, acc: 0.800, loss: 0.435, lr: 0.01941766424916747\n",
|
||||
"epoch: 3100, acc: 0.787, loss: 0.433, lr: 0.019398830250535893\n",
|
||||
"epoch: 3200, acc: 0.793, loss: 0.430, lr: 0.019380032752255354\n",
|
||||
"epoch: 3300, acc: 0.800, loss: 0.429, lr: 0.01936127164832186\n",
|
||||
"epoch: 3400, acc: 0.790, loss: 0.427, lr: 0.01934254683314152\n",
|
||||
"epoch: 3500, acc: 0.790, loss: 0.426, lr: 0.019323858201528515\n",
|
||||
"epoch: 3600, acc: 0.783, loss: 0.428, lr: 0.019305205648703173\n",
|
||||
"epoch: 3700, acc: 0.790, loss: 0.424, lr: 0.01928658907028997\n",
|
||||
"epoch: 3800, acc: 0.773, loss: 0.428, lr: 0.01926800836231563\n",
|
||||
"epoch: 3900, acc: 0.787, loss: 0.420, lr: 0.019249463421207133\n",
|
||||
"epoch: 4000, acc: 0.797, loss: 0.417, lr: 0.019230954143789846\n",
|
||||
"epoch: 4100, acc: 0.797, loss: 0.415, lr: 0.019212480427285565\n",
|
||||
"epoch: 4200, acc: 0.807, loss: 0.414, lr: 0.019194042169310647\n",
|
||||
"epoch: 4300, acc: 0.810, loss: 0.412, lr: 0.019175639267874092\n",
|
||||
"epoch: 4400, acc: 0.807, loss: 0.413, lr: 0.019157271621375684\n",
|
||||
"epoch: 4500, acc: 0.793, loss: 0.411, lr: 0.0191389391286041\n",
|
||||
"epoch: 4600, acc: 0.810, loss: 0.413, lr: 0.019120641688735073\n",
|
||||
"epoch: 4700, acc: 0.810, loss: 0.409, lr: 0.019102379201329525\n",
|
||||
"epoch: 4800, acc: 0.790, loss: 0.408, lr: 0.01908415156633174\n",
|
||||
"epoch: 4900, acc: 0.810, loss: 0.408, lr: 0.01906595868406753\n",
|
||||
"epoch: 5000, acc: 0.800, loss: 0.406, lr: 0.01904780045524243\n",
|
||||
"epoch: 5100, acc: 0.793, loss: 0.407, lr: 0.019029676780939874\n",
|
||||
"epoch: 5200, acc: 0.787, loss: 0.405, lr: 0.019011587562619416\n",
|
||||
"epoch: 5300, acc: 0.793, loss: 0.405, lr: 0.01899353270211493\n",
|
||||
"epoch: 5400, acc: 0.797, loss: 0.404, lr: 0.018975512101632844\n",
|
||||
"epoch: 5500, acc: 0.810, loss: 0.405, lr: 0.018957525663750367\n",
|
||||
"epoch: 5600, acc: 0.793, loss: 0.403, lr: 0.018939573291413745\n",
|
||||
"epoch: 5700, acc: 0.790, loss: 0.403, lr: 0.018921654887936498\n",
|
||||
"epoch: 5800, acc: 0.790, loss: 0.402, lr: 0.018903770356997706\n",
|
||||
"epoch: 5900, acc: 0.793, loss: 0.401, lr: 0.018885919602640248\n",
|
||||
"epoch: 6000, acc: 0.813, loss: 0.404, lr: 0.018868102529269144\n",
|
||||
"epoch: 6100, acc: 0.807, loss: 0.401, lr: 0.018850319041649778\n",
|
||||
"epoch: 6200, acc: 0.807, loss: 0.402, lr: 0.018832569044906263\n",
|
||||
"epoch: 6300, acc: 0.810, loss: 0.401, lr: 0.018814852444519702\n",
|
||||
"epoch: 6400, acc: 0.810, loss: 0.399, lr: 0.018797169146326564\n",
|
||||
"epoch: 6500, acc: 0.793, loss: 0.401, lr: 0.01877951905651696\n",
|
||||
"epoch: 6600, acc: 0.797, loss: 0.399, lr: 0.018761902081633034\n",
|
||||
"epoch: 6700, acc: 0.807, loss: 0.398, lr: 0.018744318128567278\n",
|
||||
"epoch: 6800, acc: 0.790, loss: 0.399, lr: 0.018726767104560903\n",
|
||||
"epoch: 6900, acc: 0.810, loss: 0.398, lr: 0.018709248917202218\n",
|
||||
"epoch: 7000, acc: 0.807, loss: 0.398, lr: 0.018691763474424996\n",
|
||||
"epoch: 7100, acc: 0.807, loss: 0.399, lr: 0.018674310684506857\n",
|
||||
"epoch: 7200, acc: 0.790, loss: 0.398, lr: 0.01865689045606769\n",
|
||||
"epoch: 7300, acc: 0.790, loss: 0.398, lr: 0.01863950269806802\n",
|
||||
"epoch: 7400, acc: 0.803, loss: 0.396, lr: 0.018622147319807447\n",
|
||||
"epoch: 7500, acc: 0.790, loss: 0.399, lr: 0.018604824230923075\n",
|
||||
"epoch: 7600, acc: 0.793, loss: 0.398, lr: 0.01858753334138793\n",
|
||||
"epoch: 7700, acc: 0.810, loss: 0.396, lr: 0.018570274561509396\n",
|
||||
"epoch: 7800, acc: 0.810, loss: 0.395, lr: 0.018553047801927663\n",
|
||||
"epoch: 7900, acc: 0.803, loss: 0.395, lr: 0.018535852973614212\n",
|
||||
"epoch: 8000, acc: 0.790, loss: 0.395, lr: 0.01851868998787026\n",
|
||||
"epoch: 8100, acc: 0.813, loss: 0.395, lr: 0.018501558756325222\n",
|
||||
"epoch: 8200, acc: 0.790, loss: 0.395, lr: 0.01848445919093522\n",
|
||||
"epoch: 8300, acc: 0.793, loss: 0.395, lr: 0.018467391203981567\n",
|
||||
"epoch: 8400, acc: 0.793, loss: 0.395, lr: 0.018450354708069265\n",
|
||||
"epoch: 8500, acc: 0.813, loss: 0.394, lr: 0.018433349616125496\n",
|
||||
"epoch: 8600, acc: 0.813, loss: 0.395, lr: 0.018416375841398172\n",
|
||||
"epoch: 8700, acc: 0.793, loss: 0.394, lr: 0.01839943329745444\n",
|
||||
"epoch: 8800, acc: 0.783, loss: 0.398, lr: 0.01838252189817921\n",
|
||||
"epoch: 8900, acc: 0.793, loss: 0.393, lr: 0.018365641557773718\n",
|
||||
"epoch: 9000, acc: 0.797, loss: 0.393, lr: 0.018348792190754044\n",
|
||||
"epoch: 9100, acc: 0.813, loss: 0.394, lr: 0.0183319737119497\n",
|
||||
"epoch: 9200, acc: 0.813, loss: 0.393, lr: 0.018315186036502167\n",
|
||||
"epoch: 9300, acc: 0.810, loss: 0.393, lr: 0.018298429079863496\n",
|
||||
"epoch: 9400, acc: 0.793, loss: 0.395, lr: 0.018281702757794862\n",
|
||||
"epoch: 9500, acc: 0.800, loss: 0.392, lr: 0.018265006986365174\n",
|
||||
"epoch: 9600, acc: 0.797, loss: 0.393, lr: 0.018248341681949654\n",
|
||||
"epoch: 9700, acc: 0.807, loss: 0.392, lr: 0.018231706761228456\n",
|
||||
"epoch: 9800, acc: 0.817, loss: 0.393, lr: 0.018215102141185255\n",
|
||||
"epoch: 9900, acc: 0.817, loss: 0.395, lr: 0.018198527739105907\n",
|
||||
"epoch: 10000, acc: 0.790, loss: 0.392, lr: 0.018181983472577025\n"
|
||||
"epoch: 0, acc: 0.383, loss: 1.099, lr: 0.02\n",
|
||||
"epoch: 100, acc: 0.517, loss: 0.941, lr: 0.01998021958261321\n",
|
||||
"epoch: 200, acc: 0.707, loss: 0.705, lr: 0.019960279044701046\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch: 300, acc: 0.723, loss: 0.591, lr: 0.019940378268975763\n",
|
||||
"epoch: 400, acc: 0.750, loss: 0.535, lr: 0.01992051713662487\n",
|
||||
"epoch: 500, acc: 0.777, loss: 0.488, lr: 0.01990069552930875\n",
|
||||
"epoch: 600, acc: 0.787, loss: 0.460, lr: 0.019880913329158343\n",
|
||||
"epoch: 700, acc: 0.810, loss: 0.441, lr: 0.019861170418772778\n",
|
||||
"epoch: 800, acc: 0.810, loss: 0.425, lr: 0.019841466681217078\n",
|
||||
"epoch: 900, acc: 0.827, loss: 0.403, lr: 0.01982180200001982\n",
|
||||
"epoch: 1000, acc: 0.823, loss: 0.390, lr: 0.019802176259170884\n",
|
||||
"epoch: 1100, acc: 0.827, loss: 0.383, lr: 0.01978258934311912\n",
|
||||
"epoch: 1200, acc: 0.830, loss: 0.377, lr: 0.01976304113677013\n",
|
||||
"epoch: 1300, acc: 0.837, loss: 0.375, lr: 0.019743531525483964\n",
|
||||
"epoch: 1400, acc: 0.833, loss: 0.369, lr: 0.01972406039507293\n",
|
||||
"epoch: 1500, acc: 0.830, loss: 0.366, lr: 0.019704627631799327\n",
|
||||
"epoch: 1600, acc: 0.833, loss: 0.362, lr: 0.019685233122373254\n",
|
||||
"epoch: 1700, acc: 0.827, loss: 0.359, lr: 0.019665876753950384\n",
|
||||
"epoch: 1800, acc: 0.837, loss: 0.357, lr: 0.01964655841412981\n",
|
||||
"epoch: 1900, acc: 0.833, loss: 0.355, lr: 0.019627277990951823\n",
|
||||
"epoch: 2000, acc: 0.843, loss: 0.353, lr: 0.019608035372895814\n",
|
||||
"epoch: 2100, acc: 0.833, loss: 0.352, lr: 0.01958883044887805\n",
|
||||
"epoch: 2200, acc: 0.843, loss: 0.350, lr: 0.019569663108249594\n",
|
||||
"epoch: 2300, acc: 0.840, loss: 0.349, lr: 0.01955053324079414\n",
|
||||
"epoch: 2400, acc: 0.837, loss: 0.347, lr: 0.019531440736725945\n",
|
||||
"epoch: 2500, acc: 0.837, loss: 0.346, lr: 0.019512385486687673\n",
|
||||
"epoch: 2600, acc: 0.847, loss: 0.344, lr: 0.019493367381748363\n",
|
||||
"epoch: 2700, acc: 0.837, loss: 0.343, lr: 0.019474386313401298\n",
|
||||
"epoch: 2800, acc: 0.833, loss: 0.343, lr: 0.019455442173562\n",
|
||||
"epoch: 2900, acc: 0.837, loss: 0.341, lr: 0.019436534854566128\n",
|
||||
"epoch: 3000, acc: 0.843, loss: 0.339, lr: 0.01941766424916747\n",
|
||||
"epoch: 3100, acc: 0.843, loss: 0.338, lr: 0.019398830250535893\n",
|
||||
"epoch: 3200, acc: 0.843, loss: 0.337, lr: 0.019380032752255354\n",
|
||||
"epoch: 3300, acc: 0.840, loss: 0.336, lr: 0.01936127164832186\n",
|
||||
"epoch: 3400, acc: 0.847, loss: 0.335, lr: 0.01934254683314152\n",
|
||||
"epoch: 3500, acc: 0.853, loss: 0.336, lr: 0.019323858201528515\n",
|
||||
"epoch: 3600, acc: 0.850, loss: 0.334, lr: 0.019305205648703173\n",
|
||||
"epoch: 3700, acc: 0.847, loss: 0.332, lr: 0.01928658907028997\n",
|
||||
"epoch: 3800, acc: 0.847, loss: 0.331, lr: 0.01926800836231563\n",
|
||||
"epoch: 3900, acc: 0.850, loss: 0.330, lr: 0.019249463421207133\n",
|
||||
"epoch: 4000, acc: 0.847, loss: 0.329, lr: 0.019230954143789846\n",
|
||||
"epoch: 4100, acc: 0.843, loss: 0.329, lr: 0.019212480427285565\n",
|
||||
"epoch: 4200, acc: 0.850, loss: 0.327, lr: 0.019194042169310647\n",
|
||||
"epoch: 4300, acc: 0.847, loss: 0.326, lr: 0.019175639267874092\n",
|
||||
"epoch: 4400, acc: 0.843, loss: 0.327, lr: 0.019157271621375684\n",
|
||||
"epoch: 4500, acc: 0.850, loss: 0.325, lr: 0.0191389391286041\n",
|
||||
"epoch: 4600, acc: 0.850, loss: 0.325, lr: 0.019120641688735073\n",
|
||||
"epoch: 4700, acc: 0.847, loss: 0.324, lr: 0.019102379201329525\n",
|
||||
"epoch: 4800, acc: 0.847, loss: 0.324, lr: 0.01908415156633174\n",
|
||||
"epoch: 4900, acc: 0.837, loss: 0.325, lr: 0.01906595868406753\n",
|
||||
"epoch: 5000, acc: 0.847, loss: 0.321, lr: 0.01904780045524243\n",
|
||||
"epoch: 5100, acc: 0.847, loss: 0.322, lr: 0.019029676780939874\n",
|
||||
"epoch: 5200, acc: 0.847, loss: 0.320, lr: 0.019011587562619416\n",
|
||||
"epoch: 5300, acc: 0.850, loss: 0.320, lr: 0.01899353270211493\n",
|
||||
"epoch: 5400, acc: 0.847, loss: 0.319, lr: 0.018975512101632844\n",
|
||||
"epoch: 5500, acc: 0.843, loss: 0.318, lr: 0.018957525663750367\n",
|
||||
"epoch: 5600, acc: 0.847, loss: 0.317, lr: 0.018939573291413745\n",
|
||||
"epoch: 5700, acc: 0.840, loss: 0.318, lr: 0.018921654887936498\n",
|
||||
"epoch: 5800, acc: 0.847, loss: 0.316, lr: 0.018903770356997706\n",
|
||||
"epoch: 5900, acc: 0.847, loss: 0.316, lr: 0.018885919602640248\n",
|
||||
"epoch: 6000, acc: 0.843, loss: 0.315, lr: 0.018868102529269144\n",
|
||||
"epoch: 6100, acc: 0.850, loss: 0.315, lr: 0.018850319041649778\n",
|
||||
"epoch: 6200, acc: 0.850, loss: 0.315, lr: 0.018832569044906263\n",
|
||||
"epoch: 6300, acc: 0.843, loss: 0.314, lr: 0.018814852444519702\n",
|
||||
"epoch: 6400, acc: 0.847, loss: 0.315, lr: 0.018797169146326564\n",
|
||||
"epoch: 6500, acc: 0.847, loss: 0.313, lr: 0.01877951905651696\n",
|
||||
"epoch: 6600, acc: 0.847, loss: 0.313, lr: 0.018761902081633034\n",
|
||||
"epoch: 6700, acc: 0.847, loss: 0.312, lr: 0.018744318128567278\n",
|
||||
"epoch: 6800, acc: 0.853, loss: 0.314, lr: 0.018726767104560903\n",
|
||||
"epoch: 6900, acc: 0.850, loss: 0.323, lr: 0.018709248917202218\n",
|
||||
"epoch: 7000, acc: 0.850, loss: 0.312, lr: 0.018691763474424996\n",
|
||||
"epoch: 7100, acc: 0.847, loss: 0.311, lr: 0.018674310684506857\n",
|
||||
"epoch: 7200, acc: 0.853, loss: 0.312, lr: 0.01865689045606769\n",
|
||||
"epoch: 7300, acc: 0.850, loss: 0.310, lr: 0.01863950269806802\n",
|
||||
"epoch: 7400, acc: 0.850, loss: 0.311, lr: 0.018622147319807447\n",
|
||||
"epoch: 7500, acc: 0.853, loss: 0.310, lr: 0.018604824230923075\n",
|
||||
"epoch: 7600, acc: 0.847, loss: 0.307, lr: 0.01858753334138793\n",
|
||||
"epoch: 7700, acc: 0.847, loss: 0.308, lr: 0.018570274561509396\n",
|
||||
"epoch: 7800, acc: 0.847, loss: 0.307, lr: 0.018553047801927663\n",
|
||||
"epoch: 7900, acc: 0.847, loss: 0.306, lr: 0.018535852973614212\n",
|
||||
"epoch: 8000, acc: 0.850, loss: 0.305, lr: 0.01851868998787026\n",
|
||||
"epoch: 8100, acc: 0.853, loss: 0.305, lr: 0.018501558756325222\n",
|
||||
"epoch: 8200, acc: 0.847, loss: 0.305, lr: 0.01848445919093522\n",
|
||||
"epoch: 8300, acc: 0.843, loss: 0.306, lr: 0.018467391203981567\n",
|
||||
"epoch: 8400, acc: 0.857, loss: 0.304, lr: 0.018450354708069265\n",
|
||||
"epoch: 8500, acc: 0.850, loss: 0.304, lr: 0.018433349616125496\n",
|
||||
"epoch: 8600, acc: 0.853, loss: 0.303, lr: 0.018416375841398172\n",
|
||||
"epoch: 8700, acc: 0.843, loss: 0.304, lr: 0.01839943329745444\n",
|
||||
"epoch: 8800, acc: 0.843, loss: 0.303, lr: 0.01838252189817921\n",
|
||||
"epoch: 8900, acc: 0.850, loss: 0.302, lr: 0.018365641557773718\n",
|
||||
"epoch: 9000, acc: 0.850, loss: 0.303, lr: 0.018348792190754044\n",
|
||||
"epoch: 9100, acc: 0.857, loss: 0.302, lr: 0.0183319737119497\n",
|
||||
"epoch: 9200, acc: 0.843, loss: 0.302, lr: 0.018315186036502167\n",
|
||||
"epoch: 9300, acc: 0.847, loss: 0.303, lr: 0.018298429079863496\n",
|
||||
"epoch: 9400, acc: 0.857, loss: 0.301, lr: 0.018281702757794862\n",
|
||||
"epoch: 9500, acc: 0.853, loss: 0.290, lr: 0.018265006986365174\n",
|
||||
"epoch: 9600, acc: 0.867, loss: 0.280, lr: 0.018248341681949654\n",
|
||||
"epoch: 9700, acc: 0.857, loss: 0.275, lr: 0.018231706761228456\n",
|
||||
"epoch: 9800, acc: 0.863, loss: 0.272, lr: 0.018215102141185255\n",
|
||||
"epoch: 9900, acc: 0.873, loss: 0.269, lr: 0.018198527739105907\n",
|
||||
"epoch: 10000, acc: 0.870, loss: 0.268, lr: 0.018181983472577025\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
461
lecture28_31/notes_28.ipynb
Normal file
@@ -0,0 +1,461 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Previous Class Definitions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# imports\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import nnfs\n",
|
||||
"from nnfs.datasets import spiral_data, vertical_data\n",
|
||||
"nnfs.init()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Layer_Dense:\n",
|
||||
" def __init__(self, n_inputs, n_neurons):\n",
|
||||
" # Initialize the weights and biases\n",
|
||||
" self.weights = 0.01 * np.random.randn(n_inputs, n_neurons) # Normal distribution of weights\n",
|
||||
" self.biases = np.zeros((1, n_neurons))\n",
|
||||
"\n",
|
||||
" def forward(self, inputs):\n",
|
||||
" # Calculate the output values from inputs, weights, and biases\n",
|
||||
" self.inputs = inputs\n",
|
||||
" self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed\n",
|
||||
" \n",
|
||||
" def backward(self, dvalues):\n",
|
||||
" '''Calculated the gradient of the loss with respect to the weights and biases of this layer.\n",
|
||||
" dvalues is equiavelent to a transposed dl_dZ. It is the gradient \n",
|
||||
" of the loss with respect to the outputs of this layer.'''\n",
|
||||
" self.dweights = np.dot(self.inputs.T, dvalues)\n",
|
||||
" self.dbiases = np.sum(dvalues, axis=0, keepdims=0)\n",
|
||||
" self.dinputs = np.dot(dvalues, self.weights.T)\n",
|
||||
"\n",
|
||||
"class Activation_ReLU:\n",
|
||||
" def forward(self, inputs):\n",
|
||||
" self.inputs = inputs\n",
|
||||
" self.output = np.maximum(0, inputs)\n",
|
||||
" \n",
|
||||
" def backward(self, dvalues):\n",
|
||||
" '''Calculated the gradient of the loss with respect to this layer's activation function\n",
|
||||
" dvalues is equiavelent to a transposed dl_dZ. It is the gradient \n",
|
||||
" of the loss with respect to the outputs of this layer.'''\n",
|
||||
" self.dinputs = dvalues.copy()\n",
|
||||
" self.dinputs[self.inputs <= 0] = 0\n",
|
||||
" \n",
|
||||
"class Activation_Softmax:\n",
|
||||
" def forward(self, inputs):\n",
|
||||
" # Get the unnormalized probabilities\n",
|
||||
" # Subtract max from the row to prevent larger numbers\n",
|
||||
" exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))\n",
|
||||
"\n",
|
||||
" # Normalize the probabilities with element wise division\n",
|
||||
" probabilities = exp_values / np.sum(exp_values, axis=1,keepdims=True)\n",
|
||||
" self.output = probabilities\n",
|
||||
" \n",
|
||||
"# Base class for Loss functions\n",
|
||||
"class Loss:\n",
|
||||
" '''Calculates the data and regularization losses given\n",
|
||||
" model output and ground truth values'''\n",
|
||||
" def calculate(self, output, y):\n",
|
||||
" sample_losses = self.forward(output, y)\n",
|
||||
" data_loss = np.average(sample_losses)\n",
|
||||
" return data_loss\n",
|
||||
"\n",
|
||||
"class Loss_CategoricalCrossEntropy(Loss):\n",
|
||||
" def forward(self, y_pred, y_true):\n",
|
||||
" '''y_pred is the neural network output\n",
|
||||
" y_true is the ideal output of the neural network'''\n",
|
||||
" samples = len(y_pred)\n",
|
||||
" # Bound the predicted values \n",
|
||||
" y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)\n",
|
||||
" \n",
|
||||
" if len(y_true.shape) == 1: # Categorically labeled\n",
|
||||
" correct_confidences = y_pred_clipped[range(samples), y_true]\n",
|
||||
" elif len(y_true.shape) == 2: # One hot encoded\n",
|
||||
" correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)\n",
|
||||
"\n",
|
||||
" # Calculate the losses\n",
|
||||
" negative_log_likelihoods = -np.log(correct_confidences)\n",
|
||||
" return negative_log_likelihoods\n",
|
||||
" \n",
|
||||
" def backward(self, dvalues, y_true):\n",
|
||||
" samples = len(dvalues)\n",
|
||||
"\n",
|
||||
" # Number of lables in each sample\n",
|
||||
" labels = len(dvalues[0])\n",
|
||||
"\n",
|
||||
" # if the labels are sparse, turn them into a one-hot vector\n",
|
||||
" if len(y_true.shape) == 1:\n",
|
||||
" y_true = np.eye(labels)[y_true]\n",
|
||||
"\n",
|
||||
" # Calculate the gradient then normalize\n",
|
||||
" self.dinputs = -y_true / dvalues\n",
|
||||
" self.dinputs = self.dinputs / samples\n",
|
||||
"\n",
|
||||
"class Activation_Softmax_Loss_CategoricalCrossentropy():\n",
|
||||
" def __init__(self):\n",
|
||||
" self.activation = Activation_Softmax()\n",
|
||||
" self.loss = Loss_CategoricalCrossEntropy()\n",
|
||||
"\n",
|
||||
" def forward(self, inputs, y_true):\n",
|
||||
" self.activation.forward(inputs)\n",
|
||||
" self.output = self.activation.output\n",
|
||||
" return self.loss.calculate(self.output, y_true)\n",
|
||||
" \n",
|
||||
" def backward(self, dvalues, y_true):\n",
|
||||
" samples = len(dvalues)\n",
|
||||
"\n",
|
||||
" # if the samples are one-hot encoded, turn them into discrete values\n",
|
||||
" if len(y_true.shape) == 2:\n",
|
||||
" y_true = np.argmax(y_true, axis=1)\n",
|
||||
" \n",
|
||||
" # Copy so we can safely modify\n",
|
||||
" self.dinputs = dvalues.copy()\n",
|
||||
" \n",
|
||||
" # Calculate and normalize gradient \n",
|
||||
" self.dinputs[range(samples), y_true] -= 1\n",
|
||||
" self.dinputs = self.dinputs / samples\n",
|
||||
"\n",
|
||||
"class Optimizer_SGD():\n",
|
||||
" def __init__(self, learning_rate=0.5, decay=0.0, momentum=0.0):\n",
|
||||
" self.initial_rate = learning_rate\n",
|
||||
" self.current_learning_rate = self.initial_rate\n",
|
||||
" self.decay = decay\n",
|
||||
" self.iterations = 0\n",
|
||||
" self.momentum = momentum\n",
|
||||
"\n",
|
||||
" def pre_update_params(self):\n",
|
||||
" # Update the current_learning_rate before updating params\n",
|
||||
" if self.decay:\n",
|
||||
" self.current_learning_rate = self.initial_rate / (1 + self.decay * self.iterations)\n",
|
||||
"\n",
|
||||
" def update_params(self, layer):\n",
|
||||
" if self.momentum:\n",
|
||||
" # For each layer, we need to use its last momentums\n",
|
||||
"\n",
|
||||
" # First check if the layer has a last momentum stored\n",
|
||||
" if not hasattr(layer, 'weight_momentums'):\n",
|
||||
" layer.weight_momentums = np.zeros_like(layer.weights)\n",
|
||||
" layer.bias_momentums = np.zeros_like(layer.biases)\n",
|
||||
" \n",
|
||||
" weight_updates = self.momentum * layer.weight_momentums - \\\n",
|
||||
" self.current_learning_rate * layer.dweights\n",
|
||||
" layer.weight_momentums = weight_updates\n",
|
||||
"\n",
|
||||
" bias_updates = self.momentum * layer.bias_momentums - \\\n",
|
||||
" self.current_learning_rate * layer.dbiases\n",
|
||||
" layer.bias_momentums = bias_updates\n",
|
||||
" \n",
|
||||
" # Not using momentum\n",
|
||||
" else:\n",
|
||||
" weight_updates = -self.current_learning_rate * layer.dweights\n",
|
||||
" bias_updates = -self.current_learning_rate * layer.dbiases\n",
|
||||
"\n",
|
||||
" layer.weights += weight_updates\n",
|
||||
" layer.biases += bias_updates\n",
|
||||
"\n",
|
||||
" def post_update_params(self):\n",
|
||||
" # Update the self.iterations for use with decay\n",
|
||||
" self.iterations += 1\n",
|
||||
"\n",
|
||||
"class Optimizer_Adagrad():\n",
|
||||
" def __init__(self, learning_rate=0.5, decay=0.0, epsilon=1e-7):\n",
|
||||
" self.initial_learning_rate = learning_rate\n",
|
||||
" self.current_learning_rate = self.initial_learning_rate\n",
|
||||
" self.decay = decay\n",
|
||||
" self.iterations = 0\n",
|
||||
" self.epsilon = epsilon\n",
|
||||
"\n",
|
||||
" def pre_update_params(self):\n",
|
||||
" if self.decay:\n",
|
||||
" self.current_learning_rate = self.initial_learning_rate / (1 + self.decay * self.iterations)\n",
|
||||
"\n",
|
||||
" def update_params(self, layer):\n",
|
||||
" if not hasattr(layer, 'weight_cache'):\n",
|
||||
" layer.weight_cache = np.zeros_like(layer.weights)\n",
|
||||
" layer.bias_cache = np.zeros_like(layer.biases)\n",
|
||||
"\n",
|
||||
" layer.weight_cache += layer.dweights**2\n",
|
||||
" layer.bias_cache += layer.dbiases**2\n",
|
||||
"\n",
|
||||
" layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)\n",
|
||||
" layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)\n",
|
||||
"\n",
|
||||
" def post_update_params(self):\n",
|
||||
" self.iterations += 1\n",
|
||||
"\n",
|
||||
"class Optimizer_RMSProp():\n",
|
||||
" def __init__(self, learning_rate=1e-3, decay=0.0, epsilon=1e-7, rho=0.9):\n",
|
||||
" self.initial_learning_rate = learning_rate\n",
|
||||
" self.current_learning_rate = self.initial_learning_rate\n",
|
||||
" self.decay = decay\n",
|
||||
" self.iterations = 0\n",
|
||||
" self.epsilon = epsilon\n",
|
||||
" self.rho = rho\n",
|
||||
"\n",
|
||||
" def pre_update_params(self):\n",
|
||||
" if self.decay:\n",
|
||||
" self.current_learning_rate = self.initial_learning_rate / (1 + self.decay * self.iterations)\n",
|
||||
"\n",
|
||||
" def update_params(self, layer):\n",
|
||||
" if not hasattr(layer, 'weight_cache'):\n",
|
||||
" layer.weight_cache = np.zeros_like(layer.weights)\n",
|
||||
" layer.bias_cache = np.zeros_like(layer.biases)\n",
|
||||
"\n",
|
||||
" layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2\n",
|
||||
" layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2\n",
|
||||
"\n",
|
||||
" layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)\n",
|
||||
" layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)\n",
|
||||
"\n",
|
||||
" def post_update_params(self):\n",
|
||||
" self.iterations += 1\n",
|
||||
"\n",
|
||||
"# Adam optimizer\n",
|
||||
"class Optimizer_Adam():\n",
|
||||
" def __init__(self, learning_rate=0.001, decay=0.0, epsilon=1e-7, beta_1=0.9, beta_2=0.999):\n",
|
||||
" self.initial_learning_rate = learning_rate\n",
|
||||
" self.current_learning_rate = learning_rate\n",
|
||||
" self.decay = decay\n",
|
||||
" self.iterations = 0\n",
|
||||
" self.epsilon = epsilon\n",
|
||||
" self.beta_1 = beta_1\n",
|
||||
" self.beta_2 = beta_2\n",
|
||||
"\n",
|
||||
" def pre_update_params(self):\n",
|
||||
" if self.decay:\n",
|
||||
" self.current_learning_rate = self.initial_learning_rate * (1. / (1. + self.decay * self.iterations))\n",
|
||||
"\n",
|
||||
" def update_params(self, layer):\n",
|
||||
" # If layer does not contain cache arrays, create them filled with zeros\n",
|
||||
" if not hasattr(layer, 'weight_cache'):\n",
|
||||
" layer.weight_momentums = np.zeros_like(layer.weights)\n",
|
||||
" layer.weight_cache = np.zeros_like(layer.weights)\n",
|
||||
" layer.bias_momentums = np.zeros_like(layer.biases)\n",
|
||||
" layer.bias_cache = np.zeros_like(layer.biases)\n",
|
||||
"\n",
|
||||
" # Update momentum with current gradients\n",
|
||||
" layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights\n",
|
||||
" layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases\n",
|
||||
"\n",
|
||||
" # Get corrected momentum\n",
|
||||
" # use self.iteration + 1 because we start at iteration 0\n",
|
||||
" weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))\n",
|
||||
" bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))\n",
|
||||
"\n",
|
||||
" # Update cache with squared current gradients\n",
|
||||
" layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2\n",
|
||||
" layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2\n",
|
||||
"\n",
|
||||
" # Get corrected cache\n",
|
||||
" weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))\n",
|
||||
" bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))\n",
|
||||
"\n",
|
||||
" # Vanilla SGD parameter update + normalization with square rooted cache\n",
|
||||
" layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)\n",
|
||||
" layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)\n",
|
||||
"\n",
|
||||
" # Call once after any parameter updates\n",
|
||||
" def post_update_params(self):\n",
|
||||
" self.iterations += 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Generalization and Overfitting\n",
|
||||
"Overfitting can occur when the neural network tries to fit every training data perfectly. If the training data was perfect, this would not be an issue. However, because some training data is bad or should not be expected to be identified perfectly, the neural network can sacrifice generability to trying to identify all training data.\n",
|
||||
"\n",
|
||||
"If we could assign uncertainty to training data, I believe this would help.\n",
|
||||
"\n",
|
||||
"## Out of Sample Data\n",
|
||||
"Rather than use all of our data for training, we can set aside some for out of sample testing so we can better understand how well the network generalizes."
|
||||
]
|
||||
},
|
||||
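{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's sketch (not part of the original lecture):* when we cannot simply generate a fresh dataset, one common way to set data aside is to shuffle and split the data we already have. The names `X_all`, `y_all`, `train_fraction`, `X_holdout`, and `y_holdout` are illustrative; this reuses the `numpy` and `spiral_data` imports from above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_all, y_all = spiral_data(samples=100, classes=3)  # stand-in for a fixed dataset\n",
"\n",
"train_fraction = 0.8                   # illustrative split\n",
"indices = np.arange(len(X_all))\n",
"np.random.shuffle(indices)             # shuffle so each split contains all classes\n",
"split = int(train_fraction * len(X_all))\n",
"\n",
"X_train, y_train = X_all[indices[:split]], y_all[indices[:split]]\n",
"X_holdout, y_holdout = X_all[indices[split:]], y_all[indices[split:]]  # never used for training\n",
"print(X_train.shape, X_holdout.shape)"
]
},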
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run the Adam Optimizer with the 100 Samples of Training Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"epoch: 10000, acc: 0.883, loss: 0.230, lr: 0.01818181818181818\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Create dataset\n",
|
||||
"X, y = spiral_data(samples=100, classes=3)\n",
|
||||
"\n",
|
||||
"# Create Dense layer with 2 input features and 64 output values\n",
|
||||
"dense1 = Layer_Dense(2, 64)\n",
|
||||
"\n",
|
||||
"# Create ReLU activation (to be used with Dense layer)\n",
|
||||
"activation1 = Activation_ReLU()\n",
|
||||
"\n",
|
||||
"# Create second Dense layer with 64 input features (as we take output\n",
|
||||
"# of previous layer here) and 3 output values (output values)\n",
|
||||
"dense2 = Layer_Dense(64, 3)\n",
|
||||
"\n",
|
||||
"# Create Softmax classifier's combined loss and activation\n",
|
||||
"loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()\n",
|
||||
"\n",
|
||||
"# Create optimizer\n",
|
||||
"optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)\n",
|
||||
"\n",
|
||||
"# Train in loop\n",
|
||||
"for epoch in range(10001):\n",
|
||||
" # Perform a forward pass of our training data through this layer\n",
|
||||
" dense1.forward(X)\n",
|
||||
" \n",
|
||||
" # Perform a forward pass through activation function\n",
|
||||
" # takes the output of first dense layer here\n",
|
||||
" activation1.forward(dense1.output)\n",
|
||||
" \n",
|
||||
" # Perform a forward pass through second Dense layer\n",
|
||||
" # takes outputs of activation function of first layer as inputs\n",
|
||||
" dense2.forward(activation1.output)\n",
|
||||
" \n",
|
||||
" # Perform a forward pass through the activation/loss function\n",
|
||||
" # takes the output of second dense layer here and returns loss\n",
|
||||
" loss = loss_activation.forward(dense2.output, y)\n",
|
||||
" \n",
|
||||
" # Calculate accuracy from output of activation2 and targets\n",
|
||||
" # calculate values along first axis\n",
|
||||
" predictions = np.argmax(loss_activation.output, axis=1)\n",
|
||||
" if len(y.shape) == 2:\n",
|
||||
" y = np.argmax(y, axis=1)\n",
|
||||
" accuracy = np.mean(predictions == y)\n",
|
||||
" \n",
|
||||
" # Backward pass\n",
|
||||
" loss_activation.backward(loss_activation.output, y)\n",
|
||||
" dense2.backward(loss_activation.dinputs)\n",
|
||||
" activation1.backward(dense2.dinputs)\n",
|
||||
" dense1.backward(activation1.dinputs)\n",
|
||||
" \n",
|
||||
" # Update weights and biases\n",
|
||||
" optimizer.pre_update_params()\n",
|
||||
" optimizer.update_params(dense1)\n",
|
||||
" optimizer.update_params(dense2)\n",
|
||||
" optimizer.post_update_params()\n",
|
||||
"\n",
|
||||
"print(f'epoch: {epoch}, ' +\n",
|
||||
" f'acc: {accuracy:.3f}, ' +\n",
|
||||
" f'loss: {loss:.3f}, ' +\n",
|
||||
" f'lr: {optimizer.current_learning_rate}')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Now Use Different Found Biases and Weights on Out of Sample Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"validation, acc: 0.797, loss: 0.672\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Create test dataset\n",
|
||||
"X_test, y_test = spiral_data(samples=100, classes=3)\n",
|
||||
"# Perform a forward pass of our testing data through this layer\n",
|
||||
"dense1.forward(X_test)\n",
|
||||
"# Perform a forward pass through activation function\n",
|
||||
"# takes the output of first dense layer here\n",
|
||||
"activation1.forward(dense1.output)\n",
|
||||
"# Perform a forward pass through second Dense layer\n",
|
||||
"# takes outputs of activation function of first layer as inputs\n",
|
||||
"dense2.forward(activation1.output)\n",
|
||||
"# Perform a forward pass through the activation/loss function\n",
|
||||
"# takes the output of second dense layer here and returns loss\n",
|
||||
"loss = loss_activation.forward(dense2.output, y_test)\n",
|
||||
"# Calculate accuracy from output of activation2 and targets\n",
|
||||
"# calculate values along first axis\n",
|
||||
"predictions = np.argmax(loss_activation.output, axis=1)\n",
|
||||
"if len(y_test.shape) == 2:\n",
|
||||
" y_test = np.argmax(y_test, axis=1)\n",
|
||||
"accuracy = np.mean(predictions == y_test)\n",
|
||||
"print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Observations\n",
|
||||
"The out of sample accuracy is about 0.1% lower than the training data, with a loss almost 3x the training data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Preventing Overfitting\n",
|
||||
"## Reducing Network Complexity\n",
|
||||
"Simpler models are more robust against overfitting and can provide more generalizability. This can be reducing the number of neurons in a layer or the total layers. Effectively, you reduce the granularity of functions that the network can model.\n",
|
||||
"\n",
|
||||
"## Reduce the Number of Epochs\n",
|
||||
"By allowing less training iterations to occur, the network isn't given the time or opportunity to fit data points that might not be valid.\n",
|
||||
"\n",
|
||||
"These \"hyper-parameters\" can be adjusted after testing with out of sample data."
|
||||
]
|
||||
}
|
||||
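,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's sketch (not part of the original lecture):* one way to try the ideas above is to retrain with a smaller hidden layer and fewer epochs, then evaluate on the same out-of-sample data. The width (16), the epoch count (2000), and the `small_*` names are arbitrary illustrative choices; the classes and the `X`, `y`, `X_test`, `y_test` arrays come from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"small_dense1 = Layer_Dense(2, 16)   # fewer neurons than the 64 used above\n",
"small_relu = Activation_ReLU()\n",
"small_dense2 = Layer_Dense(16, 3)\n",
"small_loss_act = Activation_Softmax_Loss_CategoricalCrossentropy()\n",
"small_optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)\n",
"\n",
"for epoch in range(2001):           # fewer epochs than the 10001 used above\n",
"    # Forward pass through the smaller network\n",
"    small_dense1.forward(X)\n",
"    small_relu.forward(small_dense1.output)\n",
"    small_dense2.forward(small_relu.output)\n",
"    small_loss_act.forward(small_dense2.output, y)\n",
"\n",
"    # Backward pass\n",
"    small_loss_act.backward(small_loss_act.output, y)\n",
"    small_dense2.backward(small_loss_act.dinputs)\n",
"    small_relu.backward(small_dense2.dinputs)\n",
"    small_dense1.backward(small_relu.dinputs)\n",
"\n",
"    # Parameter update\n",
"    small_optimizer.pre_update_params()\n",
"    small_optimizer.update_params(small_dense1)\n",
"    small_optimizer.update_params(small_dense2)\n",
"    small_optimizer.post_update_params()\n",
"\n",
"# Evaluate the smaller model on the out-of-sample data from above\n",
"small_dense1.forward(X_test)\n",
"small_relu.forward(small_dense1.output)\n",
"small_dense2.forward(small_relu.output)\n",
"val_loss = small_loss_act.forward(small_dense2.output, y_test)\n",
"val_acc = np.mean(np.argmax(small_loss_act.output, axis=1) == y_test)\n",
"print(f'smaller model validation, acc: {val_acc:.3f}, loss: {val_loss:.3f}')"
]
}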
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
BIN
lecture28_31/notes_28.pdf
Normal file
Binary file not shown.
375
lecture28_31/notes_28.py
Normal file
@@ -0,0 +1,375 @@
|
||||
# %% [markdown]
|
||||
# # Previous Class Definitions
|
||||
|
||||
# %%
|
||||
# imports
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import nnfs
|
||||
from nnfs.datasets import spiral_data, vertical_data
|
||||
nnfs.init()
|
||||
|
||||
# %%
|
||||
class Layer_Dense:
|
||||
def __init__(self, n_inputs, n_neurons):
|
||||
# Initialize the weights and biases
|
||||
self.weights = 0.01 * np.random.randn(n_inputs, n_neurons) # Normal distribution of weights
|
||||
self.biases = np.zeros((1, n_neurons))
|
||||
|
||||
def forward(self, inputs):
|
||||
# Calculate the output values from inputs, weights, and biases
|
||||
self.inputs = inputs
|
||||
self.output = np.dot(inputs, self.weights) + self.biases # Weights are already transposed
|
||||
|
||||
def backward(self, dvalues):
|
||||
        '''Calculates the gradient of the loss with respect to the weights and biases of this layer.
|
||||
        dvalues is equivalent to a transposed dl_dZ. It is the gradient
|
||||
of the loss with respect to the outputs of this layer.'''
|
||||
self.dweights = np.dot(self.inputs.T, dvalues)
|
||||
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
|
||||
self.dinputs = np.dot(dvalues, self.weights.T)
|
||||
|
||||
class Activation_ReLU:
|
||||
def forward(self, inputs):
|
||||
self.inputs = inputs
|
||||
self.output = np.maximum(0, inputs)
|
||||
|
||||
def backward(self, dvalues):
|
||||
        '''Calculates the gradient of the loss with respect to this layer's activation function.
|
||||
        dvalues is equivalent to a transposed dl_dZ. It is the gradient
|
||||
of the loss with respect to the outputs of this layer.'''
|
||||
self.dinputs = dvalues.copy()
|
||||
self.dinputs[self.inputs <= 0] = 0
|
||||
|
||||
class Activation_Softmax:
|
||||
def forward(self, inputs):
|
||||
# Get the unnormalized probabilities
|
||||
# Subtract max from the row to prevent larger numbers
|
||||
exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
|
||||
|
||||
# Normalize the probabilities with element wise division
|
||||
probabilities = exp_values / np.sum(exp_values, axis=1,keepdims=True)
|
||||
self.output = probabilities
|
||||
|
||||
# Base class for Loss functions
|
||||
class Loss:
|
||||
'''Calculates the data and regularization losses given
|
||||
model output and ground truth values'''
|
||||
def calculate(self, output, y):
|
||||
sample_losses = self.forward(output, y)
|
||||
data_loss = np.average(sample_losses)
|
||||
return data_loss
|
||||
|
||||
class Loss_CategoricalCrossEntropy(Loss):
|
||||
def forward(self, y_pred, y_true):
|
||||
'''y_pred is the neural network output
|
||||
y_true is the ideal output of the neural network'''
|
||||
samples = len(y_pred)
|
||||
# Bound the predicted values
|
||||
y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
|
||||
|
||||
if len(y_true.shape) == 1: # Categorically labeled
|
||||
correct_confidences = y_pred_clipped[range(samples), y_true]
|
||||
elif len(y_true.shape) == 2: # One hot encoded
|
||||
correct_confidences = np.sum(y_pred_clipped*y_true, axis=1)
|
||||
|
||||
# Calculate the losses
|
||||
negative_log_likelihoods = -np.log(correct_confidences)
|
||||
return negative_log_likelihoods
|
||||
|
||||
def backward(self, dvalues, y_true):
|
||||
samples = len(dvalues)
|
||||
|
||||
        # Number of labels in each sample
|
||||
labels = len(dvalues[0])
|
||||
|
||||
# if the labels are sparse, turn them into a one-hot vector
|
||||
if len(y_true.shape) == 1:
|
||||
y_true = np.eye(labels)[y_true]
|
||||
|
||||
# Calculate the gradient then normalize
|
||||
self.dinputs = -y_true / dvalues
|
||||
self.dinputs = self.dinputs / samples
|
||||
|
||||
class Activation_Softmax_Loss_CategoricalCrossentropy():
|
||||
def __init__(self):
|
||||
self.activation = Activation_Softmax()
|
||||
self.loss = Loss_CategoricalCrossEntropy()
|
||||
|
||||
def forward(self, inputs, y_true):
|
||||
self.activation.forward(inputs)
|
||||
self.output = self.activation.output
|
||||
return self.loss.calculate(self.output, y_true)
|
||||
|
||||
def backward(self, dvalues, y_true):
|
||||
samples = len(dvalues)
|
||||
|
||||
# if the samples are one-hot encoded, turn them into discrete values
|
||||
if len(y_true.shape) == 2:
|
||||
y_true = np.argmax(y_true, axis=1)
|
||||
|
||||
# Copy so we can safely modify
|
||||
self.dinputs = dvalues.copy()
|
||||
|
||||
# Calculate and normalize gradient
|
||||
self.dinputs[range(samples), y_true] -= 1
|
||||
self.dinputs = self.dinputs / samples
|
||||
|
||||
class Optimizer_SGD():
|
||||
def __init__(self, learning_rate=0.5, decay=0.0, momentum=0.0):
|
||||
self.initial_rate = learning_rate
|
||||
self.current_learning_rate = self.initial_rate
|
||||
self.decay = decay
|
||||
self.iterations = 0
|
||||
self.momentum = momentum
|
||||
|
||||
def pre_update_params(self):
|
||||
# Update the current_learning_rate before updating params
|
||||
if self.decay:
|
||||
self.current_learning_rate = self.initial_rate / (1 + self.decay * self.iterations)
|
||||
|
||||
def update_params(self, layer):
|
||||
if self.momentum:
|
||||
# For each layer, we need to use its last momentums
|
||||
|
||||
# First check if the layer has a last momentum stored
|
||||
if not hasattr(layer, 'weight_momentums'):
|
||||
layer.weight_momentums = np.zeros_like(layer.weights)
|
||||
layer.bias_momentums = np.zeros_like(layer.biases)
|
||||
|
||||
weight_updates = self.momentum * layer.weight_momentums - \
|
||||
self.current_learning_rate * layer.dweights
|
||||
layer.weight_momentums = weight_updates
|
||||
|
||||
bias_updates = self.momentum * layer.bias_momentums - \
|
||||
self.current_learning_rate * layer.dbiases
|
||||
layer.bias_momentums = bias_updates
|
||||
|
||||
# Not using momentum
|
||||
else:
|
||||
weight_updates = -self.current_learning_rate * layer.dweights
|
||||
bias_updates = -self.current_learning_rate * layer.dbiases
|
||||
|
||||
layer.weights += weight_updates
|
||||
layer.biases += bias_updates
|
||||
|
||||
def post_update_params(self):
|
||||
# Update the self.iterations for use with decay
|
||||
self.iterations += 1
|
||||
|
||||
class Optimizer_Adagrad():
|
||||
def __init__(self, learning_rate=0.5, decay=0.0, epsilon=1e-7):
|
||||
self.initial_learning_rate = learning_rate
|
||||
self.current_learning_rate = self.initial_learning_rate
|
||||
self.decay = decay
|
||||
self.iterations = 0
|
||||
self.epsilon = epsilon
|
||||
|
||||
def pre_update_params(self):
|
||||
if self.decay:
|
||||
self.current_learning_rate = self.initial_learning_rate / (1 + self.decay * self.iterations)
|
||||
|
||||
def update_params(self, layer):
|
||||
if not hasattr(layer, 'weight_cache'):
|
||||
layer.weight_cache = np.zeros_like(layer.weights)
|
||||
layer.bias_cache = np.zeros_like(layer.biases)
|
||||
|
||||
layer.weight_cache += layer.dweights**2
|
||||
layer.bias_cache += layer.dbiases**2
|
||||
|
||||
layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
|
||||
layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
|
||||
|
||||
def post_update_params(self):
|
||||
self.iterations += 1
|
||||
|
||||
class Optimizer_RMSProp():
|
||||
def __init__(self, learning_rate=1e-3, decay=0.0, epsilon=1e-7, rho=0.9):
|
||||
self.initial_learning_rate = learning_rate
|
||||
self.current_learning_rate = self.initial_learning_rate
|
||||
self.decay = decay
|
||||
self.iterations = 0
|
||||
self.epsilon = epsilon
|
||||
self.rho = rho
|
||||
|
||||
def pre_update_params(self):
|
||||
if self.decay:
|
||||
self.current_learning_rate = self.initial_learning_rate / (1 + self.decay * self.iterations)
|
||||
|
||||
def update_params(self, layer):
|
||||
if not hasattr(layer, 'weight_cache'):
|
||||
layer.weight_cache = np.zeros_like(layer.weights)
|
||||
layer.bias_cache = np.zeros_like(layer.biases)
|
||||
|
||||
layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
|
||||
layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2
|
||||
|
||||
layer.weights += -self.current_learning_rate * layer.dweights / (np.sqrt(layer.weight_cache) + self.epsilon)
|
||||
layer.biases += -self.current_learning_rate * layer.dbiases / (np.sqrt(layer.bias_cache) + self.epsilon)
|
||||
|
||||
def post_update_params(self):
|
||||
self.iterations += 1
|
||||
|
||||
# Adam optimizer
|
||||
class Optimizer_Adam():
|
||||
def __init__(self, learning_rate=0.001, decay=0.0, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
|
||||
self.initial_learning_rate = learning_rate
|
||||
self.current_learning_rate = learning_rate
|
||||
self.decay = decay
|
||||
self.iterations = 0
|
||||
self.epsilon = epsilon
|
||||
self.beta_1 = beta_1
|
||||
self.beta_2 = beta_2
|
||||
|
||||
def pre_update_params(self):
|
||||
if self.decay:
|
||||
self.current_learning_rate = self.initial_learning_rate * (1. / (1. + self.decay * self.iterations))
|
||||
|
||||
def update_params(self, layer):
|
||||
# If layer does not contain cache arrays, create them filled with zeros
|
||||
if not hasattr(layer, 'weight_cache'):
|
||||
layer.weight_momentums = np.zeros_like(layer.weights)
|
||||
layer.weight_cache = np.zeros_like(layer.weights)
|
||||
layer.bias_momentums = np.zeros_like(layer.biases)
|
||||
layer.bias_cache = np.zeros_like(layer.biases)
|
||||
|
||||
# Update momentum with current gradients
|
||||
layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
|
||||
layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
|
||||
|
||||
# Get corrected momentum
|
||||
# use self.iteration + 1 because we start at iteration 0
|
||||
weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
|
||||
bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
|
||||
|
||||
# Update cache with squared current gradients
|
||||
layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
|
||||
layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
|
||||
|
||||
# Get corrected cache
|
||||
weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
|
||||
bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))
|
||||
|
||||
# Vanilla SGD parameter update + normalization with square rooted cache
|
||||
layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
|
||||
layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
|
||||
|
||||
# Call once after any parameter updates
|
||||
def post_update_params(self):
|
||||
self.iterations += 1
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# # Generalization and Overfitting
|
||||
# Overfitting can occur when the neural network tries to fit every training sample perfectly. If the training data were perfect, this would not be an issue. However, because some training data is noisy or should not be expected to be classified perfectly, the network can sacrifice generalizability by trying to fit every training sample.
|
||||
#
|
||||
# If we could assign uncertainty to training data, I believe this would help.
|
||||
#
|
||||
# ## Out of Sample Data
|
||||
# Rather than use all of our data for training, we can set aside some for out of sample testing so we can better understand how well the network generalizes.
|
||||
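# %% [markdown]
# Editor's sketch (not part of the original lecture): when we cannot simply generate a fresh
# dataset, one common way to set data aside is to shuffle and split the data we already have.
# The names `X_all`, `y_all`, `train_fraction`, `X_holdout`, and `y_holdout` are illustrative;
# this reuses the `numpy` and `spiral_data` imports from above.

# %%
X_all, y_all = spiral_data(samples=100, classes=3)  # stand-in for a fixed dataset

train_fraction = 0.8                   # illustrative split
indices = np.arange(len(X_all))
np.random.shuffle(indices)             # shuffle so each split contains all classes
split = int(train_fraction * len(X_all))

X_train, y_train = X_all[indices[:split]], y_all[indices[:split]]
X_holdout, y_holdout = X_all[indices[split:]], y_all[indices[split:]]  # never used for training
print(X_train.shape, X_holdout.shape)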
|
||||
# %% [markdown]
|
||||
# ### Run the Adam Optimizer with the 100 Samples of Training Data
|
||||
|
||||
# %%
|
||||
# Create dataset
|
||||
X, y = spiral_data(samples=100, classes=3)
|
||||
|
||||
# Create Dense layer with 2 input features and 64 output values
|
||||
dense1 = Layer_Dense(2, 64)
|
||||
|
||||
# Create ReLU activation (to be used with Dense layer)
|
||||
activation1 = Activation_ReLU()
|
||||
|
||||
# Create second Dense layer with 64 input features (as we take output
|
||||
# of previous layer here) and 3 output values (output values)
|
||||
dense2 = Layer_Dense(64, 3)
|
||||
|
||||
# Create Softmax classifier's combined loss and activation
|
||||
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
|
||||
|
||||
# Create optimizer
|
||||
optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)
|
||||
|
||||
# Train in loop
|
||||
for epoch in range(10001):
|
||||
# Perform a forward pass of our training data through this layer
|
||||
dense1.forward(X)
|
||||
|
||||
# Perform a forward pass through activation function
|
||||
# takes the output of first dense layer here
|
||||
activation1.forward(dense1.output)
|
||||
|
||||
# Perform a forward pass through second Dense layer
|
||||
# takes outputs of activation function of first layer as inputs
|
||||
dense2.forward(activation1.output)
|
||||
|
||||
# Perform a forward pass through the activation/loss function
|
||||
# takes the output of second dense layer here and returns loss
|
||||
loss = loss_activation.forward(dense2.output, y)
|
||||
|
||||
# Calculate accuracy from output of activation2 and targets
|
||||
# calculate values along first axis
|
||||
predictions = np.argmax(loss_activation.output, axis=1)
|
||||
if len(y.shape) == 2:
|
||||
y = np.argmax(y, axis=1)
|
||||
accuracy = np.mean(predictions == y)
|
||||
|
||||
# Backward pass
|
||||
loss_activation.backward(loss_activation.output, y)
|
||||
dense2.backward(loss_activation.dinputs)
|
||||
activation1.backward(dense2.dinputs)
|
||||
dense1.backward(activation1.dinputs)
|
||||
|
||||
# Update weights and biases
|
||||
optimizer.pre_update_params()
|
||||
optimizer.update_params(dense1)
|
||||
optimizer.update_params(dense2)
|
||||
optimizer.post_update_params()
|
||||
|
||||
print(f'epoch: {epoch}, ' +
|
||||
f'acc: {accuracy:.3f}, ' +
|
||||
f'loss: {loss:.3f}, ' +
|
||||
f'lr: {optimizer.current_learning_rate}')
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# ### Now Use the Found Weights and Biases on Out of Sample Data
|
||||
|
||||
# %%
|
||||
# Create test dataset
|
||||
X_test, y_test = spiral_data(samples=100, classes=3)
|
||||
# Perform a forward pass of our testing data through this layer
|
||||
dense1.forward(X_test)
|
||||
# Perform a forward pass through activation function
|
||||
# takes the output of first dense layer here
|
||||
activation1.forward(dense1.output)
|
||||
# Perform a forward pass through second Dense layer
|
||||
# takes outputs of activation function of first layer as inputs
|
||||
dense2.forward(activation1.output)
|
||||
# Perform a forward pass through the activation/loss function
|
||||
# takes the output of second dense layer here and returns loss
|
||||
loss = loss_activation.forward(dense2.output, y_test)
|
||||
# Calculate accuracy from output of activation2 and targets
|
||||
# calculate values along first axis
|
||||
predictions = np.argmax(loss_activation.output, axis=1)
|
||||
if len(y_test.shape) == 2:
|
||||
y_test = np.argmax(y_test, axis=1)
|
||||
accuracy = np.mean(predictions == y_test)
|
||||
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')
|
||||
|
||||
# %% [markdown]
|
||||
# ### Observations
|
||||
# The out of sample accuracy is about 0.09 (roughly nine percentage points) lower than the training accuracy, and the loss is almost 3x the training loss.
|
||||
|
||||
# %% [markdown]
|
||||
# # Preventing Overfitting
|
||||
# ## Reducing Network Complexity
|
||||
# Simpler models are more robust against overfitting and tend to generalize better. This can mean reducing the number of neurons in a layer or reducing the number of layers. Effectively, you reduce the complexity of the functions that the network can model.
|
||||
#
|
||||
# ## Reduce the Number of Epochs
|
||||
# By allowing fewer training iterations, the network isn't given the time or opportunity to fit data points that might not be valid.
|
||||
#
|
||||
# These "hyper-parameters" can be adjusted after testing with out of sample data.
|
||||
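# %% [markdown]
# Editor's sketch (not part of the original lecture): one way to try the ideas above is to
# retrain with a smaller hidden layer and fewer epochs, then evaluate on the same
# out-of-sample data. The width (16), the epoch count (2000), and the `small_*` names are
# arbitrary illustrative choices; the classes and the `X`, `y`, `X_test`, `y_test` arrays
# come from the cells above.

# %%
small_dense1 = Layer_Dense(2, 16)   # fewer neurons than the 64 used above
small_relu = Activation_ReLU()
small_dense2 = Layer_Dense(16, 3)
small_loss_act = Activation_Softmax_Loss_CategoricalCrossentropy()
small_optimizer = Optimizer_Adam(learning_rate=0.02, decay=1e-5)

for epoch in range(2001):           # fewer epochs than the 10001 used above
    # Forward pass through the smaller network
    small_dense1.forward(X)
    small_relu.forward(small_dense1.output)
    small_dense2.forward(small_relu.output)
    small_loss_act.forward(small_dense2.output, y)

    # Backward pass
    small_loss_act.backward(small_loss_act.output, y)
    small_dense2.backward(small_loss_act.dinputs)
    small_relu.backward(small_dense2.dinputs)
    small_dense1.backward(small_relu.dinputs)

    # Parameter update
    small_optimizer.pre_update_params()
    small_optimizer.update_params(small_dense1)
    small_optimizer.update_params(small_dense2)
    small_optimizer.post_update_params()

# Evaluate the smaller model on the out-of-sample data from above
small_dense1.forward(X_test)
small_relu.forward(small_dense1.output)
small_dense2.forward(small_relu.output)
val_loss = small_loss_act.forward(small_dense2.output, y_test)
val_acc = np.mean(np.argmax(small_loss_act.output, axis=1) == y_test)
print(f'smaller model validation, acc: {val_acc:.3f}, loss: {val_loss:.3f}')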
|
||||
|
||||