# %% [markdown]
# # Previous Class Definitions
# The previously defined Layer_Dense, Activation_ReLU, and Activation_Softmax classes.

# %%
# Imports
import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

# %%
class Layer_Dense:
    def __init__(self, n_inputs, n_neurons):
        # Initialize the weights and biases
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)  # Small weights drawn from a normal distribution
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        # Calculate the output values from inputs, weights, and biases
        self.output = np.dot(inputs, self.weights) + self.biases  # Weights are stored as (n_inputs, n_neurons), so no transpose is needed


class Activation_ReLU:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)


class Activation_Softmax:
    def forward(self, inputs):
        # Get the unnormalized probabilities
        # Subtract the row max so the exponentials cannot overflow
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # Normalize the probabilities with element-wise division
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities

# %% [markdown]
# # Forward Pass with No Loss Consideration
# A network with 2 input features and 2 dense layers of 3 neurons each: ReLU activation after the first layer and Softmax after the second to normalize the outputs into probabilities.

# %%
# Create dataset
X, y = spiral_data(samples=100, classes=3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 3)

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output
# of previous layer here) and 3 output values
dense2 = Layer_Dense(3, 3)

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Make a forward pass of our training data through this layer
dense1.forward(X)

# Make a forward pass through the activation function
# it takes the output of the first dense layer here
activation1.forward(dense1.output)

# Make a forward pass through the second Dense layer
# it takes the output of the first layer's activation function as input
dense2.forward(activation1.output)

# Make a forward pass through the activation function
# it takes the output of the second dense layer here
activation2.forward(dense2.output)

# Let's see the output of the first few samples:
print(activation2.output[:5])

# %% [markdown]
# # Calculating Network Error with Categorical Cross Entropy Loss
# The loss is the negative sum of the expected output times the log of the neural network output:
#
# loss = -sum(expected_i * log(nn_output_i)) for all i in outputs
#
# In the classification case, incorrect outputs do not end up mattering, because expected_i is 0 for every wrong class.

# %%
nn_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08]])
class_targets = [0, 1, 1]

# Pick each sample's predicted confidence for its correct class, then take the negative log
losses = -np.log(nn_outputs[range(len(nn_outputs)), class_targets])
print(f"Losses: {losses}")
print(f"Average Loss: {np.average(losses)}")

# %% [markdown]
# ## Loss with One Hot Encoding
# In classification, the expected output is typically all zeros except for a one at the class the inputs belong to. This one-hot encoding simplifies the cross entropy loss calculation.
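# %% [markdown]
# As a quick aside (a sketch, not part of the original walkthrough): integer class labels can be
# turned into one-hot rows by indexing an identity matrix with `np.eye`. This is one common way to
# build targets shaped like the `true_output` array used below; the variable names here are
# illustrative only.

# %%
# Sketch: convert integer class targets into one-hot rows (assumes 3 classes)
int_targets = np.array([0, 1, 1])         # illustrative integer labels
one_hot_targets = np.eye(3)[int_targets]  # row i of the identity matrix is the one-hot vector for class i
print(one_hot_targets)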
# %%
true_output = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0]
])
nn_output = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08]
])

# Element-by-element multiplication "erases" the output terms corresponding with 0
A = true_output * nn_output
# Sum across each row (axis=1) because each row is one sample's output in the batch
B = np.sum(A, axis=1)
# Get the cross entropy loss
C = -np.log(B)

print(f"Losses: {C}")
print(f"Average Loss: {np.mean(C)}")

# %% [markdown]
# ## Implementing the Loss Class

# %%
# Base class for Loss functions
class Loss:
    '''Calculates the data and regularization losses given model output and ground truth values'''

    def calculate(self, output, y):
        sample_losses = self.forward(output, y)
        data_loss = np.average(sample_losses)
        return data_loss

# %% [markdown]
# ## Implementing the Categorical Cross Entropy Loss Class

# %%
class Loss_CategoricalCrossEntropy(Loss):
    def forward(self, y_pred, y_true):
        '''y_pred is the neural network output
        y_true is the ideal output of the neural network'''
        samples = len(y_pred)

        # Bound the predicted values away from 0 and 1 so the log never blows up
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) == 1:
            # Categorically labeled (integer class indices)
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # One-hot encoded
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        # Calculate the losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

# %%
nn_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08]])
class_targets = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0]])

loss_function = Loss_CategoricalCrossEntropy()
# forward() returns the per-sample losses; calculate() returns their average
losses = loss_function.forward(nn_outputs, class_targets)
print(f"Losses: {losses}")
print(f"Average Loss: {loss_function.calculate(nn_outputs, class_targets)}")

# %% [markdown]
# # Introducing Accuracy
# In this simple scheme, a prediction is fully correct whenever the highest output value lands on the correct class, so its accuracy is 1. Even if the output were 51% red and 49% blue and the true class is red, the prediction would still count as fully accurate.

# %%
nn_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08]])
class_targets = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0]])

# Calculate the losses
loss_function = Loss_CategoricalCrossEntropy()
losses = loss_function.forward(nn_outputs, class_targets)
print(f"Losses: {losses}")
print(f"Average Loss: {np.average(losses)}")

# Calculate the accuracy
predictions = np.argmax(nn_outputs, axis=1)
# If targets are one-hot encoded - convert them to class indices
if len(class_targets.shape) == 2:
    class_targets = np.argmax(class_targets, axis=1)
# True evaluates to 1; False to 0
accuracy = np.mean(predictions == class_targets)
print(f"Accuracy: {accuracy}")
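# %% [markdown]
# # Putting It Together
# The cell below is a minimal sketch (not part of the original walkthrough) that runs the spiral
# data through the two-layer network defined above, then reports the average categorical cross
# entropy loss and the accuracy using the integer labels `y` returned by `spiral_data`.

# %%
# Fresh dataset and model objects, reusing the classes defined earlier
X, y = spiral_data(samples=100, classes=3)

dense1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()
dense2 = Layer_Dense(3, 3)
activation2 = Activation_Softmax()
loss_function = Loss_CategoricalCrossEntropy()

# Forward pass through both dense layers and their activations
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)

# Average loss of the softmax probabilities against the integer labels
loss = loss_function.calculate(activation2.output, y)

# Accuracy: fraction of samples whose argmax matches the label
predictions = np.argmax(activation2.output, axis=1)
accuracy = np.mean(predictions == y)

print(f"Average Loss: {loss}")
print(f"Accuracy: {accuracy}")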