I've created a binary classification model from scratch, just to understand the intuition behind it.
However, when I compare my implementation to a TensorFlow/PyTorch model with the same parameters and configuration, my model needs about 3 000 epochs to reach similar results, while the TensorFlow/PyTorch model gets there in about 300 epochs.
I also noticed that my model computes very small gradients, whereas TensorFlow/PyTorch computes much larger gradients in every epoch.
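For context, this is roughly the kind of reference model I compare against (a sketch, not my exact benchmark script; I'm assuming the same architecture 2 → 8 → 8 → 1 with ReLU/sigmoid activations, full-batch SGD, and a learning rate of 0.01):

```python
# Sketch of the assumed PyTorch reference: same layer sizes, BCE loss,
# plain full-batch SGD with lr=0.01 for 300 epochs.
import torch
import torch.nn as nn
from sklearn.datasets import make_moons

x, y = make_moons(n_samples=1000, noise=0.2, random_state=100)
x_t = torch.from_numpy(x).float()
y_t = torch.from_numpy(y).float().unsqueeze(1)   # shape (1000, 1)

model = nn.Sequential(
    nn.Linear(2, 8), nn.ReLU(),
    nn.Linear(8, 8), nn.ReLU(),
    nn.Linear(8, 1), nn.Sigmoid(),
)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(300):
    optimizer.zero_grad()
    y_hat = model(x_t)            # forward pass on the full batch
    loss = criterion(y_hat, y_t)  # binary cross-entropy
    loss.backward()               # autograd computes the gradients
    optimizer.step()              # SGD update
```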
My questions are:

- Is there any way to optimize the gradient calculation in the `backward` function so that the model learns faster?
- Is there any other part that could be optimized/simplified, and how could it be implemented?
Below is my `backward` function, responsible for calculating the gradient:
```python
def backward(
    y: np.ndarray,
    y_pred: np.ndarray,
    layers: List[Dict[str, np.ndarray]]
) -> None:
    loss: np.ndarray = binary_cross_entropy_loss_prime(y, y_pred)
    for layer in reversed(layers):
        dZ: np.ndarray = layer['prime'](layer['z']) * loss
        layer['db'] = (dZ * np.ones_like(layer['b'])).sum(axis=0, keepdims=True) / loss.shape[0]
        dU: np.ndarray = dZ * np.ones_like(layer['u'])
        layer['dw'] = np.dot(layer['x'].T, dU) / loss.shape[0]
        loss = np.dot(dU, layer['w'].T)
```
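For reference, the per-layer chain rule this loop implements (writing the forward pass as $z = xW + b$, $a = g(z)$, with $m$ the batch size and $\odot$ the element-wise product) is:

$$
\frac{\partial L}{\partial z} = g'(z) \odot \frac{\partial L}{\partial a},\qquad
\frac{\partial L}{\partial b} = \frac{1}{m}\sum_{i=1}^{m}\left(\frac{\partial L}{\partial z}\right)_{i},\qquad
\frac{\partial L}{\partial W} = \frac{1}{m}\,x^{\top}\frac{\partial L}{\partial z},\qquad
\frac{\partial L}{\partial x} = \frac{\partial L}{\partial z}\,W^{\top}
$$

so `dZ` holds $\partial L/\partial z$, and the `loss` variable carries $\partial L/\partial a$ back to the previous layer.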
And here is the full code, with data types, for easier understanding:
"""# Dataset and libraries"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict
from sklearn.datasets import make_moons
x, y = make_moons(n_samples = 1000, noise = 0.2, random_state = 100)
# expand y second dim
# before expand_dims -> y.shape = (1000, )
# after expand_dims -> y.shape = (1000, 1)
y = np.expand_dims(y, 1)
# final shapes: X -> (1000, 2), Y -> (1000, 1)
"""# Activations functions"""
def sigma(x: np.ndarray) -> np.ndarray:
return 1 / (1 + np.exp(-x))
def sigma_prime(x: np.ndarray) -> np.ndarray:
e = np.exp(x)
return e / (e + 1) ** 2
def relu(x: np.ndarray) -> np.ndarray:
return np.maximum(0, x)
def relu_prime(x: np.ndarray) -> np.ndarray:
return np.where(x <= 0, 0, 1)
"""# Dense layers"""
dense_layers = [
{ 'w': np.random.rand(2, 8) * 0.1, 'b': np.random.rand(1, 8) * 0.1, 'activ': relu, 'prime': relu_prime },
{ 'w': np.random.rand(8, 8) * 0.1, 'b': np.random.rand(1, 8) * 0.1, 'activ': relu, 'prime': relu_prime },
{ 'w': np.random.rand(8, 1) * 0.1, 'b': np.random.rand(1, 1) * 0.1, 'activ': sigma, 'prime': sigma_prime }
]
"""# Losses and metrics """
def binary_cross_entropy_loss(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    number_of_rows = y_true.shape[0]  # 1000 rows
    number_of_cols = y_true.shape[1]  # 1 col
    return np.sum(-(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))) / (number_of_rows * number_of_cols)

def binary_cross_entropy_loss_prime(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    # dL/dy_pred for binary cross-entropy
    return (1 - y_true) / (1 - y_pred) - y_true / y_pred

def accuracy(y_true: np.ndarray, y_pred: np.ndarray, threshold: float = 0.5) -> float:
    return (np.where(y_pred <= threshold, 0, 1) == y_true).mean()
"""# Forward propagation"""
def forward(x: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ]) -> np.ndarray:
for layer in layers:
layer['x'] = x
layer['u'] = np.dot(x, layer['w'])
layer['z'] = layer['u'] + layer['b']
layer['a'] = layer['activ'](layer['z'])
x = layer['a']
return x
"""# Backward propagation"""
def backward(y: np.ndarray, y_pred: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ]) -> None:
loss: np.ndarray = binary_cross_entropy_loss_prime(y, y_pred)
for layer in reversed(layers):
dZ: np.ndarray = layer['prime'](layer['z']) * loss
layer['db'] = (dZ * np.ones_like(layer['b'])).sum(axis = 0, keepdims=True) / loss.shape[0]
dU: np.ndarray = dZ * np.ones_like(layer['u'])
layer['dw'] = np.dot(layer['x'].T, dU) / loss.shape[0]
loss = np.dot(dU, layer['w'].T)
"""# Update weights and biases (SGD optimizer)"""
def update(layers: List[ Dict[ str, np.ndarray ] ], learning_rate: float) -> None:
for layer in layers:
layer['w'] -= learning_rate * layer['dw']
layer['b'] -= learning_rate * layer['db']
"""# Train model"""
def train(x: np.ndarray, y: np.ndarray, layers: List[ Dict[ str, np.ndarray ] ], epochs: int, learning_rate: float) -> None:
for epoch in range(epochs):
# Forward propagation
y_hat = forward(x, layers)
# Backward propagation
backward(y, y_hat, layers)
# Update layers
update(layers, learning_rate)
# show progress
if epoch % 100 == 0:
print('Iteration nr: ', epoch, ', loss: ', binary_cross_entropy_loss(y, y_hat), ', accuracy: ', accuracy(y, y_hat))
train(x, y, dense_layers, 3001, 0.01)
```