I wrote a simple neural network binary classification algorithm using PyTorch. It uses the dataset from https://www.kaggle.com/pritsheta/heart-attack, which consists of a table with 300 rows and 14 columns. The final column, 'target', is the training goal and indicates whether the patient has heart disease.
I define two classes, CustomDataset and NeuralNet. The CustomDataset modifies the data into Pytorch tensors. This is done by standardizing the columns containing quantities and one-hot-encoding the columns containing categorical data.
The NeuralNet class represents a fully connected neural network with a hidden layer and a sigmoid function at the end.
Furthermore, the function get_accuracy() gets the fraction of correctly predicted labels. Finally, I create a loop that trains the neural network and plot the losses and accuracies.
from torch.utils.data import DataLoader,Dataset,random_split
from torch import Generator,nn
import torch
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
class CustomDataset(Dataset):
    """Tabular heart-disease data exposed as feature / target tensors.

    The feature matrix ``self.x`` is laid out, left to right, as:

    1. ``self.cols_standardize`` -- numeric columns, standardized to zero
       mean and unit (sample) standard deviation.
    2. ``self.cols_binary`` -- 0/1 columns, copied as-is.
    3. ``self.cols_ohe`` -- categorical columns, one-hot encoded.

    ``self.y`` holds the ``'target'`` column as float32, and
    ``self.x_cols_num`` is the total number of feature columns.
    """

    def __init__(self, file):
        """Read the csv and convert it to tensors.

        :param file: Path or file-like object accepted by ``pandas.read_csv``.
        """
        self.data = pd.read_csv(file)

        # Set columns to take into account
        self.cols_standardize = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
        self.cols_binary = ['sex', 'exang', 'fbs']
        self.cols_ohe = ['cp', 'restecg', 'slope', 'ca', 'thal']

        feature_blocks = []

        # Standardized values. A constant column has std == 0 (and a
        # single-row table has an undefined std); divide by 1 in those cases
        # so the column becomes all zeros instead of NaN.
        numeric = self.data[self.cols_standardize]
        stds = numeric.std()
        stds = stds.where(stds > 0, 1.0)
        x_std = (numeric - numeric.mean()) / stds
        feature_blocks.append(
            torch.tensor(x_std.to_numpy(dtype=np.float64), dtype=torch.float64)
        )

        # Binary values
        x_bin = self.data[self.cols_binary].to_numpy(dtype=np.float64)
        feature_blocks.append(torch.tensor(x_bin, dtype=torch.float64))

        # One-hot encoded values. Raw category values are first mapped to
        # dense codes 0..k-1 (in sorted order), so the encoding also works
        # when the stored labels are not contiguous or do not start at 0.
        # For labels that already are 0..k-1 the codes equal the labels.
        for col in self.cols_ohe:
            categories, codes = np.unique(
                self.data[col].to_numpy(), return_inverse=True
            )
            codes = torch.tensor(codes.reshape(-1), dtype=torch.int64)
            one_hot = nn.functional.one_hot(codes, len(categories))
            feature_blocks.append(one_hot.to(torch.float64))

        self.x = torch.cat(feature_blocks, dim=1)
        self.x_cols_num = self.x.shape[1]

        # Set target values to a float32 tensor (the dtype BCELoss expects)
        self.y = torch.tensor(self.data['target'].to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair(s) selected by ``idx``."""
        return self.x[idx], self.y[idx]
class NeuralNet(nn.Module):
    """
    Fully connected binary classifier: linear -> ReLU -> linear -> sigmoid.

    The network has a single hidden layer and emits one probability per
    input row (output shape ``(batch, 1)``).
    """

    def __init__(self, input_size, hidden_size):
        """
        :param input_size: Number of input features
        :param hidden_size: Number of units in the hidden layer
        """
        super().__init__()
        self.layer1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(in_features=hidden_size, out_features=1)
        self.out_layer = nn.Sigmoid()

    def forward(self, x):
        """
        :param x: Batch of feature rows; cast to float32 before the first layer
        :return: Sigmoid-activated output of the second linear layer
        """
        hidden = self.relu(self.layer1(x.float()))
        return self.out_layer(self.layer2(hidden))
def get_accuracy(y_true, y_prob):
    """
    Fraction of samples whose thresholded probability matches the label.

    :param y_true: True values for y (0/1), shape (N,)
    :param y_prob: Estimated probabilities for y, with the same number of
        elements as y_true; e.g. shape (N,) or the raw model output (N, 1)
    :return: Accuracy of estimation as a 0-dim tensor
    """
    # Align shapes before comparing: a (N, 1) prediction compared with a
    # (N,) label would otherwise broadcast to (N, N) and silently produce a
    # wrong count. reshape raises if the element counts really differ.
    y_estimate = y_prob.reshape(y_true.shape) > 0.5
    return (y_true == y_estimate).sum() / y_true.size(0)
def main():
    """Train the classifier on ``data.csv`` and plot per-epoch training loss
    and test-set accuracy."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set parameters
    learning_rate = 1e-2
    num_epochs = 25
    weight_decay = 1e-4
    batch_size = 32
    hidden_size = 10
    train_split = 0.8

    # Create dataset and split it into train and test data
    dataset = CustomDataset('data.csv')
    train_len = round(train_split * len(dataset))
    train_data, test_data = random_split(
        dataset,
        [train_len, len(dataset) - train_len],
        generator=Generator().manual_seed(0),
    )
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # The test set is evaluated as one batch; fetch it and move it to the
    # model's device once instead of on every epoch.
    x_test, y_test = test_data[:]
    x_test, y_test = x_test.to(device), y_test.to(device)

    # Create model
    model = NeuralNet(dataset.x_cols_num, hidden_size).to(device)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )

    # Train model
    accs = []
    losses = []
    for epoch in range(num_epochs):
        print('Epoch ', epoch)
        model.train()
        running_loss = 0.0
        for x, y in train_dataloader:
            # Inputs must live on the same device as the model parameters.
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            yhat = model(x)
            loss = criterion(yhat[:, 0], y)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a short last batch does not skew the mean.
            running_loss += loss.item() * y.size(0)
        # One loss value per epoch, so it shares an x-axis with the accuracy.
        losses.append(running_loss / len(train_data))

        # Get accuracy
        model.eval()
        with torch.no_grad():
            y_pred = model(x_test)[:, 0]
            acc = get_accuracy(y_test, y_pred)
        print(f'test set accuracy: {acc}')
        accs.append(acc.item())

    # Plot results
    plt.figure()
    plt.plot(accs)
    plt.plot(losses)
    plt.xlabel('epoch')
    plt.legend(['test set accuracy', 'loss'])
    plt.show()


if __name__ == '__main__':
    main()
Since this is my first time working with PyTorch, I'm quite sure there are many suggestions for improvement and everything is welcome. However, I am particularly interested in the following:
- Machine learning wise improvements: I know there are probably many things to improve on this and my goal is not to build the perfect predictor. However, are there any standard things that basically everyone with a bit of experience would add, that I seem to be missing?
- Conventions/standard coding/rookie mistakes: Are there any things that I am doing that are considered obsolete/stupid/overly complicated?
- Code structure: I create 2 classes and 1 function. Does this seem okay as a structure, or is something else recommended?
Thanks in advance for any feedback!