from csv import reader

import numpy as np
import matplotlib.pyplot as plt


def load_csv(filename, skip=False):
    dataset = list()
    with open(filename, 'r', newline='') as file:
        csv_reader = reader(file)
        if skip:
            next(csv_reader)  # skip the header row
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


def diagnosis_column_to_number(dataset, column):
    # Encode the diagnosis labels numerically: malignant -> 0, benign -> 1.
    for row in dataset:
        if row[column] == 'M':
            row[column] = 0
        elif row[column] == 'B':
            row[column] = 1


def extract_only_x_data(dataset):
    # Every column except the last one is a feature.
    if len(dataset) == 0:
        return
    data = list()
    for i in range(len(dataset)):
        data.append(list())
        for j in range(len(dataset[i]) - 1):
            data[-1].append(float(dataset[i][j]))
    return data


def extract_only_y_data(dataset):
    # The last column holds the (already numeric) diagnosis label.
    if len(dataset) == 0:
        return
    data = list()
    for i in range(len(dataset)):
        data.append(int(dataset[i][-1]))
    return data


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def loss(y, y_hat):
    # Binary cross-entropy; epsilon prevents log(0).
    epsilon = 1e-9
    return -np.mean(y * np.log(y_hat + epsilon) + (1 - y) * np.log(1 - y_hat + epsilon))


def gradients(X, y, y_hat):
    # Number of training examples.
    number_of_examples = X.shape[0]

    # Gradient of the loss w.r.t. the weights.
    dw = (1 / number_of_examples) * np.dot(X.T, (y_hat - y))

    # Gradient of the loss w.r.t. the bias.
    db = (1 / number_of_examples) * np.sum(y_hat - y)

    return dw, db


def train(X, y, batch_size, epochs, learning_rate):
    number_of_examples, number_of_features = X.shape

    # Initializing weights and bias to zeros.
    weights = np.zeros((number_of_features, 1))
    bias = 0

    # Reshaping y into a column vector.
    y = y.reshape(number_of_examples, 1)

    # Empty list to store losses.
    losses = []

    # Training loop: mini-batch gradient descent.
    for epoch in range(epochs):
        for i in range((number_of_examples - 1) // batch_size + 1):
            # Defining the current batch.
            start_i = i * batch_size
            end_i = start_i + batch_size
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]

            # Calculating hypothesis/prediction.
            y_hat = sigmoid(np.dot(xb, weights) + bias)

            # Getting the gradients of loss w.r.t. parameters.
            dw, db = gradients(xb, yb, y_hat)

            # Updating the parameters.
            weights -= learning_rate * dw
            bias -= learning_rate * db

        # Calculating the full-dataset loss and appending it to the list.
        losses.append(loss(y, sigmoid(np.dot(X, weights) + bias)))

    # Returning weights, bias and losses (list).
    return weights, bias, losses


# Make the predictions.
def predict(X, w, b):
    # Calculating predictions/y_hat.
    preds = sigmoid(np.dot(X, w) + b)

    # Threshold the probabilities:
    # y_hat >= 0.5 rounds up to 1, y_hat < 0.5 rounds down to 0.
    pred_class = [1 if i >= 0.5 else 0 for i in preds]

    return np.array(pred_class)


# Obtain the accuracy.
def accuracy(y, y_hat):
    return np.sum(y == y_hat) / len(y)


# Output the plot.
def plot_decision_boundary(X, y, w, b):
    # X: inputs, y: labels, w: weights, b: bias.
    plt.figure(figsize=(10, 8))
    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "g^")
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "bs")
    plt.xlim([-2, 2])
    plt.ylim([0, 2.2])
    plt.xlabel("feature 1")
    plt.ylabel("feature 2")
    plt.title('Decision Boundary')

    # The boundary is the line y = mx + c.
    # Equate mx + c = w.X + b and solve for m and c.
    x1 = np.array([min(X[:, 0]), max(X[:, 0])])
    if w[1] != 0:
        m = -w[0] / w[1]
        c = -b / w[1]
        x2 = m * x1 + c
        plt.plot(x1, x2, 'y-')
    plt.show()
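
# Optional sketch (not part of the original pipeline): raw features in a
# WDBC-style CSV can span very different ranges, which makes gradient descent
# slow or unstable at a fixed learning rate. A hypothetical standardize()
# helper like the one below could z-score the features before train() is
# called; the helper name and its use here are assumptions, not something the
# original script does.
def standardize(X):
    # Z-score each feature column of a 2-D float array.
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    std[std == 0] = 1  # avoid division by zero for constant features
    return (X - mean) / std
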
### Evaluate the algorithm.
filename = 'breast_cancer_data.csv'
dataset = load_csv(filename, skip=True)
diagnosis_column_to_number(dataset, 2)
X_train_data = extract_only_x_data(dataset)
y_train_data = extract_only_y_data(dataset)
X = np.array(X_train_data)
y = np.array(y_train_data)

# Training.
w, b, l = train(X, y, batch_size=100, epochs=1000, learning_rate=0.01)

# Plotting the decision boundary.
plot_decision_boundary(X, y, w, b)

# Reporting the training accuracy.
print(accuracy(y, y_hat=predict(X, w, b)))
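
# Optional sketch: train() returns the per-epoch losses in l, but the script
# never inspects them. Assuming the run above has completed, plotting the loss
# history is a quick way to check that gradient descent is converging.
plt.figure(figsize=(8, 5))
plt.plot(range(len(l)), l)
plt.xlabel("epoch")
plt.ylabel("binary cross-entropy loss")
plt.title("Training loss per epoch")
plt.show()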