"""Simple (one-variable) linear regression on a two-column CSV dataset.

The script loads the data, fits a line by ordinary least squares on a random
training split, reports the root mean square error on the remaining rows and
plots the data together with the fitted line.
"""

from csv import reader
from math import sqrt
from random import randrange
from random import seed

from matplotlib import pyplot as plot


def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows (each row a list of strings).

    Args:
        filename: Path of the CSV file to read.
        skip: When true, the first row (typically a header) is discarded.

    Returns:
        A list with one list of string fields per non-empty row.
    """
    dataset = []
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # A default avoids StopIteration when the file has no rows at all.
            next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # Blank lines come back as empty lists; ignore them.
                continue
            dataset.append(row)
    return dataset


def string_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Surrounding white space is stripped before the conversion and the
    original string value is overwritten.

    Args:
        dataset: List of rows (mutable sequences) to modify.
        column: Index of the column to convert.

    Raises:
        ValueError: If a value cannot be parsed as a float.
    """
    for row in dataset:
        row[column] = float(row[column].strip())


def mean(values):
    """Return the arithmetic mean of *values*.

    Raises:
        ZeroDivisionError: If *values* is empty.
    """
    return sum(values) / float(len(values))


def regularisation(parameter, lambda_value=0.01):
    """Return *parameter* scaled by ``(1 - lambda_value)``."""
    return parameter * (1 - lambda_value)


def leastSquares(dataset):
    """Fit ``y = b0 + b1 * x`` to the dataset by ordinary least squares.

    Column 0 of each row is used as x and column 1 as y.

    Args:
        dataset: Non-empty list of rows with numeric values in columns 0 and 1.

    Returns:
        ``[b0, b1]``: the intercept and the slope of the fitted line. When
        every x value is identical the slope cannot be determined, so the
        horizontal line through the mean of y is returned (``b1 == 0.0``).

    Raises:
        ZeroDivisionError: If *dataset* is empty.
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]

    x_mean = mean(x)
    y_mean = mean(y)

    # b1 = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
    numerator = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    denominator = sum((xi - x_mean) ** 2 for xi in x)

    if denominator == 0:
        # Zero variance in x: fall back to the best constant predictor
        # instead of dividing by zero.
        return [y_mean, 0.0]

    b1 = numerator / denominator
    b0 = y_mean - b1 * x_mean
    return [b0, b1]


def root_mean_square_error(actual, predicted):
    """Return the root mean square error between two sequences.

    Args:
        actual: Observed values.
        predicted: Predicted values; must be at least as long as *actual*.

    Raises:
        ZeroDivisionError: If *actual* is empty.
        IndexError: If *predicted* is shorter than *actual*.
    """
    # Index into ``predicted`` so a too-short prediction list still fails
    # loudly rather than being silently truncated.
    sum_error = sum(
        (predicted[i] - value) ** 2 for i, value in enumerate(actual)
    )
    return sqrt(sum_error / len(actual))


def simple_linear_regression(train, test):
    """Fit a line on *train* and predict y for every row of *test*.

    Args:
        train: Rows used to estimate the coefficients (x in column 0,
            y in column 1).
        test: Rows to predict for; only column 0 of each row is read.

    Returns:
        A list with one prediction per row of *test*.
    """
    b0, b1 = leastSquares(train)
    return [b0 + b1 * row[0] for row in test]


def train_test_split(dataset, split):
    """Randomly split *dataset* into a training part and a test part.

    Rows are drawn without replacement using ``random.randrange``, so the
    result is reproducible after ``random.seed``. *dataset* itself is not
    modified; the returned lists share the row objects with it.

    Args:
        dataset: List of rows.
        split: Fraction of the rows that goes into the training part.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    train = []
    test = list(dataset)
    train_size = int(split * len(dataset))
    while len(train) < train_size:
        index = randrange(len(test))
        train.append(test.pop(index))
    return train, test


def evaluate_simple_linear_regression(dataset, split=0):
    """Return the RMSE of simple linear regression on a random split.

    The model is fitted on the training part and evaluated on the remaining
    rows. The last column of each test row is blanked out in the copy handed
    to the model, so the predictions cannot depend on the true target.

    Args:
        dataset: List of numeric rows (x in column 0, target in last column).
        split: Fraction of the rows used for training. With the default of 0
            the training part is empty and fitting fails.

    Raises:
        ZeroDivisionError: If the training part or the test part is empty.
    """
    train, test = train_test_split(dataset, split)

    test_set = []
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)

    predicted = simple_linear_regression(train, test_set)
    actual = [row[-1] for row in test]
    return root_mean_square_error(actual, predicted)


def visualise_dataset(dataset):
    """Plot the data points and the regression line fitted on all of them.

    Args:
        dataset: List of numeric rows (x in column 0, y in column 1).
    """
    sizes = [row[0] for row in dataset]
    prices = [row[1] for row in dataset]
    # Only column 0 of each row is read for prediction, so the dataset can be
    # passed as-is; plot the fitted values against the plain x column.
    fitted = simple_linear_regression(dataset, dataset)

    plot.figure()
    plot.plot(sizes, prices, 'x')
    plot.plot(sizes, fitted)
    plot.xlabel('Fertility rate')
    plot.ylabel('Worker percent')
    plot.grid()
    plot.tight_layout()
    plot.show()


def main():
    """Load the CSV, report the RMSE of the model and show the plot."""
    # Fixed seed so the train/test split is the same on every run.
    seed(1)
    filename = 'fertility_rate-worker_percent.csv'
    dataset = load_csv(filename, skip=True)
    for i in range(len(dataset[0])):
        string_column_to_float(dataset, i)
    split = 0.6
    rmse = evaluate_simple_linear_regression(dataset, split)
    print('Root Mean Square Error: %.3f' % rmse)
    visualise_dataset(dataset)


if __name__ == '__main__':
    main()