You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

150 lines
3.4 KiB
Python

from math import sqrt
from matplotlib import pyplot as plot
from random import seed
from random import randrange
from csv import reader
def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to open.
        skip: When true, the first record (typically a header line) is
            discarded before any rows are collected.

    Returns:
        A list holding one list of string fields per non-empty CSV record,
        in file order. An empty file yields an empty list.
    """
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # Supplying a default keeps an empty file from raising
            # StopIteration: there is simply no header to discard.
            next(csv_reader, None)
        # Blank lines come back as empty rows; leave them out.
        return [row for row in csv_reader if row]
def string_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: List of mutable rows; each row is modified directly.
        column: Index of the field to convert in each row.

    Returns:
        None. The parsed floats overwrite the original strings.
    """
    for record in dataset:
        raw_text = record[column]
        # Trim surrounding whitespace before parsing the number.
        trimmed = raw_text.strip()
        record[column] = float(trimmed)
def mean(values):
    """Return the arithmetic mean of ``values``.

    Args:
        values: Sized collection of numbers.

    Returns:
        The sum of the values divided by their count (as a float divisor).

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count
def regularisation(parameter, lambda_value=0.01):
    """Shrink ``parameter`` by the fraction ``lambda_value``.

    Args:
        parameter: Value to scale down.
        lambda_value: Fraction removed from the parameter (default 0.01).

    Returns:
        ``parameter * (1 - lambda_value)``.
    """
    shrink_factor = 1 - lambda_value
    return parameter * shrink_factor
def leastSquares(dataset):
    """Fit the line ``y = b0 + b1 * x`` by ordinary least squares.

    Args:
        dataset: Rows whose first field is the x value and whose second
            field is the y value.

    Returns:
        ``[b0, b1]``: the intercept followed by the slope.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty, or if the sum of squared
            x deviations is zero (for example with a single row).
    """
    xs = [row[0] for row in dataset]
    ys = [row[1] for row in dataset]
    x_mean = mean(xs)
    y_mean = mean(ys)
    # Slope = sum of co-deviations / sum of squared x deviations.
    co_deviation = sum(
        (x_value - x_mean) * (y_value - y_mean)
        for x_value, y_value in zip(xs, ys)
    )
    x_deviation_sq = sum((x_value - x_mean) ** 2 for x_value in xs)
    slope = co_deviation / x_deviation_sq
    # The fitted line passes through the point (x_mean, y_mean).
    intercept = y_mean - slope * x_mean
    return [intercept, slope]
def root_mean_square_error(actual, predicted):
    """Return the root mean square error between two sequences.

    Args:
        actual: Observed values; its length sets how many pairs are compared.
        predicted: Predicted values, read at the same positions as ``actual``.

    Returns:
        The square root of the mean of the squared differences.

    Raises:
        ZeroDivisionError: If ``actual`` is empty.
    """
    squared_errors = [
        (predicted[position] - observed) ** 2
        for position, observed in enumerate(actual)
    ]
    mean_squared_error = sum(squared_errors) / len(actual)
    return sqrt(mean_squared_error)
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a y value for each ``test`` row.

    Args:
        train: Rows passed to ``leastSquares`` to obtain the coefficients.
        test: Rows to predict for; only the first field (x) of each is read.

    Returns:
        A list with one prediction ``b0 + b1 * x`` per test row, in order.
    """
    intercept, slope = leastSquares(train)
    return [intercept + slope * row[0] for row in test]
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into training and test rows.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        split: Fraction of the rows to move into the training set.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds
        ``int(split * len(dataset))`` rows drawn at random without
        replacement; ``test`` holds the rows that were not drawn, in their
        original relative order.
    """
    remaining = list(dataset)
    wanted = int(split * len(dataset))
    chosen = []
    # One randrange draw per training row, each over the rows still left.
    for _ in range(wanted):
        position = randrange(len(remaining))
        chosen.append(remaining.pop(position))
    return chosen, remaining
def evaluate_simple_linear_regression(dataset, split=0):
    """Score simple linear regression on a random train/test split.

    Args:
        dataset: Rows to split; the last field of each row is the target.
        split: Fraction of rows used for training. NOTE: the default of 0
            leaves the training set empty, so callers are expected to pass
            a positive fraction.

    Returns:
        The root mean square error of the predictions on the test rows.
    """
    train, test = train_test_split(dataset, split)
    # Hand the model copies of the test rows with the target blanked out,
    # so the prediction step cannot read the answer.
    masked_rows = []
    for row in test:
        masked = list(row)
        masked[-1] = None
        masked_rows.append(masked)
    predicted = simple_linear_regression(train, masked_rows)
    actual = [row[-1] for row in test]
    return root_mean_square_error(actual, predicted)
def visualise_dataset(dataset):
    """Scatter-plot the dataset and overlay the fitted regression line.

    The line is fitted on the whole dataset and evaluated at every x value
    in it. The figure is displayed with ``plot.show()``.

    Args:
        dataset: Rows whose first field is the x value and whose second
            field is the y value.

    Returns:
        None.
    """
    x_values = [row[0] for row in dataset]
    y_values = [row[1] for row in dataset]
    # Rows handed to the model carry no target value.
    masked_rows = []
    for row in dataset:
        masked = list(row)
        masked[-1] = None
        masked_rows.append(masked)
    predictions = simple_linear_regression(dataset, masked_rows)
    plot.figure()
    plot.plot(x_values, y_values, 'x')
    # Draw the line against the x values only; passing the masked rows
    # would feed two-column data containing None to the x axis.
    plot.plot(x_values, predictions)
    plot.xlabel('Fertility rate')
    plot.ylabel('Worker percent')
    plot.grid()
    plot.tight_layout()
    plot.show()
# Fix the random seed so the train/test split is reproducible.
seed(1)

# Load the data, discarding the first record of the file.
filename = 'fertility_rate-worker_percent.csv'
dataset = load_csv(filename, skip=True)

# Every column arrives as text; convert each one to float in place.
column_count = len(dataset[0])
for i in range(column_count):
    string_column_to_float(dataset, i)

# Train on 60% of the rows and report the error on the rest.
split = 0.6
rmse = evaluate_simple_linear_regression(dataset, split)
print('Root Mean Square Error: %.3f' % rmse)

# Show the data points together with the fitted line.
visualise_dataset(dataset)