You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
150 lines
3.4 KiB
Python
150 lines
3.4 KiB
Python
from math import sqrt
|
|
from matplotlib import pyplot as plot
|
|
from random import seed
|
|
from random import randrange
|
|
from csv import reader
|
|
|
|
|
|
def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        skip: When true, the first record of the file is discarded
            before any rows are collected.

    Returns:
        A list with one entry per non-empty record; each entry is the
        list of raw string fields produced by ``csv.reader``.
    """
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # The None default keeps an empty file from raising
            # StopIteration; it simply yields an empty dataset.
            next(csv_reader, None)
        # csv.reader yields [] for blank lines; drop those.
        return [row for row in csv_reader if row]
|
|
|
|
|
|
def string_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: List of mutable rows whose ``column`` entry is a string.
        column: Index of the field to convert.

    Returns:
        None. Each row is modified directly.
    """
    for record in dataset:
        raw_text = record[column]
        # Surrounding whitespace is trimmed before parsing, and the
        # parsed number replaces the original string.
        record[column] = float(raw_text.strip())
|
|
|
|
def mean(values):
    """Return the arithmetic mean of ``values``.

    Args:
        values: Sized collection of numbers.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count
|
|
|
|
def regularisation(parameter, lambda_value=0.01):
    """Shrink a parameter by the fraction ``lambda_value``.

    Args:
        parameter: The numeric value to scale down.
        lambda_value: Fraction removed from the parameter.

    Returns:
        ``parameter`` multiplied by ``1 - lambda_value``.
    """
    shrink_factor = 1 - lambda_value
    return parameter * shrink_factor
|
|
|
|
|
|
def leastSquares(dataset):
    """Fit the line ``y = b0 + b1 * x`` by ordinary least squares.

    Args:
        dataset: Sequence of rows where ``row[0]`` is the x value and
            ``row[1]`` is the y value.

    Returns:
        A two-element list ``[b0, b1]`` holding the intercept and slope.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty, or if the squared
            deviations of x sum to zero (for instance a single row).
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]

    x_mean = mean(x)
    y_mean = mean(y)

    # b1 = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
    numerator = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    denominator = sum((xi - x_mean) ** 2 for xi in x)

    b1 = numerator / denominator
    # The fitted line passes through the point (x_mean, y_mean).
    b0 = y_mean - b1 * x_mean

    return [b0, b1]
|
|
|
|
def root_mean_square_error(actual, predicted):
    """Return the root mean square error between two sequences.

    Args:
        actual: Sequence of observed values.
        predicted: Sequence of predicted values, aligned with ``actual``.

    Returns:
        The square root of the mean of the squared differences.

    Raises:
        ValueError: If the two sequences differ in length.
        ZeroDivisionError: If both sequences are empty.
    """
    if len(actual) != len(predicted):
        # A silent truncation here would report an error figure computed
        # from only part of the data.
        raise ValueError(
            'actual and predicted must have the same length (got %d and %d)'
            % (len(actual), len(predicted))
        )

    sum_error = sum((p - a) ** 2 for a, p in zip(actual, predicted))
    return sqrt(sum_error / len(actual))
|
|
|
|
|
|
def simple_linear_regression(train, test):
    """Predict a y value for each test row with a line fitted on ``train``.

    Args:
        train: Rows passed to ``leastSquares`` to obtain the coefficients.
        test: Rows whose first element is the x value to predict for.

    Returns:
        A list of predictions, one per row of ``test``, in the same order.
    """
    intercept, slope = leastSquares(train)
    # yhat = b0 + b1 * x for every test row.
    return [intercept + slope * sample[0] for sample in test]
|
|
|
|
|
|
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into training and test rows.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        split: Fraction of the rows to place in the training set.

    Returns:
        A ``(train, test)`` tuple of lists. ``train`` holds
        ``int(split * len(dataset))`` randomly drawn rows and ``test``
        holds the remaining rows in their original relative order.
    """
    remaining = list(dataset)
    wanted = int(split * len(dataset))

    chosen = []
    for _ in range(wanted):
        # Draw without replacement: each pick is removed from the pool.
        pick = randrange(len(remaining))
        chosen.append(remaining.pop(pick))

    return chosen, remaining
|
|
|
|
|
|
def evaluate_simple_linear_regression(dataset, split=0.6):
    """Fit on a random training split and score on the held-out rows.

    Args:
        dataset: Rows whose first element is x and last element is y.
        split: Fraction of rows used for training. The default is 0.6;
            a value of 0 leaves the training set empty, which makes the
            fit fail with ZeroDivisionError.

    Returns:
        The root mean square error of the predictions on the test rows.
    """
    train, test = train_test_split(dataset, split)

    # Hand the model copies with the target blanked out so that it
    # cannot read the value it is supposed to predict.
    test_set = []
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)

    predicted = simple_linear_regression(train, test_set)
    actual = [row[-1] for row in test]

    return root_mean_square_error(actual, predicted)
|
|
|
|
|
|
def visualise_dataset(dataset):
    """Plot the data points together with the fitted regression line.

    Args:
        dataset: Rows where ``row[0]`` is the x value and ``row[1]`` is
            the y value.

    Returns:
        None. A matplotlib window is shown via ``plot.show()``.
    """
    x_values = [row[0] for row in dataset]
    y_values = [row[1] for row in dataset]

    # Fit on the full dataset and predict for the same rows;
    # simple_linear_regression only reads row[0] of each test row.
    fitted = simple_linear_regression(dataset, dataset)

    plot.figure()
    plot.plot(x_values, y_values, 'x')
    # The line must be drawn against the x values themselves, not
    # against whole rows.
    plot.plot(x_values, fitted)
    plot.xlabel('Fertility rate')
    plot.ylabel('Worker percent')
    plot.grid()
    plot.tight_layout()
    plot.show()
|
|
|
|
# Seed the random number generator so the train/test split is repeatable.
seed(1)

# Load the data file, discarding its first record.
filename = 'fertility_rate-worker_percent.csv'
dataset = load_csv(filename, skip=True)

# Fields are read as text; convert every column to float in place.
for i, _ in enumerate(dataset[0]):
    string_column_to_float(dataset, i)

# Train on 60% of the rows and report the error on the rest.
split = 0.6
rmse = evaluate_simple_linear_regression(dataset, split)
print('Root Mean Square Error: %.3f' % rmse)

visualise_dataset(dataset)