You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
2.6 KiB
Python
97 lines
2.6 KiB
Python
from random import seed
|
|
from random import randrange
|
|
import random
|
|
from csv import reader
|
|
from tabulate import tabulate
|
|
|
|
def load_csv(filename, skip = False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        skip: When true, the first row (typically a header) is discarded.

    Returns:
        A list with one entry per non-empty row, in file order. Each row is
        the list of string fields produced by ``csv.reader``.
    """
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded inside quoted fields.
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # The default of None keeps an empty file from raising
            # StopIteration when there is no header row to discard.
            next(csv_reader, None)
        # Blank lines come back from the reader as empty lists; drop them.
        return [row for row in csv_reader if row]
|
|
|
|
|
|
def print_the_dataset(dataset, contents=True, length=True):
    """Print a dataset as a table, its number of rows, or both.

    Args:
        dataset: The rows to display.
        contents: When true, print the rows formatted by ``tabulate``.
        length: When true, print ``len(dataset)`` on its own line.
    """
    if contents:
        table = tabulate(dataset)
        print(table)

    if length:
        row_count = len(dataset)
        print(row_count)
|
|
|
|
|
|
def train_test_split(dataset, split):
    """Randomly partition a dataset into a training set and a test set.

    Rows are drawn without replacement using ``randrange``, so the result is
    reproducible once the ``random`` module has been seeded.

    Args:
        dataset: The rows to split. The input itself is not modified.
        split: Fraction of the rows, between 0 and 1 inclusive, that goes to
            the training set. The training size is ``int(split * len(dataset))``,
            i.e. it is rounded down.

    Returns:
        A ``(train_set, test_set)`` tuple of lists. The training rows are in
        the order they were drawn; the test rows keep their original
        relative order.

    Raises:
        ValueError: If ``split`` is not within ``[0, 1]``.
    """
    # Reject out-of-range fractions up front: a value above 1 would otherwise
    # fail later with an opaque "empty range" error from randrange, and a
    # negative value would silently yield an empty training set.
    if not 0 <= split <= 1:
        raise ValueError(
            f"split must be between 0 and 1 inclusive, got {split!r}"
        )

    train_size = int(split * len(dataset))

    # Work on a shallow copy so the caller's dataset is left untouched; the
    # rows remaining in it at the end form the test set.
    remaining = list(dataset)
    train_set = []

    while len(train_set) < train_size:
        # Move one randomly chosen row from the remaining pool to the
        # training set.
        index = randrange(len(remaining))
        train_set.append(remaining.pop(index))

    return train_set, remaining
|
|
|
|
|
|
def k_fold_cross_validation(dataset, k):
    """Split a dataset into ``k`` train/test folds for cross-validation.

    A shallow copy of the dataset is shuffled with ``random.shuffle`` and cut
    into ``k`` consecutive test slices. Every row appears in exactly one test
    set; when ``len(dataset)`` is not divisible by ``k``, the first
    ``len(dataset) % k`` folds receive one extra row so that no row is left
    out of testing.

    Args:
        dataset: The rows to split. The input itself is not modified.
        k: Number of folds; must be at least 1. If ``k`` exceeds the number
            of rows, the trailing folds have empty test sets.

    Returns:
        A list of ``k`` tuples ``(train_set, test_set)``. For each fold,
        ``test_set`` is that fold's slice of the shuffled rows and
        ``train_set`` holds all the other rows.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    # Shuffle a copy so the caller's row order is preserved.
    shuffled_dataset = list(dataset)
    random.shuffle(shuffled_dataset)

    # Every fold gets base_size rows; the first `remainder` folds get one
    # more so the sizes differ by at most one and all rows are covered.
    base_size, remainder = divmod(len(shuffled_dataset), k)

    folds = []
    start = 0
    for i in range(k):
        end = start + base_size + (1 if i < remainder else 0)

        # The current slice is held out for testing; everything before and
        # after it is used for training.
        test_set = shuffled_dataset[start:end]
        train_set = shuffled_dataset[:start] + shuffled_dataset[end:]
        folds.append((train_set, test_set))

        start = end

    return folds
|
|
|
|
# Seed the random module's shared generator so the split and the folds below
# are repeatable from run to run.
seed(1)

filename = 'big_heart.csv'

# Load every row except the first line of the file, then display the rows
# followed by the row count.
dataset = load_csv(filename, skip = True)
print_the_dataset(dataset)

# Hold out 20% of the rows for testing and report both subset sizes.
training, test = train_test_split(dataset, 0.8)
for subset in (training, test):
    print(len(subset))

k = 5  # Number of folds for cross-validation
folds = k_fold_cross_validation(dataset, k)

# Print the size of each fold
for i, (train_set, test_set) in enumerate(folds):
    print(f"Fold {i+1}: Training set size: {len(train_set)}, Test set size: {len(test_set)}")