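# NOTE: The following lines are residue from the web page this file was copied
# from (repository UI text and file metadata); they are not part of the program.
# You cannot select more than 25 topics. Topics must start with a letter or
# number, can include dashes ('-') and can be up to 35 characters long.
#
# 97 lines
# 2.6 KiB
# Python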

from random import seed
from random import randrange
import random
from csv import reader
from tabulate import tabulate
def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to open.
        skip: When true, the first row of the file is discarded.

    Returns:
        A list of rows, each row being the list of strings produced by
        ``csv.reader``. Empty rows (for example blank lines) are left out.
    """
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # Supplying a default keeps an empty file from raising
            # StopIteration; there is simply nothing to skip.
            next(csv_reader, None)
        return [row for row in csv_reader if row]
def print_the_dataset(dataset, contents=True, length=True):
    """Print the dataset as a table and/or its number of rows.

    Args:
        dataset: Rows to display; handed to ``tabulate`` unchanged.
        contents: When true, print the rows formatted by ``tabulate``.
        length: When true, print ``len(dataset)``.
    """
    if contents:
        print(tabulate(dataset))
    if length:
        print(len(dataset))
def train_test_split(dataset, split):
    """Randomly partition a dataset into a training set and a test set.

    Rows are drawn one at a time, without replacement, using ``randrange``,
    so the result is reproducible after seeding the ``random`` module.

    Args:
        dataset: Sequence of rows to split. It is copied, not modified.
        split: Fraction of the rows that go to the training set, between
            0 and 1 inclusive. The training size is
            ``int(split * len(dataset))``, i.e. truncated toward zero.

    Returns:
        A ``(train_set, test_set)`` tuple of lists. ``train_set`` holds the
        drawn rows in draw order; ``test_set`` holds the remaining rows in
        their original relative order.

    Raises:
        ValueError: If ``split`` is not within [0, 1].
    """
    # Reject fractions that cannot describe a part of the dataset. Without
    # this, split > 1 would only fail later inside randrange on an emptied
    # list, and a negative split would silently yield an empty training set.
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    # Work on a copy so the caller's dataset is left untouched.
    remaining = list(dataset)
    train_size = int(split * len(remaining))

    train_set = []
    while len(train_set) < train_size:
        # Move one randomly chosen row from the remaining pool into the
        # training set; whatever is left at the end becomes the test set.
        index = randrange(len(remaining))
        train_set.append(remaining.pop(index))

    return train_set, remaining
def k_fold_cross_validation(dataset, k):
    """Split a dataset into ``k`` train/test folds for cross-validation.

    The rows are shuffled once (on a copy) with ``random.shuffle`` and then
    cut into ``k`` consecutive test sets. Each fold pairs one test set with
    all the other rows as its training set.

    When ``len(dataset)`` is not divisible by ``k``, the first
    ``len(dataset) % k`` folds get one extra test row, so every row appears
    in exactly one test set. If ``k`` exceeds the number of rows, the
    trailing folds have empty test sets.

    Args:
        dataset: Sequence of rows to split. It is copied, not modified.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` tuples ``(train_set, test_set)``, each a pair of
        lists of rows.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    # Shuffle a copy so the caller's dataset keeps its order.
    shuffled = list(dataset)
    random.shuffle(shuffled)

    # Every fold gets base_size test rows; the first `extra` folds get one
    # more so that the remainder is not dropped from evaluation.
    base_size, extra = divmod(len(shuffled), k)

    folds = []
    start = 0
    for fold_index in range(k):
        end = start + base_size + (1 if fold_index < extra else 0)
        # Rows inside [start, end) are held out; everything else trains.
        test_set = shuffled[start:end]
        train_set = shuffled[:start] + shuffled[end:]
        folds.append((train_set, test_set))
        start = end
    return folds
# Seed the random module so the splits below are reproducible between runs.
seed(1)

# Load the data (skipping the first row of the file) and display it.
filename = 'big_heart.csv'
dataset = load_csv(filename, skip=True)
print_the_dataset(dataset)

# Hold-out split: 80% of the rows for training, the remainder for testing.
training, test = train_test_split(dataset, 0.8)
print(len(training))
print(len(test))

k = 5  # Number of folds for cross-validation
folds = k_fold_cross_validation(dataset, k)

# Print the size of each fold
for i, fold in enumerate(folds):
    train_set, test_set = fold
    print(f"Fold {i+1}: Training set size: {len(train_set)}, Test set size: {len(test_set)}")