"""Helpers for loading a CSV dataset and splitting it for model evaluation.

Provides a random hold-out split (``train_test_split``) and a shuffled
k-fold partition (``k_fold_cross_validation``). When run as a script, the
module demonstrates both on ``big_heart.csv``.
"""

import random
from csv import reader
from random import randrange
from random import seed

try:
    from tabulate import tabulate
except ImportError:
    # tabulate is only used for pretty-printing; the loading and splitting
    # helpers must stay importable without it.
    tabulate = None


def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        skip: When true, discard the first row (typically the header).

    Returns:
        A list of rows, each a list of strings. Empty rows are dropped.
    """
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # The default keeps an empty file from leaking StopIteration.
            next(csv_reader, None)
        return [row for row in csv_reader if row]


def print_the_dataset(dataset, contents=True, length=True):
    """Print the dataset as a table and/or print its number of rows.

    Args:
        dataset: Sequence of rows to display.
        contents: When true, print the rows (via ``tabulate`` if installed,
            otherwise as plain tab-separated lines).
        length: When true, print ``len(dataset)``.
    """
    if contents:
        if tabulate is not None:
            print(tabulate(dataset))
        else:
            for row in dataset:
                print("\t".join(map(str, row)))
    if length:
        print(len(dataset))


def train_test_split(dataset, split):
    """Randomly split a dataset into a training set and a test set.

    Rows are moved one at a time, at random positions, from a copy of the
    dataset into the training set; whatever is left becomes the test set.
    The input list itself is not modified.

    Args:
        dataset: List of rows to split.
        split: Fraction of rows (between 0 and 1) to put in the training set.
            The training size is ``int(split * len(dataset))``.

    Returns:
        A ``(train_set, test_set)`` tuple of lists.

    Raises:
        ValueError: If ``split`` is outside the range 0..1.
    """
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    train_size = int(split * len(dataset))
    remaining = dataset.copy()
    train_set = []
    while len(train_set) < train_size:
        # Move a randomly chosen row from the remaining pool to the
        # training set.
        train_set.append(remaining.pop(randrange(len(remaining))))
    return train_set, remaining


def k_fold_cross_validation(dataset, k):
    """Partition a shuffled copy of the dataset into k train/test folds.

    Every row appears in exactly one test set. When the dataset size is not
    divisible by ``k``, the first ``len(dataset) % k`` folds receive one
    extra test row so that no row is left out of testing. If ``k`` exceeds
    the number of rows, the surplus folds have empty test sets.

    Args:
        dataset: List of rows to partition. It is not modified.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` tuples ``(train_set, test_set)``, where ``train_set``
        holds all shuffled rows not in that fold's ``test_set``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    shuffled_dataset = dataset.copy()
    random.shuffle(shuffled_dataset)

    base_size, extra = divmod(len(shuffled_dataset), k)
    folds = []
    start = 0
    for fold_index in range(k):
        # The first `extra` folds absorb the remainder, one row each.
        end = start + base_size + (1 if fold_index < extra else 0)
        test_set = shuffled_dataset[start:end]
        train_set = shuffled_dataset[:start] + shuffled_dataset[end:]
        folds.append((train_set, test_set))
        start = end
    return folds


def main():
    """Demonstrate the hold-out split and 5-fold split on big_heart.csv."""
    # Fixed seed so the random splits are reproducible between runs.
    seed(1)

    filename = 'big_heart.csv'
    dataset = load_csv(filename, skip=True)
    print_the_dataset(dataset)

    training, test = train_test_split(dataset, 0.8)
    print(len(training))
    print(len(test))

    k = 5  # Number of folds for cross-validation
    folds = k_fold_cross_validation(dataset, k)

    # Print the size of each fold
    for i, fold in enumerate(folds):
        train_set, test_set = fold
        print(f"Fold {i+1}: Training set size: {len(train_set)}, Test set size: {len(test_set)}")


if __name__ == "__main__":
    main()