# AIAss3/Q1.py

from random import seed
from random import randrange
import random
from csv import reader
from tabulate import tabulate

def load_csv(filename, skip=False):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.
        skip: When true, discard the first record of the file (typically
            the header row) before collecting rows.

    Returns:
        A list of rows, each a list of strings as produced by
        ``csv.reader``. Empty records (for example blank lines) are omitted.
    """
    with open(filename, newline='') as file:
        csv_reader = reader(file)
        if skip:
            # Supplying a default keeps an empty file from raising
            # StopIteration; there is simply nothing to skip.
            next(csv_reader, None)
        return [row for row in csv_reader if row]


def print_the_dataset(dataset, contents=True, length=True):
    """Print a dataset as a table and/or print its number of rows.

    Args:
        dataset: The rows to display; passed to ``tabulate`` and ``len``.
        contents: When true, print the rows formatted by ``tabulate``.
        length: When true, print ``len(dataset)``.
    """
    if contents:
        table = tabulate(dataset)
        print(table)

    if length:
        row_count = len(dataset)
        print(row_count)


def train_test_split(dataset, split):
    """Randomly partition a dataset into a training set and a test set.

    Args:
        dataset: List of rows to split; it is copied, not modified.
        split: Fraction of the rows to place in the training set. The
            training size is ``int(split * len(dataset))``.

    Returns:
        A ``(train_set, test_set)`` tuple. The training rows appear in the
        order they were drawn; the test rows are whatever was not drawn,
        in their original relative order.
    """
    # Number of rows that must end up in the training set
    target_size = int(split * len(dataset))

    # Work on a copy so the caller's list is left untouched
    remaining = dataset.copy()

    # Draw rows one at a time, moving each from the pool to the training set
    selected = []
    for _ in range(target_size):
        pick = randrange(len(remaining))
        selected.append(remaining.pop(pick))

    # Whatever is left in the pool forms the test set
    return selected, remaining


def k_fold_cross_validation(dataset, k):
    """Build ``k`` train/test splits of a shuffled copy of the dataset.

    The rows are shuffled once and cut into ``k`` consecutive test folds.
    When ``len(dataset)`` is not a multiple of ``k``, the first
    ``len(dataset) % k`` folds receive one extra row, so every row lands in
    exactly one test fold instead of the leftover rows never being tested.

    Args:
        dataset: List of rows to split; it is copied, not modified.
        k: Number of folds; must be a positive integer. If ``k`` exceeds the
            number of rows, the surplus folds have an empty test set.

    Returns:
        A list of ``k`` ``(train_set, test_set)`` tuples, where each
        ``train_set`` holds all rows not in the matching ``test_set``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Shuffle a copy so the caller's list keeps its order
    shuffled_dataset = dataset.copy()
    random.shuffle(shuffled_dataset)

    # Every fold gets base_size rows; the first `extra` folds get one more
    base_size, extra = divmod(len(shuffled_dataset), k)

    folds = []
    start = 0
    for i in range(k):
        end = start + base_size + (1 if i < extra else 0)

        # Rows inside [start, end) are held out; everything else trains
        test_set = shuffled_dataset[start:end]
        train_set = shuffled_dataset[:start] + shuffled_dataset[end:]

        folds.append((train_set, test_set))
        start = end

    return folds

# Seed the shared random generator so the split and the shuffle are repeatable
seed(1)

filename = 'big_heart.csv'

# Load the rows (skipping the first record) and display them
dataset = load_csv(filename, skip=True)
print_the_dataset(dataset)

# Hold-out split: 80% of the rows for training, the rest for testing
training, test = train_test_split(dataset, 0.8)

# Report the two subset sizes, one per line
print(len(training), len(test), sep="\n")

k = 5  # Number of folds for cross-validation
folds = k_fold_cross_validation(dataset, k)

# Print the size of each fold
for i, fold in enumerate(folds):
    train_set, test_set = fold
    print(f"Fold {i+1}: Training set size: {len(train_set)}, Test set size: {len(test_set)}")