From 24d1d330fa6b9bf74dcbbc8ddd1977b60d0187bd Mon Sep 17 00:00:00 2001
From: ilennaj
Date: Tue, 1 Sep 2020 11:25:43 -0400
Subject: [PATCH] commented load_data.py

---
 custompackage/load_data.py | 142 ++++++++++++------
 1 file changed, 100 insertions(+), 42 deletions(-)
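The 'paired' branch of format_data_weighted keeps only the two target classes and binarizes their labels (Target_class_1 -> 1, Target_class_2 -> 0). A minimal runnable sketch of that relabeling, on toy tensors that stand in for a real torchvision dataset (the toy data and sizes are illustrative, not from the patch):

    import numpy as np
    import torch

    # Toy stand-ins (illustrative only, not the datasets used in the repo)
    labels = torch.tensor([0, 3, 1, 3, 0, 2])
    data = torch.arange(24, dtype=torch.float).view(6, 4)
    Target_class_1, Target_class_2 = 0, 3

    # Keep only the two target classes (1:1 pairing)
    selector_1 = np.where(labels.numpy() == Target_class_1)
    selector_2 = np.where(labels.numpy() == Target_class_2)
    data_pair = torch.cat((data[selector_1], data[selector_2]), 0)
    label_pair = torch.cat((labels[selector_1], labels[selector_2]), 0)

    # Binarize: Target_class_1 -> 1, everything else -> 0
    label_binary = torch.from_numpy(
        np.where(label_pair.numpy() == Target_class_1, 1, 0)).long()

    paired_set = torch.utils.data.TensorDataset(data_pair, label_binary, label_pair)
    print(label_pair.tolist(), label_binary.tolist())  # [0, 0, 3, 3] [1, 1, 0, 0]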
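The default ('1:9') branch oversamples the minority class with a WeightedRandomSampler whose per-sample weights are the inverse class counts. The same scheme on a toy imbalanced label tensor (the sizes here are assumed for illustration):

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

    labels = torch.tensor([1] + [0] * 9)   # toy 1:9 imbalance
    data = torch.randn(10, 8)

    classcount = np.bincount(labels).tolist()            # [9, 1]
    weights = 1. / torch.tensor(classcount, dtype=torch.float)
    sampleweights = weights[labels]        # each sample gets its class's weight

    sampler = WeightedRandomSampler(weights=sampleweights,
                                    num_samples=len(sampleweights))
    loader = DataLoader(TensorDataset(data, labels), batch_size=4, sampler=sampler)
    # Batches now draw both classes at roughly equal rates.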
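Both branches also slice the formatted TensorDataset into Split equal chunks, each wrapped in its own DataLoader. The slicing pattern in isolation, with hypothetical toy tensors and sizes:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Toy formatted dataset: (data, label_binary, label_default)
    full = TensorDataset(torch.randn(100, 8),
                         torch.randint(0, 2, (100,)),
                         torch.randint(0, 10, (100,)))
    Split, Batch_size = 5, 16

    loaders = []
    spl = int(len(full) / Split)             # size of each split
    for i in range(Split):
        chunk = full[i * spl:(i + 1) * spl]  # tuple of three sliced tensors
        loaders.append(DataLoader(TensorDataset(*chunk),
                                  batch_size=Batch_size, shuffle=True))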
diff --git a/custompackage/load_data.py b/custompackage/load_data.py
index 20b3f44..51f3ea7 100644
--- a/custompackage/load_data.py
+++ b/custompackage/load_data.py
@@ -9,60 +9,82 @@ def format_data_weighted(Data_set, Target_class_1, Target_class_2, Data_weightin
     '''
     Change labels so that the target class has value 1 and all other classes have value 0.
     Dataset will be a 3 member tuple: data, label_binary, label_default.
-    Inputs: Data_set, Batch_size, Target_class
+    Inputs: Data_set, Target_class_1, Target_class_2, Data_weighting, permute, padded
     Outputs: Data_set_formatted
     '''
+    # If Data_weighting == 'paired', only the two target classes are returned, with 1:1 weighting, relabeled 1 and 0
     if Data_weighting == 'paired':
+        # Load the entire dataset as a single batch (batch size equal to the dataset length)
         Loader = DataLoader(Data_set, batch_size=len(Data_set), shuffle=True)
         for _, (inputs, labels) in enumerate(Loader):
+            # data and label_default now contain the entire dataset
            data = inputs
            label_default = labels
        # Filter out all classes except Target classes
+        # Get indices for the filter
        selector_1 = np.where(label_default.numpy() == Target_class_1)
        selector_2 = np.where(label_default.numpy() == Target_class_2)
-
+
+        # Filter the labels and data (images)
        label_1 = label_default[selector_1]
        data_1 = data[selector_1]
        label_2 = label_default[selector_2]
        data_2 = data[selector_2]
+        # Combine the filtered data and labels from both classes
        label_pair = torch.cat((label_1, label_2), 0)
        data_pair = torch.cat((data_1, data_2), 0)
+        # Assign binary labels to each class
        label_binary = np.where(label_pair.numpy() == Target_class_1, 1, 0)
        label_binary = torch.from_numpy(label_binary).long()
-        ## permute with get_permutation function
+        ## permute with get_permutation function
        if permute:
            if data_pair.shape[1:] == torch.Size([1,28,28]):
-                # Pad data
+                # Pad data to make it 32x32
                padding_f = torch.nn.ZeroPad2d(2)
                data_pair = padding_f(data_pair)
+                # Make the 2d image a 1d array
                data_pair = data_pair.view(len(data_pair),-1)
+                # Get the permutation
                perm_idx = get_permutation(5)
+                # Permute the data
                data_pair = data_pair[:,perm_idx]
-            elif data_pair.shape[1:] == torch.Size([3,32,32]):
+            elif data_pair.shape[1:] == torch.Size([3,32,32]): # for CIFAR10 and SVHN datasets
+                # Make the 2d image a 1d array
                data_pair = data_pair.view(len(data_pair),-1)
+                # Get the permutation
                perm_idx = get_permutation(5)
                perm_idx = np.concatenate((perm_idx,perm_idx,perm_idx),0)
+                # Permute the data
                data_pair = data_pair[:,perm_idx]
            else:
+                # Make the 2d image a 1d array
                data_pair = data_pair.view(len(data_pair),-1)
+                # Get the permutation
                perm_idx = get_permutation(4)
+                # Permute the data
                data_pair = data_pair[:,perm_idx]
-        else:
+        else: # If padded is set, pad 28x28 images to 32x32
            if padded and data_pair.shape[1:] == torch.Size([1,28,28]):
                # Pad data
                padding_f = torch.nn.ZeroPad2d(2)
-                data_pair = padding_f(data_pair)
+                data_pair = padding_f(data_pair)
+            # Make the 2d image a 1d array
            data_pair = data_pair.view(len(data_pair),-1)
+        # Put the now-formatted data and labels into a dataset
        Data_set_formatted = torch.utils.data.TensorDataset(data_pair, label_binary, label_pair)
-    else:
+
+    else: # Keep all classes; label the target class 1 and all others 0
+        # Load the entire dataset as a single batch (batch size equal to the dataset length)
        Loader = DataLoader(Data_set, batch_size=len(Data_set), shuffle=True)
        for _, (inputs, labels) in enumerate(Loader):
+            # data and label_default now contain the entire dataset
           data = inputs
           label_default = labels
+        # Assign binary labels to each class
        label_binary = np.where(labels.numpy() == Target_class_1, 1, 0)
        label_binary = torch.from_numpy(label_binary).long()
        ## permute with get_permutation function
@@ -71,20 +93,31 @@ def format_data_weighted(Data_set, Target_class_1, Target_class_2, Data_weightin
            # Pad data
            padding_f = torch.nn.ZeroPad2d(2)
            data = padding_f(data)
+            # Make the 2d image a 1d array
            data = data.view(len(data),-1)
+            # Get the permutation
            perm_idx = get_permutation(5)
+            # Permute the data
            data = data[:,perm_idx]
        elif data.shape[1:] == torch.Size([3,32,32]):
+            # Make the 2d image a 1d array
            data = data.view(len(data),-1)
+            # Get the permutation
            perm_idx = get_permutation(5)
            perm_idx = np.concatenate((perm_idx,perm_idx,perm_idx),0)
+            # Permute the data
            data = data[:,perm_idx]
        else:
+            # Make the 2d image a 1d array
            data = data.view(len(data),-1)
+            # Get the permutation
            perm_idx = get_permutation(4)
+            # Permute the data
            data = data[:,perm_idx]
    else:
+        # Make the 2d image a 1d array
        data = data.view(len(data),-1)
+    # Put the now-formatted data and labels into a dataset
    Data_set_formatted = torch.utils.data.TensorDataset(data, label_binary, labels)
 
    return Data_set_formatted
@@ -95,8 +128,8 @@ def dataset_weighted_split_all(Batch_size=32, Target_class_1=0, Target_class_2=1
                               padded=True):
    '''
    Produces dataset that will be fed into a network model.
-    Inputs: Batch_size, Target_num, Data_weighting, Sequence_size
-    Outputs: Train_loaders, Valid_loaders, Test_set
+    Inputs: Batch_size, Target_class_1, Target_class_2, Data_weighting, Split, dataset, permute, padded
+    Outputs: list of Train_loaders, list of Valid_loaders, a single Test_loader
    '''
 
    transform = transforms.ToTensor()
@@ -147,10 +180,11 @@ def dataset_weighted_split_all(Batch_size=32, Target_class_1=0, Target_class_2=1
    Train_set = format_data_weighted(Train_set, Target_class_1, Target_class_2, Data_weighting=Data_weighting, permute=permute, padded=padded)
    Test_set = format_data_weighted(Test_set, Target_class_1, Target_class_2, Data_weighting=Data_weighting, permute=permute, padded=padded)
 
+    # Record the set lengths for dataset splitting purposes
    train_len = Train_set.tensors[0].size()[0]
    test_len = Test_set.tensors[0].size()[0]
 
-    # Make validset from training data
+    # Make validset from training data such that it is equal in size to the test set
    Train_set, Valid_set = torch.utils.data.dataset.random_split(Train_set, (train_len-test_len, test_len))
 
    # Since random_split returns a Subset and the original dataset is normally accessed through it.
@@ -163,40 +197,54 @@ def dataset_weighted_split_all(Batch_size=32, Target_class_1=0, Target_class_2=1
        # Split Training set and Valid set into multiple dataloaders and return array of dataloaders
        Train_loader_split, Valid_loader_split = [],[]
        for i in range(Split):
+            # Get the size of each split dataset
            spl = int(len(Train_set)/Split)
+            # Split the dataset
            Train_set_split = torch.utils.data.TensorDataset(Train_set[i*spl:(i+1)*spl][0],
                                                             Train_set[i*spl:(i+1)*spl][1],
                                                             Train_set[i*spl:(i+1)*spl][2])
+            # Make a dataloader from the split section of the original dataset
            Train_loader = DataLoader(Train_set_split, batch_size=Batch_size, shuffle=True)
+            # Add the split dataloader to the list of dataloaders
            Train_loader_split.append(Train_loader)
 
+            # Get the size of each split dataset
            spl = int(len(Valid_set)/Split)
+            # Split the dataset
            Valid_set_split = torch.utils.data.TensorDataset(Valid_set[i*spl:(i+1)*spl][0],
                                                             Valid_set[i*spl:(i+1)*spl][1],
                                                             Valid_set[i*spl:(i+1)*spl][2])
+            # Make a dataloader from the split section of the original dataset
            Valid_loader = DataLoader(Valid_set_split, batch_size=Batch_size, shuffle=True)
+            # Add the split dataloader to the list of dataloaders
            Valid_loader_split.append(Valid_loader)
-
+
+        # Make a dataloader from the original test set
        Test_loader = DataLoader(Test_set, batch_size=Batch_size, shuffle=False)
 
-    else: #Default is 1:9 oversampled weighting
-
+    else: # Default is 1:9 oversampled weighting; oversample the target class
+
+        # Count how many samples belong to each class
        trainratio = np.bincount(Train_set.tensors[1])
        validratio = np.bincount(Valid_set.tensors[1])
        testratio = np.bincount(Test_set.tensors[1])
 
+        # Convert the class counts to lists
        train_classcount = trainratio.tolist()
        valid_classcount = validratio.tolist()
        test_classcount = testratio.tolist()
 
+        # Use the counts to calculate the inverse-frequency weight of each class
        train_weights = 1./torch.tensor(train_classcount, dtype=torch.float)
        valid_weights = 1./torch.tensor(valid_classcount, dtype=torch.float)
        test_weights = 1./torch.tensor(test_classcount, dtype=torch.float)
 
+        # Assign each sample the weight of its class
        train_sampleweights = train_weights[Train_set.tensors[1]]
        valid_sampleweights = valid_weights[Valid_set.tensors[1]]
        test_sampleweights = test_weights[Test_set.tensors[1]]
 
+        # Make samplers for the data loaders in order to oversample the target class
        train_sampler = torch.utils.data.sampler.WeightedRandomSampler(weights=train_sampleweights,
                                                                       num_samples=len(train_sampleweights))
        valid_sampler = torch.utils.data.sampler.WeightedRandomSampler(weights=valid_sampleweights,
@@ -204,55 +252,65 @@ def dataset_weighted_split_all(Batch_size=32, Target_class_1=0, Target_class_2=1
        test_sampler = torch.utils.data.sampler.WeightedRandomSampler(weights=test_sampleweights,
                                                                      num_samples=len(test_sampleweights))
 
+        # Split Training set and Valid set into multiple dataloaders and return array of dataloaders
        Train_loader_split, Valid_loader_split = [],[]
        for i in range(Split):
-
+            # Get the size of each split dataset
            spl = int(len(Train_set)/Split)
+            # Split the dataset
            Train_set_split = torch.utils.data.TensorDataset(Train_set[i*spl:(i+1)*spl][0],
                                                             Train_set[i*spl:(i+1)*spl][1],
                                                             Train_set[i*spl:(i+1)*spl][2])
+            # Make a dataloader from the split section of the original dataset
            Train_loader = DataLoader(Train_set_split, batch_size=Batch_size, shuffle=True)
+            # Add the split dataloader to the list of dataloaders
            Train_loader_split.append(Train_loader)
-
+
+            # Get the size of each split dataset
            spl = int(len(Valid_set)/Split)
+            # Split the dataset
            Valid_set_split = torch.utils.data.TensorDataset(Valid_set[i*spl:(i+1)*spl][0],
                                                             Valid_set[i*spl:(i+1)*spl][1],
                                                             Valid_set[i*spl:(i+1)*spl][2])
+            # Make a dataloader from the split section of the original dataset
            Valid_loader = DataLoader(Valid_set_split, batch_size=Batch_size, shuffle=True)
+            # Add the split dataloader to the list of dataloaders
            Valid_loader_split.append(Valid_loader)
-
+        # Make a dataloader from the original test set
        Test_loader = DataLoader(Test_set, batch_size=Batch_size, sampler=test_sampler)
 
    return Train_loader_split, Valid_loader_split, Test_loader
 
 
-# Assumes that the matrix is of size 2^n x 2^n for some n
-#
-#
-# EXAMPLE for n=4
-#
-# Old order:
-#
-# 1  2  3  4
-# 5  6  7  8
-# 9  10 11 12
-# 13 14 15 16
-#
-# New order:
-#
-# 1  2  5  6
-# 3  4  7  8
-# 9  10 13 14
-# 11 12 15 16
-#
-# Function returns numbers from old order, read in the order of the new numbers:
-#
-# [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]
-#
-# So if you previously had a data vector v from a matrix size 32 x 32,
-# you can now use v[get_permutation(5)] to reorder the elements.
+
 def get_matrix(n):
+    '''
+    Assumes that the matrix is of size 2^n x 2^n for some n
+
+    EXAMPLE for n=4
+
+    Old order:
+
+    1  2  3  4
+    5  6  7  8
+    9  10 11 12
+    13 14 15 16
+
+    New order:
+
+    1  2  5  6
+    3  4  7  8
+    9  10 13 14
+    11 12 15 16
+
+    Function returns numbers from old order, read in the order of the new numbers:
+
+    [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]
+
+    So if you previously had a data vector v from a matrix size 32 x 32,
+    you can now use v[get_permutation(5)] to reorder the elements.
+    '''
    if n == 0:
        return np.array([[1]])
    else:
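The hunk above ends mid-recursion in get_matrix. For reference, one recursion that is consistent with the docstring's n=4 example is sketched below (a quadtree-style block numbering, with n as the exponent, matching get_permutation(5) for 32 x 32); this is an illustration, not necessarily the function's exact body:

    import numpy as np

    def get_matrix_sketch(n):
        # Number the four quadrants block by block (TL, TR, BL, BR),
        # recursing so that nested 2x2 blocks stay contiguous
        if n == 0:
            return np.array([[1]])
        q = get_matrix_sketch(n - 1)
        s = q.size
        return np.vstack((np.hstack((q, q + s)),
                          np.hstack((q + 2 * s, q + 3 * s))))

    def get_permutation_sketch(n):
        # Indices that gather recursively nested blocks contiguously
        return get_matrix_sketch(n).flatten() - 1

    v = np.arange(1, 17)                # flattened 4x4 "old order" matrix
    print(v[get_permutation_sketch(2)].tolist())
    # [1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]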