diff --git a/stargan/classifiers.py b/stargan/classifiers.py index 717ea5b..f1cf8dc 100644 --- a/stargan/classifiers.py +++ b/stargan/classifiers.py @@ -1,4 +1,4 @@ -''' +""" classifiers.py Author - Max Elliott @@ -7,7 +7,7 @@ auxiliary classifier for categorical emotion recognition. Dimension_Classifier is the auxiliary classifier for dimensional emotion recognition (didn't end up being used in this thesis). -''' +""" import torch import torch.nn as nn @@ -17,12 +17,12 @@ class Emotion_Classifier(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, num_classes, bi = False, - device = 'cuda'): - ''' + def __init__(self, input_size, hidden_size, num_layers, num_classes, bi=False, + device='cuda'): + """ NOTE: input size must be directly divisible by 4 Is also used for speaker classifier - ''' + """ super(Emotion_Classifier, self).__init__() self.hidden_size = hidden_size self.input_size = input_size # == n_mels/world feats @@ -35,21 +35,21 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, bi = False, kernel = 7 padding = int((kernel-1)/2) - self.conv1 = nn.Conv2d(1, 16, kernel, padding = padding) - self.maxpool1 = nn.MaxPool2d(2, stride = 2) - self.conv2 = nn.Conv2d(16, 24, kernel, padding = padding) - self.maxpool2 = nn.MaxPool2d(2, stride = 2) - self.conv3 = nn.Conv2d(24, self.num_outchannels, kernel, padding = padding) - self.maxpool3 = nn.MaxPool2d(2, stride = 2) - - self.lstm1 = nn.LSTM(input_size = self.num_outchannels*(self.input_size//8), - hidden_size = self.hidden_size, num_layers = self.num_layers, - batch_first = True, bidirectional = bi) + self.conv1 = nn.Conv2d(1, 16, kernel, padding=padding) + self.maxpool1 = nn.MaxPool2d(2, stride=2) + self.conv2 = nn.Conv2d(16, 24, kernel, padding=padding) + self.maxpool2 = nn.MaxPool2d(2, stride=2) + self.conv3 = nn.Conv2d(24, self.num_outchannels, kernel, padding=padding) + self.maxpool3 = nn.MaxPool2d(2, stride=2) + + self.lstm1 = nn.LSTM(input_size=self.num_outchannels*(self.input_size//8), + hidden_size=self.hidden_size, num_layers=self.num_layers, + batch_first=True, bidirectional=bi) self.att = Average_Weighted_Attention(self.hidden_size*self.m_factor) self.fc = nn.Linear(self.m_factor*hidden_size, 64) - self.drop = nn.Dropout(p = 0.2) - self.out = nn.Linear(64,self.num_classes) + self.drop = nn.Dropout(p=0.2) + self.out = nn.Linear(64, self.num_classes) def forward(self, x_data, x_lens): """ @@ -63,7 +63,6 @@ def forward(self, x_data, x_lens): curr_device = x_data.device # Convolutional layers - # x_data = x.unsqueeze(1) x_data = self.maxpool1(F.relu(self.conv1(x_data))) x_data = self.maxpool2(F.relu(self.conv2(x_data))) x_data = self.maxpool3(F.relu(self.conv3(x_data))) @@ -75,7 +74,6 @@ def forward(self, x_data, x_lens): x_data = x_data.contiguous().view(batch_size, -1, self.num_outchannels*(no_features//8)) # Now x = (B, max_l//8, channels*(n_mels//8)) - x_data = nn.utils.rnn.pack_padded_sequence(x_data, x_lens, batch_first=True, enforce_sorted=True) @@ -86,8 +84,8 @@ def forward(self, x_data, x_lens): c0 = torch.zeros(self.m_factor*self.num_layers, batch_size, self.hidden_size).to(device=curr_device, dtype=torch.float) - #LSTM returns: (seq_len, batch, num_directions * hidden_size), - # ((num_layers * num_directions, batch, hidden_size), c_n) + # LSTM returns: (seq_len, batch, num_directions * hidden_size), + # ((num_layers * num_directions, batch, hidden_size), c_n) x_data, _ = self.lstm1(x_data, (h0, c0)) x_data, x_lens = torch.nn.utils.rnn.pad_packed_sequence(x_data, batch_first=True) @@ 
-110,36 +108,36 @@ class Dimension_Classifier(nn.Module): Uses three conv2d->maxpooling layers, into two separate sequential modelling networks for prediction of valence and arousal. """ - def __init__(self, input_size, hidden_size, num_layers, bi = False, device = 'cuda'): - ''' + def __init__(self, input_size, hidden_size, num_layers, bi=False, device='cuda'): + """ NOTE: input size must be directly divisible by 4 - ''' + """ super(Dimension_Classifier, self).__init__() self.hidden_size = hidden_size - self.input_size = input_size # == n_mels + self.input_size = input_size # == n_mels self.num_layers = num_layers self.num_outchannels = 32 self.m_factor = 2 if bi else 1 kernel = 7 padding = int((kernel-1)/2) - self.conv1 = nn.Conv2d(1, 16, kernel, padding = padding) - self.maxpool1 = nn.MaxPool2d(2, stride = 2) - self.conv2 = nn.Conv2d(16, 24, kernel, padding = padding) - self.maxpool2 = nn.MaxPool2d(2, stride = 2) - self.conv3 = nn.Conv2d(24, self.num_outchannels, kernel, padding = padding) - self.maxpool3 = nn.MaxPool2d(2, stride = 2) + self.conv1 = nn.Conv2d(1, 16, kernel, padding=padding) + self.maxpool1 = nn.MaxPool2d(2, stride=2) + self.conv2 = nn.Conv2d(16, 24, kernel, padding=padding) + self.maxpool2 = nn.MaxPool2d(2, stride=2) + self.conv3 = nn.Conv2d(24, self.num_outchannels, kernel, padding=padding) + self.maxpool3 = nn.MaxPool2d(2, stride=2) self.valence_predictor = Single_Dimension_Classifier( - input_size = self.input_size*(self.num_outchannels//8), - hidden_size = self.hidden_size, - num_layers = self.num_layers, - bi = bi, device = device) + input_size=self.input_size*(self.num_outchannels//8), + hidden_size=self.hidden_size, + num_layers=self.num_layers, + bi=bi, device=device) self.arousal_predictor = Single_Dimension_Classifier( - input_size = self.input_size*(self.num_outchannels//8), - hidden_size = self.hidden_size, - num_layers = self.num_layers, - bi = bi, device = device) + input_size=self.input_size*(self.num_outchannels//8), + hidden_size=self.hidden_size, + num_layers=self.num_layers, + bi=bi, device=device) # self.dominance_predictor = Single_Dimension_Classifier( # input_size = (self.input_size*self.num_outchannels)//8, # hidden_size = self.hidden_size, @@ -147,58 +145,56 @@ def __init__(self, input_size, hidden_size, num_layers, bi = False, device = 'cu # bi = bi, device = device) def forward(self, x_data, x_lens): - ''' + """ x[0] is size (batch_size, max_seq_length, feature_dim) x[1] is size (batch_size, 1), contains seq_lens batch is in descending seq_len order - ''' + """ batch_size = x_data.size(0) no_features = x_data.size(2) - #Convolutional layers - # x_data = x_data.unsqueeze(1) + # Convolutional layers x_data = self.maxpool1(F.relu(self.conv1(x_data))) x_data = self.maxpool2(F.relu(self.conv2(x_data))) x_data = self.maxpool3(F.relu(self.conv3(x_data))) x_lens = x_lens//8 # seq_len have got ~4 times shorted # x = (B, channels, max_l//4, n_mels//4) - #Recurrent layers + # Recurrent layers x_data = x_data.permute(0,2,1,3) x_data = x_data.contiguous().view(batch_size, -1, self.num_outchannels*no_features//8) - #Now x = (B, max_l//4, channels*n_mels//4) - + # Now x = (B, max_l//4, channels*n_mels//4) x_data = nn.utils.rnn.pack_padded_sequence(x_data, x_lens, batch_first=True, enforce_sorted=True) - #PASS INTO 3 single_dim_predictors + # Pass into 3 single_dim_predictors x_val = self.valence_predictor(x_data, batch_size) x_aro = self.arousal_predictor(x_data, batch_size) # x_dom = self.dominance_predictor(x_data, batch_size) - return x_val, x_aro#, x_dom 
+ return x_val, x_aro #, x_dom class Single_Dimension_Classifier(nn.Module): - def __init__(self, input_size, hidden_size, num_layers, bi = False, device = 'cuda'): + def __init__(self, input_size, hidden_size, num_layers, bi=False, device='cuda'): super(Single_Dimension_Classifier, self).__init__() self.hidden_size = hidden_size - self.input_size = input_size # == n_mels + self.input_size = input_size # == n_mels self.num_layers = num_layers self.num_outchannels = 32 self.m_factor = 2 if bi else 1 - self.device = device + self.device = device # Now legacy as it isn't used - self.lstm1 = nn.LSTM(input_size = self.input_size, - hidden_size = self.hidden_size, num_layers = self.num_layers, - batch_first = True, bidirectional = bi) + self.lstm1 = nn.LSTM(input_size=self.input_size, + hidden_size=self.hidden_size, num_layers=self.num_layers, + batch_first=True, bidirectional=bi) self.att = Average_Weighted_Attention(self.hidden_size*self.m_factor) self.fc1 = nn.Linear(self.hidden_size*self.m_factor, (self.hidden_size*self.m_factor)//2) @@ -206,17 +202,17 @@ def __init__(self, input_size, hidden_size, num_layers, bi = False, device = 'cu def forward(self, x, batch_size): + curr_device = x.device h0 = torch.zeros(self.m_factor*self.num_layers, batch_size, - self.hidden_size)#.to(device = self.device, dtype=torch.float) + self.hidden_size).to(device=curr_device, dtype=torch.float) c0 = torch.zeros(self.m_factor*self.num_layers, batch_size, - self.hidden_size)#.to(device = self.device, dtype=torch.float) + self.hidden_size).to(device=curr_device, dtype=torch.float) - #LSTM returns: (seq_len, batch, num_directions * hidden_size), - # ((num_layers * num_directions, batch, hidden_size), c_n) - x_data,_ = self.lstm1(x, (h0,c0)) -# x_data,_ = self.gru1(x_data, h0) + # LSTM returns: (seq_len, batch, num_directions * hidden_size), + # ((num_layers * num_directions, batch, hidden_size), c_n) + x_data, _ = self.lstm1(x, (h0, c0)) x_data, x_lens = torch.nn.utils.rnn.pad_packed_sequence(x_data, batch_first=True) @@ -229,4 +225,4 @@ def forward(self, x, batch_size): if __name__ == '__main__': - pass \ No newline at end of file + pass diff --git a/stargan/logger.py b/stargan/logger.py index 95b02bc..575f70c 100644 --- a/stargan/logger.py +++ b/stargan/logger.py @@ -1,11 +1,10 @@ -''' +""" logger.py Altered version of Logger.py by hujinsen. Original source can be found at: https://github.com/hujinsen/pytorch-StarGAN-VC -''' +""" import tensorflow as tf - import os @@ -26,4 +25,3 @@ def scalar_summary(self, tag, value, step): """Add scalar summary.""" summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) self.writer.add_summary(summary, step) - # print("scalar_summary called.") diff --git a/stargan/model.py b/stargan/model.py index 52c348f..7eab10c 100644 --- a/stargan/model.py +++ b/stargan/model.py @@ -1,4 +1,4 @@ -''' +""" model.py Author - Max Elliott @@ -9,24 +9,21 @@ Note: Generator_World is the generator used for all experiments and the one described in the report. Generator_Mel was only an alternative that never got used. -''' - -import torch -import torch.nn as nn +""" import os from stargan.classifiers import * -from stargan.average_weighted_attention import Average_Weighted_Attention import stargan.unet.unet_model as unet_model + class StarGAN_emo_VC1(object): - '''
- ''' + """ def __init__(self, config, name): - ''' + """ Need config for input_size, hidden_size, num_layers, num_classes, bi = False - ''' + """ super(StarGAN_emo_VC1, self).__init__() self.config = config self.save_dir = config['logs']['model_save_dir'] @@ -36,7 +33,6 @@ def __init__(self, config, name): self.build_model() - def set_train_mode(self): self.G.train() self.D.train() @@ -57,28 +53,28 @@ def set_eval_mode(self): if self.use_dimension: self.dimension_cls.eval() - def to_device(self, device = torch.device('cuda')): + def to_device(self, device=torch.device('cuda')): if torch.cuda.is_available(): - self.G.to(device = device) - self.D.to(device = device) - self.emo_cls.to(device = device) + self.G.to(device=device) + self.D.to(device=device) + self.emo_cls.to(device=device) self.emo_cls.device = device if self.use_speaker: - self.speaker_cls.to(device = device) + self.speaker_cls.to(device=device) if self.use_dimension: - self.dimension_cls.to(device = device) + self.dimension_cls.to(device=device) else: print("Device not available") - self.G.to(device = torch.device('cpu')) - self.D.to(device = torch.device('cpu')) - self.emo_cls.to(device = torch.device('cpu')) + self.G.to(device=torch.device('cpu')) + self.D.to(device=torch.device('cpu')) + self.emo_cls.to(device=torch.device('cpu')) self.emo_cls.device = device if self.use_speaker: - self.speaker_cls.to(device = torch.device('cpu')) + self.speaker_cls.to(device=torch.device('cpu')) if self.use_dimension: - self.dimension_cls.to(device = torch.device('cpu')) + self.dimension_cls.to(device=torch.device('cpu')) def build_model(self): @@ -97,32 +93,32 @@ def build_model(self): self.G = nn.DataParallel(Generator_World(self.num_emotions)) self.D = nn.DataParallel(Discriminator(self.num_emotions)) self.emo_cls = nn.DataParallel(Emotion_Classifier(self.num_input_feats, - self.hidden_size, - self.num_layers, - self.num_emotions, - bi = self.bi)) + self.hidden_size, + self.num_layers, + self.num_emotions, + bi=self.bi)) self.speaker_cls = nn.DataParallel(Emotion_Classifier(self.num_input_feats, - self.hidden_size, - self.num_layers, - self.num_speakers, - bi = self.bi)) + self.hidden_size, + self.num_layers, + self.num_speakers, + bi=self.bi)) self.dimension_cls = nn.DataParallel(Dimension_Classifier(self.num_input_feats, - self.hidden_size, - self.num_layers, - bi = self.bi)) + self.hidden_size, + self.num_layers, + bi=self.bi)) print("Building optimizers") con_opt = self.config['optimizer'] self.g_optimizer = torch.optim.Adam(self.G.parameters(), con_opt['g_lr'], [con_opt['beta1'], con_opt['beta2']]) self.d_optimizer = torch.optim.Adam(self.D.parameters(), con_opt['d_lr'], [con_opt['beta1'], con_opt['beta2']]) - self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'],[con_opt['beta1'], con_opt['beta2']],weight_decay = 0.000001) + self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'], [con_opt['beta1'], con_opt['beta2']],weight_decay=0.000001) if self.use_speaker: - self.speaker_cls_optimizer = torch.optim.Adam(self.speaker_cls.parameters(), con_opt['speaker_cls_lr'],[con_opt['beta1'], con_opt['beta2']]) + self.speaker_cls_optimizer = torch.optim.Adam(self.speaker_cls.parameters(), con_opt['speaker_cls_lr'], [con_opt['beta1'], con_opt['beta2']]) if self.use_dimension: - self.dimension_cls_optimizer = torch.optim.Adam(self.dimension_cls.parameters(), con_opt['dim_cls_lr'],[con_opt['beta1'], con_opt['beta2']]) + self.dimension_cls_optimizer = 
torch.optim.Adam(self.dimension_cls.parameters(), con_opt['dim_cls_lr'], [con_opt['beta1'], con_opt['beta2']]) if self.config['verbose']: print("Network parameter list:") @@ -163,16 +159,16 @@ def reset_grad(self): if self.use_dimension: self.dimension_cls_optimizer.zero_grad() - def save(self, save_dir = None, iter = 0): + def save(self, save_dir=None, it=0): - if save_dir == None: + if save_dir is None: save_dir = self.save_dir path = os.path.join(save_dir, self.name) if not os.path.exists(path): os.makedirs(path) - self.config['loss']['resume_iters'] = iter + self.config['loss']['resume_iters'] = it state = {'D': self.D.state_dict(), 'G': self.G.state_dict(), @@ -181,7 +177,7 @@ def save(self, save_dir = None, iter = 0): 'g_opt': self.g_optimizer.state_dict(), 'emo_opt': self.emo_cls_optimizer.state_dict(), 'config': self.config - } + } if self.use_speaker: state['spk'] = self.speaker_cls.state_dict() @@ -190,18 +186,16 @@ def save(self, save_dir = None, iter = 0): state['dim'] = self.dimension_cls.state_dict() state['dim_opt'] = self.dimension_cls_cls_optimizer.state_dict() - path = os.path.join(path, "{:06}.ckpt".format(iter)) + path = os.path.join(path, "{:06}.ckpt".format(it)) torch.save(state, path) - # torch.save(self.G.state_dict(), G_path) - # torch.save(self.emo_cls.state_dict(), emo_path) print("Model saved as {}.".format(path)) - def load(self, load_dir, map_location = None): - ''' + def load(self, load_dir): + """ load_dir: full directory of checkpoint to load - ''' + """ # if load_dir[-1] == '/': # load_dir = load_dir[0:-1] # @@ -238,7 +232,7 @@ def load(self, load_dir, map_location = None): con_opt = self.config['optimizer'] self.g_optimizer = torch.optim.Adam(self.G.parameters(), con_opt['g_lr'], [con_opt['beta1'], con_opt['beta2']]) self.d_optimizer = torch.optim.Adam(self.D.parameters(), con_opt['d_lr'], [con_opt['beta1'], con_opt['beta2']]) - self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'],[con_opt['beta1'], con_opt['beta2']],weight_decay = 0.000001) + self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'], [con_opt['beta1'], con_opt['beta2']], weight_decay=0.000001) if 'spk' in dictionary: self.speaker_cls.load_state_dict(dictionary['spk']) @@ -255,7 +249,7 @@ def load(self, load_dir, map_location = None): print("Model and optimizers loaded.") - def load_pretrained_classifier(self, load_dir, map_location = None): + def load_pretrained_classifier(self, load_dir, map_location=None): if map_location is not None: dictionary = torch.load(load_dir, map_location=map_location) @@ -264,11 +258,12 @@ def load_pretrained_classifier(self, load_dir, map_location = None): con_opt = self.config['optimizer'] self.emo_cls.load_state_dict(dictionary['model_state_dict']) - self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'],[con_opt['beta1'], con_opt['beta2']],weight_decay = 0.000001) + self.emo_cls_optimizer = torch.optim.Adam(self.emo_cls.parameters(), con_opt['emo_cls_lr'], [con_opt['beta1'], con_opt['beta2']],weight_decay = 0.000001) + class Down2d(nn.Module): """docstring for Down2d.""" - def __init__(self, in_channel ,out_channel, kernel, stride, padding): + def __init__(self, in_channel, out_channel, kernel, stride, padding): super(Down2d, self).__init__() self.c1 = nn.Conv2d(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding) @@ -283,14 +278,14 @@ def forward(self, x): x2 = self.c2(x) x2 = self.n2(x2) - x3 = x1 * torch.sigmoid(x2) + x3 = 
x1 * torch.sigmoid(x2) return x3 class Up2d(nn.Module): """docstring for Up2d.""" - def __init__(self, in_channel ,out_channel, kernel, stride, padding): + def __init__(self, in_channel, out_channel, kernel, stride, padding): super(Up2d, self).__init__() self.c1 = nn.ConvTranspose2d(in_channel, out_channel, kernel_size=kernel, stride=stride, padding=padding) self.n1 = nn.InstanceNorm2d(out_channel) @@ -304,7 +299,7 @@ def forward(self, x): x2 = self.c2(x) x2 = self.n2(x2) - x3 = x1 * torch.sigmoid(x2) + x3 = x1 * torch.sigmoid(x2) return x3 @@ -316,61 +311,50 @@ def __init__(self, num_classes=4): self.num_classes = num_classes - self.down1 = Down2d(1, 32, (9,3), (1,1), (4,1)) - self.down2 = Down2d(32, 64, (8,4), (2,2), (3,1)) - self.down3 = Down2d(64, 128, (8,4), (2,2), (3,1)) - self.down4 = Down2d(128, 64, (5,3), (1,1), (2,1)) - self.down5 = Down2d(64, 5, (5,9), (1,9), (2,0)) + self.down1 = Down2d(1, 32, (9, 3), (1, 1), (4, 1)) + self.down2 = Down2d(32, 64, (8, 4), (2, 2), (3, 1)) + self.down3 = Down2d(64, 128, (8, 4), (2, 2), (3, 1)) + self.down4 = Down2d(128, 64, (5, 3), (1, 1), (2, 1)) + self.down5 = Down2d(64, 5, (5, 9), (1, 9), (2, 0)) + self.up1 = Up2d(5 + num_classes, 64, (5, 9), (1, 9), (2, 0)) + self.up2 = Up2d(64 + num_classes, 128, (5, 3), (1, 1), (2, 1)) + self.up3 = Up2d(128 + num_classes, 64, (8, 4), (2, 2), (3, 1)) + self.up4 = Up2d(64 + num_classes, 32, (8, 4), (2, 2), (3, 1)) - self.up1 = Up2d(5 + num_classes, 64, (5,9), (1,9), (2,0)) - self.up2 = Up2d(64 + num_classes, 128, (5,3), (1,1), (2,1)) - self.up3 = Up2d(128 + num_classes, 64, (8,4), (2,2), (3,1)) - self.up4 = Up2d(64 + num_classes, 32, (8,4), (2,2), (3,1)) - - self.deconv = nn.ConvTranspose2d(32 + num_classes, 1, (9,3), (1,1), (4,1)) + self.deconv = nn.ConvTranspose2d(32 + num_classes, 1, (9, 3), (1, 1), (4, 1)) def forward(self, x, c): - # x = x.unsqueeze(1) - # x = self.downsample(x) x = self.down1(x) - # print(x.size()) x = self.down2(x) - # print(x.size()) x = self.down3(x) - # print(x.size()) x = self.down4(x) - # print(x.size()) x = self.down5(x) - # print(x.size()) c = c.view(c.size(0), c.size(1), 1, 1) - c1 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c1], dim=1) x = self.up1(x) - # print(x.size()) c2 = c.repeat(1,1,x.size(2), x.size(3)) x = torch.cat([x, c2], dim=1) x = self.up2(x) - # print(x.size()) c3 = c.repeat(1,1,x.size(2), x.size(3)) x = torch.cat([x, c3], dim=1) x = self.up3(x) - # print(x.size()) c4 = c.repeat(1,1,x.size(2), x.size(3)) x = torch.cat([x, c4], dim=1) x = self.up4(x) - # print(x.size()) c5 = c.repeat(1,1, x.size(2), x.size(3)) x = torch.cat([x, c5], dim=1) x = self.deconv(x) + return x + class Generator_Mel(nn.Module): def __init__(self, num_classes): super(Generator_Mel, self).__init__() @@ -386,23 +370,12 @@ class Discriminator(nn.Module): def __init__(self, num_classes=4): super(Discriminator, self).__init__() - self.d1 = Down2d(1 + num_classes, 32, (9,3), (1,1), (4,1)) - self.d2 = Down2d(32 + num_classes, 32, (8,3), (2,1), (3,1)) - self.d3 = Down2d(32 + num_classes, 32, (8,3), (2,1), (3,1)) - self.d4 = Down2d(32 + num_classes, 32, (6,3), (2,1), (2,1)) - # - # self.conv = nn.Conv2d(36, 1, (8,8), (8,8), (2,0)) - - # self.d1 = Down2d(1 + num_classes, 32, (3,9), (1,1), (1,4)) - # self.d2 = Down2d(32 + num_classes, 32, (3,8), (1,2), (1,3)) - # self.d3 = Down2d(32 + num_classes, 32, (3,8), (1,2), (1,3)) - # self.d4 = Down2d(32 + num_classes, 32, (3,6), (1,2), (1,2)) - - # self.conv = nn.Conv2d(32 + num_classes, 1, (8,8), (8,8), (0,2)) - # self.pool = 
nn.AdaptiveAvgPool2d(1) - # self.conv = nn.Conv2d(32 + num_classes, 1, (36,5), (36,1), (0,2)) - # self.pool = nn.AvgPool2d((1,64)) - self.conv = nn.Conv2d(32 + num_classes, 1, (5,36), (1,36), (2,0)) + self.d1 = Down2d(1 + num_classes, 32, (9, 3), (1, 1), (4, 1)) + self.d2 = Down2d(32 + num_classes, 32, (8, 3), (2, 1), (3, 1)) + self.d3 = Down2d(32 + num_classes, 32, (8, 3), (2, 1), (3, 1)) + self.d4 = Down2d(32 + num_classes, 32, (6, 3), (2, 1), (2, 1)) + + self.conv = nn.Conv2d(32 + num_classes, 1, (5, 36), (1, 36), (2, 0)) self.pool = nn.AvgPool2d((64,1)) def forward(self, x, c): @@ -411,36 +384,32 @@ def forward(self, x, c): c1 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c1], dim=1) x = self.d1(x) - # print(x.size()) c2 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c2], dim=1) x = self.d2(x) - # print(x.size()) c3 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c3], dim=1) x = self.d3(x) - # print(x.size()) c4 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c4], dim=1) x = self.d4(x) - # print(x.size()) c5 = c.repeat(1, 1, x.size(2), x.size(3)) x = torch.cat([x, c5], dim=1) x = self.conv(x) - # print(x.size()) x = self.pool(x) - # x = torch.squeeze(x) x = torch.tanh(x) + return x -if __name__ == '__main__': +if __name__ == '__main__': + pass # import yaml # config = yaml.load(open('./config.yaml', 'r')) - - l = [0,0,0,1,2,3,4,5] - - l = [v for v in l if v in [0,1,5]] - - print(l) + # + # l = [0,0,0,1,2,3,4,5] + # + # l = [v for v in l if v in [0,1,5]] + # + # print(l) diff --git a/stargan/my_dataset.py b/stargan/my_dataset.py index 93952d7..0927c6d 100644 --- a/stargan/my_dataset.py +++ b/stargan/my_dataset.py @@ -1,26 +1,23 @@ -''' +""" my_dataset.py Author - Max Elliott The custom dataset and collate function described in the report. 
-''' +""" import torch import torch.utils.data as data_utils -from utils import audio_utils - import numpy as np -import librosa from librosa.util import find_files import random import os -import yaml + def get_filenames(dir): - files = find_files(dir, ext = 'npy') + files = find_files(dir, ext='npy') filenames = [] for f in files: @@ -29,10 +26,12 @@ def get_filenames(dir): return filenames + def shuffle(in_list): - ''' + """ in_list: list to be shuffled - ''' + """ + indices = list(range(len(in_list))) random.shuffle(indices) @@ -43,7 +42,8 @@ def shuffle(in_list): return shuffled_list -def _pad_sequence(seq, length, pad_value = 0): + +def _pad_sequence(seq, length, pad_value=0): new_seq = torch.zeros((length,seq.size(1))) if seq.size(0) <= length: new_seq[:seq.size(0), :] = seq @@ -51,28 +51,28 @@ def _pad_sequence(seq, length, pad_value = 0): new_seq[:seq.size(0), :] = seq[:length, :] return new_seq + def crop_sequences(seq_list, labels, segment_len): - ''' + """ seq_list = ([(seq_len, n_feats)]) labels = ([label]) - ''' + """ new_seqs = [] new_labels = [] for i, seq in enumerate(seq_list): - while seq.size(0) >= segment_len: - new_seq = seq[0:segment_len,:] + new_seq = seq[0:segment_len, :] new_seqs.append(new_seq) new_labels.append(labels[i]) - seq = torch.Tensor(seq[segment_len:,:]) + seq = torch.Tensor(seq[segment_len:, :]) if new_seq.size(0) != segment_len: print(i, new_seq.size(0)) - if seq.size(0) > segment_len//2: + if seq.size(0) > segment_len // 2: new_seq = _pad_sequence(seq, segment_len) new_seqs.append(new_seq) @@ -80,12 +80,13 @@ def crop_sequences(seq_list, labels, segment_len): return new_seqs, new_labels + class MyDataset(data_utils.Dataset): def __init__(self, config, filenames): super(MyDataset, self).__init__() - self.config = config + self.config = config self.dataset_dir = config['data']['dataset_dir'] if config['data']['type'] == 'mel': @@ -111,8 +112,9 @@ def __getitem__(self, index): def __len__(self): return len(self.filenames) + def collate_length_order(batch): - ''' + """ batch: Batch elements are tuples ((Tensor)sequence, target) Sorts batch by sequence length @@ -121,7 +123,7 @@ def collate_length_order(batch): (FloatTensor) sequence_padded: seqs in length order, padded to max_len (LongTensor) lengths: lengths of seqs in sequence_padded (LongTensor) labels: corresponding targets, in correct order - ''' + """ # assume that each element in "batch" is a tuple (data, label). 
# Sort the batch in the descending order sorted_batch = sorted(batch, key=lambda x: x[0].size(0), reverse=True) @@ -135,13 +137,10 @@ def collate_length_order(batch): for i,seq in enumerate(sequences): if seq.size(0) > 512: start_index = random.randint(0, seq.size(0)-512) - # print(start_index) - sequences[i] = seq[start_index:start_index+512,:] - # (seq[i] = seq[:512,:]) if seq.size(0) > 512 else seq[i] = seq + sequences[i] = seq[start_index:start_index+512, :] sequences_padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True) current_len = sequences_padded.size(1) - # print(f"Current length: {current_len}") if current_len < 512: pad_len = 512 - current_len new_tensor = torch.zeros((sequences_padded.size(0),pad_len,sequences_padded.size(2))) @@ -187,7 +186,7 @@ def collate_length_order(batch): # Also need to store the length of each sequence # This is later needed in order to unpad the sequences lengths = [len(x) for x in sequences] - for i,l in enumerate(lengths): + for i, l in enumerate(lengths): if l > 512: lengths[i] = 512 lengths = torch.LongTensor([len(x) for x in sequences]) @@ -197,29 +196,19 @@ def collate_length_order(batch): return [sequences_padded, lengths], targets -def make_variable_dataloader(train_set, test_set, batch_size = 64): - train_loader = data_utils.DataLoader(train_set, batch_size = batch_size, - collate_fn = collate_length_order, - num_workers = 0, shuffle = True) +def make_variable_dataloader(train_set, test_set, batch_size=64): - test_loader = data_utils.DataLoader(test_set, batch_size = batch_size, - collate_fn = collate_length_order, - num_workers = 0, shuffle = True) + train_loader = data_utils.DataLoader(train_set, batch_size=batch_size, + collate_fn=collate_length_order, + num_workers=0, shuffle=True) + + test_loader = data_utils.DataLoader(test_set, batch_size=batch_size, + collate_fn=collate_length_order, + num_workers=0, shuffle=True) return train_loader, test_loader -if __name__ == '__main__': - max_len = 608 - print("Original size = ", max_len) - div8 = max_len%2==0 - div5 = max_len%5==0 - div9 = max_len%3==0 - if not (div8 and div5 and div9): - pad_len = max_len + 1 - print("Current pad:", (pad_len%3 !=0 or pad_len%5!=0 or pad_len%9!=0)) - while (pad_len%8 !=0 or pad_len%5!=0 or pad_len%9!=0): - pad_len += 1 - print("Current pad:", (pad_len%8 !=0 or pad_len%5!=0 or pad_len%9!=0)) - pad_len = pad_len - max_len +if __name__ == '__main__': + pass diff --git a/stargan/sample_set.py b/stargan/sample_set.py index 1027a8f..4775ad5 100644 --- a/stargan/sample_set.py +++ b/stargan/sample_set.py @@ -1,4 +1,4 @@ -''' +""" sample_set.py Author - Max Elliott @@ -6,7 +6,7 @@ Class to hold sample files when generating test samples during training. Actually wasn't that useful in the end, just did conversion with each checkpoint after training was complete. -''' +""" import os import yaml @@ -52,15 +52,14 @@ def __init__(self, config): coded_sp = torch.Tensor(coded_sp.T) self.set[filename] = [f0, ap, sp, coded_sp, labels] - - def get_set(self): - ''' + """ Return dict of all samples Each value in dict is (mel, labels, spec) = ((len,n_mels),(8),(len2, n_ffts/2+1)) - ''' + """ return self.set + if __name__ == '__main__': config = yaml.load(open('./config.yaml', 'r')) diff --git a/stargan/solver.py b/stargan/solver.py index 4dd3d90..29b7d2a 100644 --- a/stargan/solver.py +++ b/stargan/solver.py @@ -1,4 +1,4 @@ -''' +""" solver.py Author: Max Elliott @@ -8,13 +8,10 @@ that were never implemented (namely speaker classifiers and dimension classifiers). 
Its generally safe to ignore anything in a "if self.use_speaker:" or "if self.use_dimension:" block and these should be set to False by config.yaml. -''' -import os +""" import random import numpy as np -import copy -import time -from datetime import datetime, timedelta +from datetime import datetime import torch import torch.nn as nn @@ -25,15 +22,12 @@ from stargan.logger import Logger from stargan.sample_set import Sample_Set -import sklearn -from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score -from sklearn.metrics import recall_score class Solver(object): - def __init__(self, train_loader, test_loader, config, load_dir = None): + def __init__(self, train_loader, test_loader, config, load_dir=None): self.train_loader = train_loader self.test_loader = test_loader @@ -45,12 +39,11 @@ def __init__(self, train_loader, test_loader, config, load_dir = None): self.set_configuration() self.model = self.model - if not load_dir == None: + if load_dir is not None: self.load_checkpoint(load_dir) def load_checkpoint(self, load_dir): - # path = os.path.join(self.model_save_dir, self.model_name) self.model.load(load_dir) self.config = self.model.config self.set_configuration() @@ -74,8 +67,7 @@ def set_configuration(self): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.emo_loss_weights = torch.Tensor([4040./549, 4040./890, - 4040./996, 4040./1605]).to(self.device) + self.emo_loss_weights = torch.Tensor([4040./549, 4040./890, 4040./996, 4040./1605]).to(self.device) self.use_speaker = self.config['model']['use_speaker'] self.use_dimension = self.config['model']['use_dimension'] @@ -114,9 +106,9 @@ def set_classification_weights(self, weights): print("Set classification weights.") def train(self): - ''' + """ Main training loop - ''' + """ print('################ BEGIN TRAINING LOOP ################') start_iter = self.resume_iters + 1 # == 1 if new model @@ -134,7 +126,7 @@ def train(self): print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Iteration {:02}/{:02} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~".format(i,self.num_iters)) print("Iteration {:02} lr = {:.6f}".format(i, self.model.d_optimizer.param_groups[0]['lr'])) - self.model.to_device(device = self.device) + self.model.to_device(device=self.device) print("Device is ", self.device) print("Classifier device is ", self.model.emo_cls.device) self.model.set_train_mode() @@ -150,13 +142,12 @@ def train(self): data_iter = iter(self.train_loader) x, labels = next(data_iter) - x_real = x[0].to(device = self.device).unsqueeze(1) x_lens = x[1].to(device = self.device) print(f"solver.train: x_real size = {x_real.size()}") - emo_labels = labels[:,0].to(device = self.device) - spk_labels = labels[:,1].to(device = self.device) + emo_labels = labels[:, 0].to(device=self.device) + spk_labels = labels[:, 1].to(device=self.device) # ;;;;;;; GET DIM LABELS # Generate target domain labels randomly. 
@@ -165,13 +156,13 @@ def train(self): emo_targets = emo_targets.to(device = self.device) # one-hot versions of labels - emo_labels_ones = F.one_hot(emo_labels, num_classes = num_emos).float().to(device = self.device) - emo_targets_ones = F.one_hot(emo_targets, num_classes = num_emos).float().to(device = self.device) + emo_labels_ones = F.one_hot(emo_labels, num_classes=num_emos).float().to(device=self.device) + emo_targets_ones = F.one_hot(emo_targets, num_classes=num_emos).float().to(device=self.device) ############################################################# # TRAIN CLASSIFIERS # ############################################################# - ce_weighted_loss_fn = nn.CrossEntropyLoss(weight = self.emo_loss_weights) + ce_weighted_loss_fn = nn.CrossEntropyLoss(weight=self.emo_loss_weights) ce_loss_fn = nn.CrossEntropyLoss() if self.config['loss']['train_classifier']: @@ -204,7 +195,7 @@ def train(self): # Train with x_real preds_dimension_real = self.model.dimension_cls(x_real, x_lens) - #;;; DO FOR MULTILABEL + # ;;; DO FOR MULTILABEL c_dimension_real_loss = ce_loss_fn(preds_dimension_real, dim_labels) c_dimension_real_loss.backward() @@ -229,8 +220,8 @@ def train(self): # print(x_real.size()) # print(x_fake.size()) - #Calculate loss - grad_penalty = self.gradient_penalty(x_real, x_fake, emo_targets_ones) # detach(), one hots? + # Calculate loss + grad_penalty = self.gradient_penalty(x_real, x_fake, emo_targets_ones) # detach(), one hots? d_loss = -d_preds_real.mean() + d_preds_fake.mean() + \ self.lambda_gp * grad_penalty @@ -255,8 +246,7 @@ def train(self): x_cycle = self.model.G(x_fake, emo_labels_ones) x_id = self.model.G(x_real, emo_labels_ones) d_preds_for_g = self.model.D(x_fake, emo_targets_ones) - # preds_emo_fake = self.model.emo_cls(x_fake, x_fake_lens) - + preds_emo_fake = self.model.emo_cls(x_fake, x_fake_lens) # x_cycle = self.make_equal_length(x_cycle, x_real) x_id = self.make_equal_length(x_id, x_real) @@ -266,11 +256,11 @@ def train(self): loss_g_fake = - d_preds_for_g.mean() loss_cycle = l1_loss_fn(x_cycle, x_real) loss_id = l1_loss_fn(x_id, x_real) - # loss_g_emo_cls = ce_weighted_loss_fn(preds_emo_fake, emo_targets) + loss_g_emo_cls = ce_weighted_loss_fn(preds_emo_fake, emo_targets) g_loss = loss_g_fake + self.lambda_id * loss_id + \ - self.lambda_cycle * loss_cycle# + \ - # self.lambda_g_emo_cls * loss_g_emo_cls + self.lambda_cycle * loss_cycle + \ + self.lambda_g_emo_cls * loss_g_emo_cls if self.use_speaker: @@ -298,7 +288,6 @@ def train(self): loss['C/emo_real_loss'] = c_emo_real_loss.item() loss['D/total_loss'] = d_loss.item() loss['G/total_loss'] = g_loss.item() - # loss['G/emo_loss'] = loss_g_emo_cls.item() loss['D/gradient_penalty'] = grad_penalty.item() loss['G/loss_cycle'] = loss_cycle.item() loss['G/loss_id'] = loss_id.item() @@ -326,7 +315,7 @@ def train(self): # save checkpoint if i % self.model_save_every == 0: - self.model.save(save_dir = self.model_save_dir, iter = self.current_iter) + self.model.save(save_dir=self.model_save_dir, it=self.current_iter) else: print("No model saved this iteration.") @@ -345,8 +334,7 @@ def train(self): elapsed = datetime.now() - start_time print('{} elapsed.
Iteration {:04} complete'.format(elapsed, i)) - self.model.save(save_dir = self.model_save_dir, iter = self.current_iter) - + self.model.save(save_dir=self.model_save_dir, it=self.current_iter) def test(self): @@ -354,37 +342,37 @@ print("Testing generator accuracy ...") self.model.set_eval_mode() - real_preds = torch.rand(0).to(device = self.device, dtype = torch.long) - fake_preds = torch.rand(0).to(device = self.device, dtype = torch.long) - id_preds = torch.rand(0).to(device = self.device, dtype = torch.long) - cycle_preds = torch.rand(0).to(device = self.device, dtype = torch.long) + real_preds = torch.rand(0).to(device=self.device, dtype=torch.long) + fake_preds = torch.rand(0).to(device=self.device, dtype=torch.long) + id_preds = torch.rand(0).to(device=self.device, dtype=torch.long) + cycle_preds = torch.rand(0).to(device=self.device, dtype=torch.long) - total_labels = torch.rand(0).to(device = self.device, dtype = torch.long) - total_targets = torch.rand(0).to(device = self.device, dtype = torch.long) + total_labels = torch.rand(0).to(device=self.device, dtype=torch.long) + total_targets = torch.rand(0).to(device=self.device, dtype=torch.long) - total_real = torch.rand(0).to(device = self.device, dtype = torch.float) - total_id = torch.rand(0).to(device = self.device, dtype = torch.float) - total_cycle = torch.rand(0).to(device = self.device, dtype = torch.float) + total_real = torch.rand(0).to(device=self.device, dtype=torch.float) + total_id = torch.rand(0).to(device=self.device, dtype=torch.float) + total_cycle = torch.rand(0).to(device=self.device, dtype=torch.float) l1_loss_fn = nn.L1Loss() for i, (x, labels) in enumerate(self.test_loader): - x_real = x[0].to(device = self.device) - x_lens = x[1].to(device = self.device) + x_real = x[0].to(device=self.device) + x_lens = x[1].to(device=self.device) x_real = x_real.unsqueeze(1) - emo_labels = labels[:,0].to(device = self.device) - spk_labels = labels[:,1].to(device = self.device) + emo_labels = labels[:, 0].to(device=self.device) + spk_labels = labels[:, 1].to(device=self.device) # Generate target domain labels randomly. num_emos = self.config['model']['num_classes'] emo_targets = self.make_random_labels(num_emos, emo_labels.size(0)) - emo_targets = emo_targets.to(device = self.device) + emo_targets = emo_targets.to(device=self.device) # one-hot versions of labels - emo_labels_ones = F.one_hot(emo_labels, num_classes = num_emos).float().to(device = self.device) - emo_targets_ones = F.one_hot(emo_targets, num_classes = num_emos).float().to(device = self.device) + emo_labels_ones = F.one_hot(emo_labels, num_classes=num_emos).float().to(device=self.device) + emo_targets_ones = F.one_hot(emo_targets, num_classes=num_emos).float().to(device=self.device) with torch.no_grad(): @@ -427,8 +415,7 @@ def test(self): # print(L1_loss_id) # print(L1_loss_cycle) - l = ["Accuracy_real","Accuracy_fake", "Accuracy_id", "Accuracy_cycle", - "L1_id", "L1_cycle"] + l = ["Accuracy_real", "Accuracy_fake", "Accuracy_id", "Accuracy_cycle", "L1_id", "L1_cycle"] print('{:20} = {:.3f}'.format(l[0], accuracy_real)) print('{:20} = {:.3f}'.format(l[1], accuracy_fake)) @@ -446,10 +433,10 @@ def test(self): self.logger.scalar_summary("Val/test_L1_cycle", L1_loss_cycle, self.current_iter) def sample_mel(self): - ''' + """ Passes each performance sample through G for every target emotion.
They are saved to 'config(sample_dir)/model_name/filename-to.png + .npy' - ''' + """ print("Saving mel samples...") @@ -473,23 +460,23 @@ def sample_mel(self): fake = self.model.G(mel, emo_targets[i].unsqueeze(0)) - filename_png = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ - str(emo_labels[i].item()) + '_i=' +\ - str(self.current_iter) + ".png" + filename_png = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ + str(emo_labels[i].item()) + '_i=' +\ + str(self.current_iter) + ".png" - filename_npy = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ - str(emo_labels[i].item()) + '_i=' +\ - str(self.current_iter) + ".npy" + filename_npy = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ + str(emo_labels[i].item()) + '_i=' +\ + str(self.current_iter) + ".npy" fake = fake.squeeze() audio_utils.save_spec_plot(fake.t(), self.model_name, filename_png) audio_utils.save_spec(fake.t(), self.model_name, filename_npy) def sample_world(self): - ''' + """ Passes each performance sample through G for every target emotion. They are saved to 'config(sample_dir)/model_name/filename-to.png + .npy' - ''' + """ print("Saving world samples...") @@ -513,17 +500,17 @@ def sample_world(self): coded_sp = val[3].unsqueeze(0).unsqueeze(0).to(device = self.device) with torch.no_grad(): - # print(emo_targets) - for i in range (0, emo_targets.size(0)): + + for i in range(0, emo_targets.size(0)): f0 = np.copy(f0_real) ap = np.copy(ap_real) fake = self.model.G(coded_sp, emo_targets[i].unsqueeze(0)) - filename_wav = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ - str(emo_labels[i].item()) + '_i=' +\ - str(self.current_iter) + ".wav" + filename_wav = tag[0:-4] + "_" + str(int(labels[0].item())) + "to" + \ + str(emo_labels[i].item()) + '_i=' +\ + str(self.current_iter) + ".wav" fake = fake.squeeze() # print("Sampled size = ",fake.size()) @@ -543,7 +530,7 @@ def sample_world(self): # print("ap shape = ", val[1].shape) # print("f0 shape = ", val[0].shape) - audio_utils.save_world_wav([f0,ap,sp,converted_sp], self.model_name, filename_wav) + audio_utils.save_world_wav([f0, ap, sp, converted_sp], self.model_name, filename_wav) def update_lr(self, i): """Decay learning rates of the generator and discriminator and classifier.""" @@ -575,11 +562,11 @@ def update_lr(self, i): param_group['lr'] = emo_lr def make_random_labels(self, num_domains, num_labels): - ''' + """ Creates random labels for generator. num_domains: number of unique labels num_labels: total number of labels to generate - ''' + """ domain_list = np.arange(0, num_domains) # print(domain_list) labels = torch.zeros((num_labels)) @@ -594,7 +581,6 @@ def gradient_penalty(self, x_real, x_fake, targets): # Compute loss for gradient penalty. alpha = torch.rand(x_real.size(0), 1, 1, 1).to(self.device) x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True) - # print("x_hat size: ", x_hat.size()) out_src = self.model.D(x_hat, targets) weight = torch.ones(out_src.size()).to(self.device) @@ -615,7 +601,5 @@ def make_equal_length(self, x_out, x_real): return x_out - - if __name__ == '__main__': - print("Is main.") + pass diff --git a/utils/audio_utils.py b/utils/audio_utils.py index ecffcde..7795be9 100644 --- a/utils/audio_utils.py +++ b/utils/audio_utils.py @@ -16,7 +16,6 @@ import librosa import librosa.display -import pyworld from pyworld import decode_spectral_envelope, synthesize import numpy as np @@ -30,16 +29,16 @@ class hyperparams(object): def __init__(self): - self.sr = 16000 # Sampling rate. 
Paper => 24000 - self.n_fft = 1024 # fft points (samples) - self.frame_shift = 0.0125 # seconds - self.frame_length = 0.05 # seconds - self.hop_length = int(self.sr*self.frame_shift) # samples This is dependent on the frame_shift. - self.win_length = int(self.sr*self.frame_length) # samples This is dependent on the frame_length. - self.n_mels = 80 # Number of Mel banks to generate + self.sr = 16000 # Sampling rate. Paper => 24000 + self.n_fft = 1024 # fft points (samples) + self.frame_shift = 0.0125 # seconds + self.frame_length = 0.05 # seconds + self.hop_length = int(self.sr*self.frame_shift) # samples This is dependent on the frame_shift. + self.win_length = int(self.sr*self.frame_length) # samples This is dependent on the frame_length. + self.n_mels = 80 # Number of Mel banks to generate self.power = 1.2 # Exponent for amplifying the predicted magnitude - self.n_iter = 100 # Number of inversion iterations - self.use_log_magnitude = True # if False, use magnitude + self.n_iter = 100 # Number of inversion iterations + self.use_log_magnitude = True # if False, use magnitude self.preemph = 0.97 self.config = yaml.load(open('./config.yaml', 'r')) @@ -61,11 +60,6 @@ def __init__(self): with open('./f0_relative_dict.pkl', 'rb') as fp: self.f0_relative_dict = pickle.load(fp) - # for tag, val in self.f0_dict.items(): - # print(f'Emotion {tag} stats:') - # for tag2, val2 in val.items(): - # print(f'{tag2} = {val2[0]}, {val2[1]}') - hp = hyperparams() @@ -76,8 +70,6 @@ def load_wav(path): def save_wav(wav, path): - # print(np.max(np.abs(wav))) - # print(np.mean(np.abs(wav))) # wav *= 32767 / max(0.01, np.max(np.abs(wav))) wav *= 48000 @@ -85,17 +77,15 @@ def save_wav(wav, path): wavfile.write(path, hp.sr, wav.astype(np.int16)) -def wav2spectrogram(y, sr = hp.sr): +def wav2spectrogram(y, sr=hp.sr): ''' Produces log-magnitude spectrogram of audio data y ''' - spec = librosa.core.stft(y, n_fft = hp.n_fft, hop_length = hp.hop_length, - win_length = hp.win_length) + spec = librosa.core.stft(y, n_fft=hp.n_fft, hop_length=hp.hop_length, + win_length=hp.win_length) spec_mag = amp_to_db(np.abs(spec)) - # spec_angle = np.angle(spec) - # spec_mag = lowpass(spec_mag, 400) return spec_mag @@ -127,8 +117,8 @@ def wav2melspectrogram(y, sr = hp.sr, n_mels = hp.n_mels): y = input wav file ''' - mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels = n_mels, - n_fft = hp.n_fft, hop_length = hp.hop_length) + mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, + n_fft=hp.n_fft, hop_length=hp.hop_length) # mel_spec = librosa.core.amplitude_to_db(y) if hp.normalise: mel_spec = _normalise_mel(mel_spec) @@ -136,12 +126,12 @@ def wav2melspectrogram(y, sr = hp.sr, n_mels = hp.n_mels): return mel_spec -def spectrogram2melspectrogram(spec, n_fft = hp.n_fft, n_mels = hp.n_mels): +def spectrogram2melspectrogram(spec, n_fft=hp.n_fft, n_mels=hp.n_mels): if isinstance(spec, torch.Tensor): spec = spec.numpy() - mels = librosa.filters.mel(hp.sr, n_fft, n_mels = n_mels) + mels = librosa.filters.mel(hp.sr, n_fft, n_mels=n_mels) return mels.dot(spec**hp.power) @@ -339,7 +329,7 @@ def f0_pitch_conversion(f0, source_labels, target_labels): plt.subplot(4, 2, 1) librosa.display.specshow(librosa.power_to_db(spec), y_axis='mel', sr=hp.sr, - hop_length=hp.hop_length, vmax = -8.47987, vmin= -100.0) + hop_length=hp.hop_length, vmax=-8.47987, vmin=-100.0) # fmin=None, fmax=4000) # plt.colorbar(format='%+2.0f dB') plt.title('1) Original (sad)') diff --git a/utils/data_preprocessing_utils.py 
b/utils/data_preprocessing_utils.py index 8afbee7..e6f627b 100644 --- a/utils/data_preprocessing_utils.py +++ b/utils/data_preprocessing_utils.py @@ -1,11 +1,11 @@ -''' +""" data_preprocessing2.py Author - Max Elliott Functions for pre-processing the IEMOCAP dataset. Can make mel-specs, WORLD features, and labels for each audio clip. -''' +""" import torch @@ -19,8 +19,7 @@ def get_speaker_from_filename(filename): code = filename[4] + filename[-8] - conversion = {'1F':0, '1M':1, '2F':2, '2M':3, '3F':4, '3M':5, '4F':6, '4M':7, - '5F': 8, '5M':9} + conversion = {'1F': 0, '1M': 1, '2F': 2, '2M': 3, '3F': 4, '3M': 5, '4F': 6, '4M': 7, '5F': 8, '5M': 9} label = conversion[code] @@ -29,12 +28,12 @@ def get_emotion_from_label(category): - if category == 'xxx' or category =='dis' or category =='fea' or category == 'oth': + if category == 'xxx' or category == 'dis' or category == 'fea' or category == 'oth': return -1 if category == 'exc' or category == 'fru' or category == 'sur': return -1 - conversion = {'ang':0, 'sad':1, 'hap':2, 'neu':3} + conversion = {'ang': 0, 'sad': 1, 'hap': 2, 'neu': 3} label = conversion[category] @@ -49,7 +48,7 @@ def getOneHot(label, n_labels): return onehot -def cont2list(cont, binned = False): +def cont2list(cont, binned=False): list = [0,0,0] list[0] = float(cont[1:6]) @@ -73,7 +72,6 @@ def cont2list(cont, binned = False): def concatenate_labels(emo, speaker, dims, dims_dis): all_labels = torch.zeros(8) - # print(all_labels) # for i, row in enumerate(all_labels): all_labels[0] = emo @@ -85,13 +83,11 @@ def concatenate_labels(emo, speaker, dims, dims_dis): all_labels[6] = dims_dis[1] all_labels[7] = dims_dis[2] - return all_labels def get_wav_and_labels(filename, data_dir): - # folder = filename[:-9] wav_path = os.path.join(data_dir, "audio", filename) label_path = os.path.join(data_dir, "annotations", filename[:-9] + ".txt") @@ -110,7 +106,6 @@ def get_wav_and_labels(filename, data_dir): dimensions_dis = cont2list(split[3], binned = True) speaker = get_speaker_from_filename(filename) - audio = audio_utils.load_wav(wav_path) audio = np.array(audio, dtype = np.float32) labels = concatenate_labels(category, speaker, dimensions, dimensions_dis) @@ -120,8 +115,6 @@ def get_samples_and_labels(filename, config): - # config = yaml.load(open('./config.yaml', 'r')) - wav_path = config['data']['sample_set_dir'] + "/" + filename folder = filename[:-9] label_path = config['data']['dataset_dir'] + "/Annotations/" + folder + ".txt" @@ -141,7 +134,6 @@ def get_samples_and_labels(filename, config): dimensions_dis = cont2list(split[3], binned = True) speaker = get_speaker_from_filename(filename) - audio = audio_utils.load_wav(wav_path) audio = np.array(audio, dtype = np.float32) labels = concatenate_labels(category, speaker, dimensions, dimensions_dis) @@ -183,10 +175,10 @@ def get_filenames(data_dir): i = 0 found = 0 lengths = [] longest_length = 0 longest_name = "" for f in filenames: - if i >10000: + if i > 10000: print(f) wav, labels = get_wav_and_labels(f, data_dir) # mel = audio_utils.wav2melspectrogram(wav) diff --git a/utils/preprocess_world.py b/utils/preprocess_world.py index d748915..e8c7638 100644 --- a/utils/preprocess_world.py +++ b/utils/preprocess_world.py @@ -1,23 +1,16 @@ -''' +""" preprocess_world.py Author - Max Elliott Functions for extracting WORLD features form wav files. ALso for initial pre-processing of data.
-''' +""" -import librosa from librosa.util import find_files import numpy as np import os import pyworld -# import pyworld as pw -from pyworld import decode_spectral_envelope, synthesize -import glob -# from utility import * -import argparse -from datetime import datetime import pickle from utils import audio_utils @@ -28,6 +21,7 @@ FRAMES = 512 FFTSIZE = 1024 + def world_features(wav, sr, fft_size, dim): f0, timeaxis = pyworld.harvest(wav, sr) sp = pyworld.cheaptrick(wav, f0, timeaxis, sr,fft_size=fft_size) @@ -36,20 +30,22 @@ return f0, timeaxis, sp, ap, coded_sp + def cal_mcep(wav, sr=SAMPLE_RATE, dim=FEATURE_DIM, fft_size=FFTSIZE): - '''cal mcep given wav singnal - the frame_period used only for pad_wav_to_get_fixed_frames - ''' + """ + Calculate MCEP given a wav signal + the frame_period is used only for pad_wav_to_get_fixed_frames + """ f0, timeaxis, sp, ap, coded_sp = world_features(wav, sr, fft_size, dim) if audio_utils.hp.normalise: coded_sp = audio_utils._normalise_coded_sp(coded_sp) - # print("Normalised") - coded_sp = coded_sp.T # dim x n + coded_sp = coded_sp.T # dim x n return f0, ap, sp, coded_sp + def get_f0_stats(f0s): log_f0s_concatenated = np.ma.log(np.concatenate(f0s)) log_F0s_no0 = [] @@ -59,11 +55,10 @@ log_f0s_mean = log_f0s_concatenated.mean() log_f0s_std = np.var(log_f0s_concatenated) - return log_f0s_mean, log_f0s_std -if __name__ == "__main__": +if __name__ == "__main__": ########################################### # WORLD features testing code #