#!/usr/bin/python2
# -*- coding: utf-8 -*-
import os
## Take name of this file to be the config name:
config_name = os.path.split(__file__)[-1].split('.')[0] ## remove path and extension
## Define place to put outputs relative to this config file's location;
## supply an absolute path to work elsewhere:
topworkdir = os.path.realpath(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'work')))
voicedir = os.path.join(topworkdir, config_name)
logdir = os.path.join(voicedir, 'train')
sampledir = os.path.join(voicedir, 'synth')
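## With the defaults above this gives, relative to this file,
## ../work/fa_as_guide/train for training outputs and
## ../work/fa_as_guide/synth for synthesised samples.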
## Change featuredir to an absolute path to use existing features:
featuredir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/'
coarse_audio_dir = os.path.join(featuredir, 'mels')
full_mel_dir = os.path.join(featuredir, 'full_mels')
full_audio_dir = os.path.join(featuredir, 'mags')
#attention_guide_dir = os.path.join(featuredir, 'attention_guides')
## Set this to the empty string ('') to use a global attention guide instead of per-utterance forced-alignment (FA) guides:
attention_guide_dir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/W0.1_attention_guides_dctts/' # contains FA guides
# Data locations:
datadir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/'
transcript = os.path.join(datadir, 'trimmed_transcript_trainset_with_punctuation_with_all_quotes_CB-EM_less17.csv')
test_transcript = os.path.join(datadir, 'transcript_testset_shuffled_with_punctuation_with_all_quotes.csv')
waveforms = os.path.join(datadir, 'wavs_trim')
input_type = 'phones' ## letters or phones
## CMU phones:
## NB: '<_END_>' and '<_START_>' appear both in the prefix list and again below;
## deduplicating would shift all subsequent symbol indices, so they are left as-is.
vocab = ['<PADDING>', '<_END_>', '<_START_>'] + ['<!>', '<">', "<'>", "<'s>", '<)>', '<,>', '<.>', '<:>', '<;>', '<>', '<?>', '<]>', '<_END_>', '<_START_>', 'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh']
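## For reference, a minimal sketch of the symbol/index lookups that training
## code typically derives from such a list (illustrative, not necessarily this
## repo's own names; assumes the vocab entries are unique):
char2idx = dict((char, idx) for idx, char in enumerate(vocab))
idx2char = dict((idx, char) for idx, char in enumerate(vocab))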
# Train
max_N = 150 # Maximum number of characters/phones
max_T = 264 # Maximum number of mel frames
turn_off_monotonic_for_synthesis = True
sampledir = sampledir + '_' + str(turn_off_monotonic_for_synthesis) ## e.g. <voicedir>/synth_True with the setting above
## List of positions at which to add speaker embeddings, to handle multi-speaker
## training. [] means speaker dependent (no embeddings). Possible positions:
## text_encoder_input, text_encoder_towards_end, audio_decoder_input,
## ssrn_input, audio_encoder_input
multispeaker = []
n_utts = 0 ## 0 means use all data; any positive integer selects that many sentences from the start of the training set
random_reduction_on_the_fly = True ## Randomly choose shift when performing reduction to get coarse features.
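## A minimal sketch of the reduction this switch randomises (illustrative, not
## necessarily the repo's own function): keep every r-th full-resolution mel
## frame, starting from a random shift in [0, r).
def _example_reduce(full_mel, r, random_shift=True):
    import numpy as np  ## local import so the config itself stays dependency-free
    shift = np.random.randint(r) if random_shift else 0
    return full_mel[shift::r, :]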
# signal processing
trim_before_spectrogram_extraction = 0
vocoder = 'griffin_lim'
sr = 22050 # Sampling rate.
n_fft = 2048 # fft points (samples)
frame_shift = 0.0125 # seconds
frame_length = 0.05 # seconds
hop_length = int(sr * frame_shift)
win_length = int(sr * frame_length)
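## At sr = 22050 these work out to hop_length = int(22050 * 0.0125) = 275 samples
## and win_length = int(22050 * 0.05) = 1102 samples.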
prepro = True # don't extract spectrograms on the fly
full_dim = n_fft//2+1
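## i.e. full_dim = 2048 // 2 + 1 = 1025 linear-frequency bins per frame.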
n_mels = 80 # Number of Mel banks to generate
power = 1.5 # Exponent for amplifying the predicted magnitude
n_iter = 50 # Number of inversion iterations
preemphasis = .97
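## Pre-emphasis is the usual first-order high-pass filter applied to the
## waveform before analysis; in the standard formulation:
##     y[t] = x[t] - preemphasis * x[t-1]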
max_db = 100
ref_db = 20
# Model
r = 4 # Reduction factor. Do not change this.
dropout_rate = 0.05
e = 128 # == embedding
d = 256 # == hidden units of Text2Mel
c = 512 # == hidden units of SSRN
attention_win_size = 3
g = 0.2 ## determines width of band in attention guide
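## For reference, a minimal sketch of a diagonal guide of width g in the style
## of Tachibana et al.'s guided attention (here the guides are precomputed FA
## guides loaded from attention_guide_dir; names below are illustrative):
def _example_attention_guide(N, T, g=0.2):
    import numpy as np
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            ## penalise attention weights far from the diagonal n/N == t/T
            W[n, t] = 1.0 - np.exp(-((n / float(N) - t / float(T)) ** 2) / (2.0 * g ** 2))
    return W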
norm = None ## type of normalisation layers to use: from ['layer', 'batch', None]
## loss weights : T2M
lw_mel = 0.333
lw_bd1 = 0.333
lw_att = 0.333
lw_t2m_l2 = 0.0
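## These presumably combine as a weighted sum, roughly
##     L_t2m = lw_mel * L_mel + lw_bd1 * L_bd + lw_att * L_att + lw_t2m_l2 * L_l2
## (illustrative; the exact terms live in the training code).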
## : SSRN
lw_mag = 0.5
lw_bd2 = 0.5
lw_ssrn_l2 = 0.0
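## presumably combined analogously:
##     L_ssrn = lw_mag * L_mag + lw_bd2 * L_bd + lw_ssrn_l2 * L_l2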
## validation:
validpatt = 'CB-EM-55-6' ## sentence names containing this substring will be held out of training for validation. TODO: mention SD vs. SIL
validation_sentences_to_evaluate = 5
validation_sentences_to_synth_params = 3
# training scheme
restart_from_savepath = []
lr = 0.0001 # Initial learning rate.
beta1 = 0.5
beta2 = 0.9
epsilon = 0.000001
decay_lr = False
batchsize = {'t2m': 8, 'ssrn': 2}
num_threads = 8 # how many threads get_batch should use to build training batches of data (default: 8)
validate_every_n_epochs = 50 ## how often to compute validation score and save speech parameters
save_every_n_epochs = 50 ## how often to archive a model, in addition to keeping the 5 most recent checkpoints
max_epochs = 500
# attention plotting during training
plot_attention_every_n_epochs = 50 ## set to 0 if you do not wish to plot attention matrices
num_sentences_to_plot_attention = 3 ## number of sentences to plot attention matrices for