#!/usr/bin/python2
# -*- coding: utf-8 -*-
import os
## Take name of this file to be the config name:
config_name = os.path.split(__file__)[-1].split('.')[0] ## remove path and extension
## Define place to put outputs relative to this config file's location;
## supply an absolute path to work elsewhere:
topworkdir = os.path.realpath(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'work')))
voicedir = os.path.join(topworkdir, config_name)
logdir = os.path.join(voicedir, 'train')
sampledir = os.path.join(voicedir, 'synth')
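## With the defaults above this gives, relative to this file,
## ../work/fa_as_guide/train for training outputs and
## ../work/fa_as_guide/synth for synthesised samples.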
## Change featuredir to an absolute path to use existing features:
featuredir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/'
coarse_audio_dir = os.path.join(featuredir, 'mels')
full_mel_dir = os.path.join(featuredir, 'full_mels')
full_audio_dir = os.path.join(featuredir, 'mags')
#attention_guide_dir = os.path.join(featuredir, 'attention_guides')
## Set this to the empty string ('') to use a global attention guide instead of per-utterance forced-alignment (FA) guides:
attention_guide_dir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/W0.1_attention_guides_dctts/' # contains FA guides
# Data locations:
datadir = '/disk/scratch/cvbotinh/data/BC2013/CB-JE-LCL-FFM-EM/'
transcript = os.path.join(datadir, 'trimmed_transcript_trainset_with_punctuation_with_all_quotes_CB-EM_less17.csv')
test_transcript = os.path.join(datadir, 'transcript_testset_shuffled_with_punctuation_with_all_quotes.csv')
waveforms = os.path.join(datadir, 'wavs_trim')
input_type = 'phones' ## letters or phones
## CMU phones:
## NB: '<_END_>' and '<_START_>' appear both in the prefix list and again below;
## deduplicating would shift all subsequent symbol indices, so they are left as-is.
vocab = ['<PADDING>', '<_END_>', '<_START_>'] + ['<!>', '<">', "<'>", "<'s>", '<)>', '<,>', '<.>', '<:>', '<;>', '<>', '<?>', '<]>', '<_END_>', '<_START_>', 'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'eh', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh']
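## For reference, a minimal sketch of the symbol/index lookups that training
## code typically derives from such a list (illustrative, not necessarily this
## repo's own names; assumes the vocab entries are unique):
char2idx = dict((char, idx) for idx, char in enumerate(vocab))
idx2char = dict((idx, char) for idx, char in enumerate(vocab))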
# Train
max_N = 150 # Maximum number of characters/phones
max_T = 264 # Maximum number of mel frames
turn_off_monotonic_for_synthesis = True
sampledir = sampledir + '_' + str(turn_off_monotonic_for_synthesis) ## e.g. <voicedir>/synth_True with the setting above
## List of positions at which to add speaker embeddings, to handle multi-speaker
## training. [] means speaker dependent (no embeddings). Possible positions:
## text_encoder_input, text_encoder_towards_end, audio_decoder_input,
## ssrn_input, audio_encoder_input
multispeaker = []
n_utts = 0 ## 0 means use all data; any positive integer selects that many sentences from the start of the training set
random_reduction_on_the_fly = True ## Randomly choose shift when performing reduction to get coarse features.
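## A minimal sketch of the reduction this switch randomises (illustrative, not
## necessarily the repo's own function): keep every r-th full-resolution mel
## frame, starting from a random shift in [0, r).
def _example_reduce(full_mel, r, random_shift=True):
    import numpy as np  ## local import so the config itself stays dependency-free
    shift = np.random.randint(r) if random_shift else 0
    return full_mel[shift::r, :]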
# signal processing
trim_before_spectrogram_extraction = 0
vocoder = 'griffin_lim'
sr = 22050 # Sampling rate.
n_fft = 2048 # fft points (samples)
frame_shift = 0.0125 # seconds
frame_length = 0.05 # seconds
hop_length = int(sr * frame_shift)
win_length = int(sr * frame_length)
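## At sr = 22050 these work out to hop_length = int(22050 * 0.0125) = 275 samples
## and win_length = int(22050 * 0.05) = 1102 samples.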
prepro = True # don't extract spectrograms on the fly
full_dim = n_fft//2+1
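## i.e. full_dim = 2048 // 2 + 1 = 1025 linear-frequency bins per frame.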
n_mels = 80 # Number of Mel banks to generate
power = 1.5 # Exponent for amplifying the predicted magnitude
n_iter = 50 # Number of inversion iterations
preemphasis = .97
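## Pre-emphasis is the usual first-order high-pass filter applied to the
## waveform before analysis; in the standard formulation:
##     y[t] = x[t] - preemphasis * x[t-1]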
max_db = 100
ref_db = 20
# Model
r = 4 # Reduction factor. Do not change this.
dropout_rate = 0.05
e = 128 # == embedding
d = 256 # == hidden units of Text2Mel
c = 512 # == hidden units of SSRN
attention_win_size = 3
g = 0.2 ## determines width of band in attention guide
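## For reference, a minimal sketch of a diagonal guide of width g in the style
## of Tachibana et al.'s guided attention (here the guides are precomputed FA
## guides loaded from attention_guide_dir; names below are illustrative):
def _example_attention_guide(N, T, g=0.2):
    import numpy as np
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            ## penalise attention weights far from the diagonal n/N == t/T
            W[n, t] = 1.0 - np.exp(-((n / float(N) - t / float(T)) ** 2) / (2.0 * g ** 2))
    return W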
norm = None ## type of normalisation layers to use: from ['layer', 'batch', None]
## loss weights : T2M
lw_mel = 0.333
lw_bd1 = 0.333
lw_att = 0.333
lw_t2m_l2 = 0.0
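## These presumably combine as a weighted sum, roughly
##     L_t2m = lw_mel * L_mel + lw_bd1 * L_bd + lw_att * L_att + lw_t2m_l2 * L_l2
## (illustrative; the exact terms live in the training code).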
## : SSRN
lw_mag = 0.5
lw_bd2 = 0.5
lw_ssrn_l2 = 0.0
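## presumably combined analogously:
##     L_ssrn = lw_mag * L_mag + lw_bd2 * L_bd + lw_ssrn_l2 * L_l2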
## validation:
validpatt = 'CB-EM-55-6' ## sentence names containing this substring will be held out of training for validation. TODO: mention SD vs. SIL
validation_sentences_to_evaluate = 5
validation_sentences_to_synth_params = 3
# training scheme
restart_from_savepath = []
lr = 0.0001 # Initial learning rate.
beta1 = 0.5
beta2 = 0.9
epsilon = 0.000001
decay_lr = False
batchsize = {'t2m': 8, 'ssrn': 2}
num_threads = 8 # how many threads get_batch should use to build training batches of data (default: 8)
validate_every_n_epochs = 50 ## how often to compute validation score and save speech parameters
save_every_n_epochs = 50 ## how often to archive a model, in addition to keeping the 5 most recent checkpoints
max_epochs = 500
# attention plotting during training
plot_attention_every_n_epochs = 50 ## set to 0 if you do not wish to plot attention matrices
num_sentences_to_plot_attention = 3 ## number of sentences to plot attention matrices for