models.py (forked from andabi/deep-voice-conversion)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import tensorflow as tf
from data_load import load_vocab
from modules import prenet, cbhg
class Model:
def __init__(self, mode, hparams):
self.hparams = hparams
self.mode = mode
self.batch_size = self.hparams.Default.batch_size
self.is_training = self.get_is_training(mode)
# Input
self.x_mfcc, self.y_ppgs, self.y_spec, self.y_mel, self.num_batch = self.get_input(mode, self.batch_size)
# Networks
self.net_template = tf.make_template('net', self._net2)
self.ppgs, self.pred_ppg, self.logits_ppg, self.pred_spec, self.pred_mel = self.net_template()
def __call__(self):
return self.pred_spec
def get_input(self, mode, batch_size):
        '''
        mode: A string. One of the phases below:
          `train1`: TIMIT TRAIN waveform -> MFCCs (inputs) -> PPGs -> phones (target) (CE loss)
          `test1`: TIMIT TEST waveform -> MFCCs (inputs) -> PPGs -> phones (target) (accuracy)
          `train2`: ARCTIC SLT waveform -> MFCCs -> PPGs (inputs) -> spectrogram (target) (L2 loss)
          `test2`: ARCTIC SLT waveform -> MFCCs -> PPGs (inputs) -> spectrogram (target) (accuracy)
          `convert`: ARCTIC BDL waveform -> MFCCs (inputs) -> PPGs -> spectrogram -> waveform (output)
        '''
if mode not in ('train1', 'test1', 'train2', 'test2', 'convert'):
            raise ValueError('invalid mode: {}'.format(mode))
x_mfcc = tf.placeholder(tf.float32, shape=(batch_size, None, self.hparams.Default.n_mfcc))
y_ppgs = tf.placeholder(tf.int32, shape=(batch_size, None,))
y_spec = tf.placeholder(tf.float32, shape=(batch_size, None, 1 + self.hparams.Default.n_fft // 2))
y_mel = tf.placeholder(tf.float32, shape=(batch_size, None, self.hparams.Default.n_mels))
num_batch = 1
return x_mfcc, y_ppgs, y_spec, y_mel, num_batch
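
    # Usage sketch (illustration only, not called in this file): the placeholders
    # above are fed one batch at a time, e.g.
    #   feed = {model.x_mfcc: mfcc_batch,   # (N, T, n_mfcc) float32
    #           model.y_spec: spec_batch,   # (N, T, 1 + n_fft // 2) float32
    #           model.y_mel: mel_batch}     # (N, T, n_mels) float32
    #   sess.run(train_op, feed_dict=feed)
    # `mfcc_batch` etc. are hypothetical numpy arrays produced by the data loader.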
def get_is_training(self, mode):
if mode in ('train1', 'train2'):
is_training = True
else:
is_training = False
return is_training
def _net1(self):
with tf.variable_scope('net1'):
# Load vocabulary
phn2idx, idx2phn = load_vocab()
# Pre-net
prenet_out = prenet(self.x_mfcc,
num_units=[self.hparams.Train1.hidden_units, self.hparams.Train1.hidden_units // 2],
dropout_rate=self.hparams.Train1.dropout_rate,
is_training=self.is_training) # (N, T, E/2)
# CBHG
out = cbhg(prenet_out, self.hparams.Train1.num_banks, self.hparams.Train1.hidden_units // 2, self.hparams.Train1.num_highway_blocks, self.hparams.Train1.norm_type, self.is_training)
# Final linear projection
logits = tf.layers.dense(out, len(phn2idx)) # (N, T, V)
            ppgs = tf.nn.softmax(logits / self.hparams.Train1.t)  # temperature-softened PPGs: (N, T, V)
            preds = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T)
        return ppgs, preds, logits
    def loss_net1(self):
        # Mask padded frames: a frame counts only if its MFCC vector is non-zero.
        istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfcc, -1)))  # indicator: (N, T)
        # Temperature-scaled cross entropy against the target phones.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_ppg / self.hparams.Train1.t, labels=self.y_ppgs)
        loss *= istarget  # zero out the loss on padded frames
        loss = tf.reduce_mean(loss)  # note: the mean also runs over masked (padded) frames
        return loss
def acc_net1(self):
istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfcc, -1))) # indicator: (N, T)
num_hits = tf.reduce_sum(tf.to_float(tf.equal(self.pred_ppg, self.y_ppgs)) * istarget)
num_targets = tf.reduce_sum(istarget)
acc = num_hits / num_targets
return acc
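
    # Evaluation sketch (illustration only): build the accuracy op once, then run
    # it per batch rather than rebuilding the graph in a loop, e.g.
    #   acc_op = model.acc_net1()
    #   acc = sess.run(acc_op, feed_dict={model.x_mfcc: mfcc_batch,
    #                                     model.y_ppgs: phone_batch})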
def _net2(self):
# PPGs from net1
ppgs, preds_ppg, logits_ppg = self._net1()
with tf.variable_scope('net2'):
# Pre-net
prenet_out = prenet(ppgs,
num_units=[self.hparams.Train2.hidden_units, self.hparams.Train2.hidden_units // 2],
dropout_rate=self.hparams.Train2.dropout_rate,
is_training=self.is_training) # (N, T, E/2)
# CBHG1: mel-scale
pred_mel = cbhg(prenet_out, self.hparams.Train2.num_banks, self.hparams.Train2.hidden_units // 2, self.hparams.Train2.num_highway_blocks, self.hparams.Train2.norm_type, self.is_training, scope="cbhg1")
pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1]) # log magnitude: (N, T, n_mels)
# CBHG2: linear-scale
            pred_spec = tf.layers.dense(pred_mel, self.hparams.Train2.hidden_units // 2)  # (N, T, hidden_units // 2)
pred_spec = cbhg(pred_spec, self.hparams.Train2.num_banks, self.hparams.Train2.hidden_units // 2, self.hparams.Train2.num_highway_blocks, self.hparams.Train2.norm_type, self.is_training, scope="cbhg2")
            pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1])  # log magnitude: (N, T, 1 + n_fft // 2)
return ppgs, preds_ppg, logits_ppg, pred_spec, pred_mel
def loss_net2(self):
loss_spec = tf.reduce_mean(tf.squared_difference(self.pred_spec, self.y_spec))
loss_mel = tf.reduce_mean(tf.squared_difference(self.pred_mel, self.y_mel))
loss = loss_spec + loss_mel
return loss
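
    # Training sketch (an assumption, not part of this file): in `train2` only the
    # net2 variables are typically optimized, with net1 restored and frozen, e.g.
    #   loss_op = model.loss_net2()
    #   net2_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net2')
    #   train_op = tf.train.AdamOptimizer(lr).minimize(loss_op, var_list=net2_vars)
    # `lr` is a hypothetical learning-rate hyperparameter.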
@staticmethod
def load(sess, mode, logdir, logdir2=None):
def print_model_loaded(mode, logdir):
model_name = Model.get_model_name(logdir)
            print('Model loaded. mode: {}, model_name: {}'.format(mode, model_name))
if mode in ['train1', 'test1']:
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net1')
if Model._load_variables(sess, logdir, var_list=var_list):
print_model_loaded(mode, logdir)
elif mode == 'train2':
var_list1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net1')
if Model._load_variables(sess, logdir, var_list=var_list1):
print_model_loaded(mode, logdir)
var_list2 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net2')
if Model._load_variables(sess, logdir2, var_list=var_list2):
print_model_loaded(mode, logdir2)
elif mode in ['test2', 'convert']:
if Model._load_variables(sess, logdir, var_list=None): # Load all variables
print_model_loaded(mode, logdir)
@staticmethod
def _load_variables(sess, logdir, var_list):
ckpt = tf.train.latest_checkpoint(logdir)
if ckpt:
tf.train.Saver(var_list=var_list).restore(sess, ckpt)
return True
else:
return False
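
    # Restore sketch (illustration only; the logdir paths are hypothetical):
    #   with tf.Session() as sess:
    #       sess.run(tf.global_variables_initializer())
    #       Model.load(sess, mode='train2', logdir='logdir/train1', logdir2='logdir/train2')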
@staticmethod
def get_model_name(logdir):
        path = os.path.join(logdir, 'checkpoint')
        if os.path.exists(path):
            with open(path, 'r') as f:
                ckpt_path = f.read().split('"')[1]
            _, model_name = os.path.split(ckpt_path)
        else:
            model_name = None
        return model_name
    @staticmethod
    def get_global_step(logdir):
        # Assumes checkpoint names embed the global step as the 4th '_'-separated field.
        model_name = Model.get_model_name(logdir)
        if model_name:
            gs = int(model_name.split('_')[3])
        else:
            gs = 0
        return gs
@staticmethod
def all_model_names(logdir):
        import glob
path = '{}/*.meta'.format(logdir)
model_names = [os.path.basename(f).replace('.meta', '') for f in glob.glob(path)]
return model_names
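

# Smoke-test sketch (not part of the original file; Python 3): builds the
# `convert` graph with a hypothetical hparams namespace whose field names mirror
# the attributes accessed above. The concrete values, and `norm_type='ins'`, are
# assumptions; `prenet`, `cbhg`, and `load_vocab` must be importable as above.
if __name__ == '__main__':
    from types import SimpleNamespace
    hp = SimpleNamespace(
        Default=SimpleNamespace(batch_size=1, n_mfcc=40, n_fft=512, n_mels=80),
        Train1=SimpleNamespace(hidden_units=256, dropout_rate=0.2, num_banks=8,
                               num_highway_blocks=4, norm_type='ins', t=1.0),
        Train2=SimpleNamespace(hidden_units=512, dropout_rate=0.2, num_banks=8,
                               num_highway_blocks=4, norm_type='ins'))
    model = Model('convert', hp)
    print(model().shape)  # pred_spec: (1, ?, 1 + n_fft // 2)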