Network Training for Speaker Verification/Spoofing Detection

I first noticed this problem when I worked as an intern at AISpeech.

When you have thousands of speakers in your training data, it becomes much harder to train a neural network to predict the right speaker label. The differences between syllables are so pronounced that they overwhelm the speaker characteristics, making it very hard to classify speakers who are saying different texts.

In speaker verification tasks this problem is much easier to solve. In my experience, an ASR network that predicts tri-phone labels works quite well for text-dependent speaker verification (I have no such experience with the text-independent case). At Interspeech 2015 I presented results from an LSTM network, which works amazingly well in the short-duration case.
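
As an illustration, a frame-sequence LSTM speaker classifier in Lasagne might look like the sketch below. This is not the actual Interspeech 2015 system; the hidden size and number of speakers are assumed values, and the last hidden state stands in for the utterance-level representation.

import lasagne
from lasagne.layers import InputLayer, LSTMLayer, SliceLayer, DenseLayer
from lasagne.nonlinearities import softmax

def build_lstm_verifier(input_var=None, feature_dim=48, hidden=256, num_speakers=1000):
    # (batch, time, feature) acoustic frame sequences; sizes are illustrative
    l = InputLayer(shape=(None, None, feature_dim), input_var=input_var)
    l = LSTMLayer(l, num_units=hidden)
    # keep only the last time step's hidden state as the utterance embedding
    l = SliceLayer(l, indices=-1, axis=1)
    # train against speaker identities; at test time the embedding is scored instead
    return DenseLayer(l, num_units=num_speakers, nonlinearity=softmax)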

For spoofing detection the problem becomes more complex, since it is much harder to obtain word-level labels such as senones. Recently I have been preparing for the challenge released by Idiap and working on new network structures for spoofing detection. Unfortunately, a basic DNN performed very poorly on the ASVspoof 2015 dataset. I still don't know the exact reason, but there must be some difference between TNet and my Lasagne implementation, such as the randomization process.
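
One suspect is the randomization: weight initialization and data shuffling are driven by different random number generators in the two toolkits. Below is a small sketch of how the random sources can be pinned on the Lasagne side to rule this out (the seed value is an arbitrary choice):

import numpy as np
import lasagne

SEED = 12345  # arbitrary seed, only for reproducibility across runs

np.random.seed(SEED)                                 # controls np.random.shuffle in the batch iterator
lasagne.random.set_rng(np.random.RandomState(SEED))  # controls Lasagne weight initialization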

Generally this is a sequence labeling problem, and from my perspective models such as BLSTMs should work well. I will be working on this problem for the next few weeks.
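
To make this concrete, here is a minimal BLSTM frame-labeling sketch in Lasagne; the hidden size is an assumption, and the six output classes match the OUTPUT_SIZE used in the code further down.

import lasagne
from lasagne.layers import (InputLayer, LSTMLayer, ConcatLayer,
                            ReshapeLayer, DenseLayer)
from lasagne.nonlinearities import softmax

def build_blstm(input_var=None, feature_dim=48, hidden=128, num_classes=6):
    # (batch, time, feature) input sequences
    l_in = InputLayer(shape=(None, None, feature_dim), input_var=input_var)
    # one LSTM running forward in time, one running backward
    l_fwd = LSTMLayer(l_in, num_units=hidden)
    l_bwd = LSTMLayer(l_in, num_units=hidden, backwards=True)
    l = ConcatLayer([l_fwd, l_bwd], axis=2)  # (batch, time, 2*hidden)
    # apply one softmax per frame by flattening time into the batch axis
    l = ReshapeLayer(l, (-1, 2 * hidden))
    return DenseLayer(l, num_units=num_classes, nonlinearity=softmax)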

BTW, I have read the slides by Christopher Moody. The idea of lda2vec is very interesting, and it reminds me of the co-reference project. Balancing local and global information is always a key trick for succeeding at NLP tasks.

Code (inspired by the Lasagne residual-network example for CIFAR-10):

#!/usr/bin/env python

"""
Lasagne implementation of CIFAR-10 examples from "Deep Residual Learning for Image Recognition" (http://arxiv.org/abs/1512.03385)

With n=5, i.e. 32-layer network from the paper, this achieves a validation error of 6.88% (vs 7.51% in the paper).
The accuracy has not yet been tested for the other values of n.
"""


from __future__ import print_function

import sys
import os
import time
import string
import random
import cPickle

import numpy as np
import theano
import theano.tensor as T
import lasagne
from htk import HTKFeat_read
from sklearn.preprocessing import StandardScaler
### config ###

FEATURE_DIM = 48     # dimensionality of each acoustic frame
FEATURE_EX = 15      # number of context frames spliced on each side
HIDDEN_SIZE = 1024   # units per hidden layer
OUTPUT_SIZE = 6      # number of target classes
#FILES_SHOW = 5000
data = None          # global frame matrix, filled by load_data()

def parse(fea_str, prefix):
    # parse an HTK-style scp entry of the form "name=file[start,end]"
    s = fea_str.split('=')
    t = s[1].split('[')
    u = t[1].split(',')
    v = u[1].split(']')
    return prefix + "_" + s[0], t[0], int(u[0]), int(v[0])

def load_data(scp, mlf, prefix="train"):
    global data
    # read the label file: one "utterance<TAB>label" entry per line
    tr = {}
    for i in open(mlf, "r"):
        j = i.rstrip().split('\t')
        tr[prefix + "_" + j[0]] = int(j[1])
    file_data = {}
    X = []
    Y = []
    for i in open(scp, "r"):
        name, file, start, end = parse(i, prefix)
        if file not in file_data:
            file_data[file] = HTKFeat_read(file).getall()
        # append the segment (plus context) to the global frame matrix
        if data is None:
            curr = 0
            data = file_data[file][start - FEATURE_EX:end + FEATURE_EX + 1]
        else:
            curr = data.shape[0]
            data = np.vstack((data, file_data[file][start - FEATURE_EX:end + FEATURE_EX + 1]))
        # remember the index of each frame and its label
        X.extend(range(curr, curr + end - start))
        Y.extend([tr[name] for i in range(end - start)])
    return X, Y

# ##################### Build the neural network model #######################

#from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import ElemwiseSumLayer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import PadLayer
from lasagne.layers import NonlinearityLayer
from lasagne.nonlinearities import softmax, rectify

# NB! from pull request #461 : https://github.com/f0k/Lasagne/blob/98b5581fa830cda3d3f838506ef14e5811a35ef7/lasagne/layers/normalization.py
from lasagne.layers import batch_norm

def build_dnn(input_var=None, n=5):
    def residual_block(l, increase_dim=0, projection=False):
        if increase_dim > 0:
            stack_1 = batch_norm(NonlinearityLayer(DenseLayer(l, num_units=increase_dim, W=lasagne.init.HeNormal(gain='relu')), nonlinearity=rectify))
            stack_2 = batch_norm(NonlinearityLayer(DenseLayer(stack_1, num_units=increase_dim, W=lasagne.init.HeNormal(gain='relu')), nonlinearity=rectify))
            if projection:
                projection = DenseLayer(l, num_units=increase_dim, W=lasagne.init.HeNormal(gain='relu'))
            else:
                # zero-pad the identity shortcut to match the wider stack
                projection = PadLayer(l, [tuple([0, increase_dim - l.output_shape[1]])], batch_ndim=1)
            block = NonlinearityLayer(batch_norm(ElemwiseSumLayer([projection, stack_2])), nonlinearity=rectify)
        else:
            stack_1 = batch_norm(NonlinearityLayer(DenseLayer(l, num_units=l.output_shape[1], W=lasagne.init.HeNormal(gain='relu')), nonlinearity=rectify))
            stack_2 = batch_norm(NonlinearityLayer(DenseLayer(stack_1, num_units=l.output_shape[1], W=lasagne.init.HeNormal(gain='relu')), nonlinearity=rectify))
            block = NonlinearityLayer(batch_norm(ElemwiseSumLayer([l, stack_2])), nonlinearity=rectify)
        return block

    l_in = InputLayer(shape=(None, FEATURE_DIM * (2 * FEATURE_EX + 1)), input_var=input_var)
    l = residual_block(l_in, HIDDEN_SIZE, True)
    for _ in xrange(n):
        l = residual_block(l)
    network = DenseLayer(l, num_units=OUTPUT_SIZE, nonlinearity=softmax)
    return network

def build_mlp(input_var=None, n=4):
    l = InputLayer(shape=(None, FEATURE_DIM * (2 * FEATURE_EX + 1)), input_var=input_var)
    for _ in xrange(n):
        l = NonlinearityLayer(DenseLayer(l, num_units=HIDDEN_SIZE, W=lasagne.init.HeNormal(gain='relu')), nonlinearity=rectify)
    network = DenseLayer(l, num_units=OUTPUT_SIZE, nonlinearity=softmax)
    return network
# ############################# Batch iterator ###############################

def iterate_minibatches(inputs, targets, batchsize, shuffle=False, augment=False):
    if shuffle:
        inds = range(len(inputs))
        np.random.shuffle(inds)
        inputs = [inputs[j] for j in inds]
        targets = [targets[j] for j in inds]
    curr = 0
    while True:
        X = None
        Y = []
        finished = False
        for r in xrange(batchsize):
            if len(inputs) == curr:
                finished = True
                break
            idx = curr
            curr += 1
            # splice the context window around frame inputs[idx] into one flat vector
            XX = data[inputs[idx]:inputs[idx] + 2 * FEATURE_EX + 1].ravel()
            if X is None:
                X = XX
            else:
                X = np.vstack((X, XX))
            Y.append(targets[idx])
        # guard against an empty final batch when len(inputs) % batchsize == 0
        if X is not None:
            yield lasagne.utils.floatX(X), np.array(Y).astype('int32')
        if finished:
            break

# ############################## Main program ################################

def main(n=5, num_epochs=50):
    global data
    # Load the dataset
    print("Loading data...")
    try:
        with open("spoof.pickle", "rb") as f:
            tmp = cPickle.load(f)
            if tmp != FEATURE_EX:
                print("Context window doesn't match.")
                raise ValueError('invalid window')
            data, X_train, Y_train, X_test, Y_test = cPickle.load(f)
    except:
        print("Regenerating data!")
        X_train, Y_train = load_data("spoof_train.scp", "mlf")
        X_test, Y_test = load_data("spoof_dev.scp", "mlf", "dev")
        with open("spoof.pickle", "wb") as f:
            cPickle.dump(FEATURE_EX, f)
            cPickle.dump([data, X_train, Y_train, X_test, Y_test], f)
    #data = StandardScaler().fit_transform(data)

    # Prepare Theano variables for inputs and targets
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_mlp(input_var, n)
    print("number of parameters in model: %d" % lasagne.layers.count_params(network))

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # add weight decay
    all_layers = lasagne.layers.get_all_layers(network)
    l2_penalty = lasagne.regularization.regularize_layer_params(
        all_layers, lasagne.regularization.l2) * 0.0001
    loss = loss + l2_penalty

    # Create update expressions for training (Adagrad here; the momentum SGD
    # schedule from the CIFAR-10 example is disabled)
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adagrad(loss, params)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                            target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, Y_train, 512, shuffle=True, augment=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # A second pass to measure training accuracy (its batch size differs
        # from the training pass, so it needs its own batch counter):
        train_acc = 0
        acc_batches = 0
        for batch in iterate_minibatches(X_train, Y_train, 500, shuffle=False):
            inputs, targets = batch
            _, acc = val_fn(inputs, targets)
            train_acc += acc
            acc_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  training accuracy:\t\t{:.2f} %".format(train_acc / acc_batches * 100))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(
        test_acc / test_batches * 100))

    # dump the network weights to a file:
    np.savez('spoof_deep_residual_model.npz',
             *lasagne.layers.get_all_param_values(network))
    # And load them again later on like this:
    # with np.load('spoof_deep_residual_model.npz') as f:
    #     param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    # lasagne.layers.set_all_param_values(network, param_values)


if __name__ == '__main__':
    if ('--help' in sys.argv) or ('-h' in sys.argv):
        print("Trains a residual/MLP network for spoofing detection using Lasagne.")
        print("The residual architecture follows section 4.2 of 'Deep Residual Learning for Image Recognition'.")
        print("Usage: %s [N [EPOCHS]]" % sys.argv[0])
        print()
        print("N: number of stacked residual building blocks (default: 5)")
        print("EPOCHS: number of training epochs to perform (default: 50)")
    else:
        kwargs = {}
        if len(sys.argv) > 1:
            kwargs['n'] = int(sys.argv[1])
        if len(sys.argv) > 2:
            # was sys.argv[3] in the original, an off-by-one mistake
            kwargs['num_epochs'] = int(sys.argv[2])
        main(**kwargs)

Update: I found the reason. For this task batch normalization should not be used; otherwise the accuracy is really low (<40%). In my opinion, batch normalization may not be a good idea for speech processing tasks. The original paper from Google also mentioned that it mainly improves performance when using the sigmoid activation function instead of the rectifier.
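
For reference, here is a sketch of the residual block from build_dnn with batch normalization simply dropped (an illustration of the change, not necessarily the exact configuration I ran; the DenseLayer's built-in rectifier is used directly instead of the NonlinearityLayer wrapper):

import lasagne
from lasagne.layers import DenseLayer, NonlinearityLayer, ElemwiseSumLayer
from lasagne.nonlinearities import rectify

def residual_block_no_bn(l):
    n_units = l.output_shape[1]
    stack_1 = DenseLayer(l, num_units=n_units,
                         W=lasagne.init.HeNormal(gain='relu'),
                         nonlinearity=rectify)
    stack_2 = DenseLayer(stack_1, num_units=n_units,
                         W=lasagne.init.HeNormal(gain='relu'),
                         nonlinearity=rectify)
    # identity shortcut plus the two-layer stack, no normalization anywhere
    return NonlinearityLayer(ElemwiseSumLayer([l, stack_2]),
                             nonlinearity=rectify)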