speaker_extraction/extract_feats_test.py at master · xuchenglin28/speaker_extraction

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# Updated by Chenglin, Dec 2018

"""

1. Extract features (magnitude, log magnitude)

2. Converts to TFRecords format

3. Global CMVN (same as kaldi) is NOT calculated by this test-time script; only steps 1 and 2 are performed here.

"""

from __future__ import absolute_import

from __future__ import division

from __future__ import print_function

import argparse

import multiprocessing

import os,sys

import numpy as np

import tensorflow as tf

from utils.audioread import audioread

from utils.sigproc import framesig,magspec

from utils.normhamming import normhamming

import time

def make_sequence(feats, feats_aux):
    """Pack mixture and auxiliary features into one SequenceExample.

    No labels are stored; this is the test-time variant.

    Args:
        feats: sequence of input feature vectors (i.e. magnitude of the
            mixture speech). Each element becomes one float-list Feature.
        feats_aux: sequence of feature vectors fed to the auxiliary network
            that learns the target speaker representation. Each element
            becomes one float-list Feature.

    Returns:
        A tf.train.SequenceExample whose feature lists are stored under the
        keys 'inputs' and 'inputs_aux'.
    """
    def _as_feature_list(vectors):
        # One tf.train.Feature per vector, kept in the original order.
        return tf.train.FeatureList(feature=[
            tf.train.Feature(float_list=tf.train.FloatList(value=vector))
            for vector in vectors
        ])

    sequence_features = tf.train.FeatureLists(feature_list={
        'inputs': _as_feature_list(feats),
        'inputs_aux': _as_feature_list(feats_aux),
    })
    return tf.train.SequenceExample(feature_lists=sequence_features)

def cal_phase_mag(filename):
    """Extract phase and features for one utterance.

    Args:
        filename: path of the audio file, read through ``audioread``.

    Returns:
        The ``(phase, feats)`` pair obtained from ``magspec`` applied to the
        framed signal with an FFT length of ``FLAGS.FFT_LEN``.
    """
    # audioread yields three values; only the second one (the signal) is used.
    _, signal, _ = audioread(filename)

    # Frame the signal using FLAGS.FFT_LEN / FLAGS.FRAME_SHIFT and the
    # normhamming window function.
    windowed_frames = framesig(
        signal, FLAGS.FFT_LEN, FLAGS.FRAME_SHIFT, normhamming, True)

    phase, feats = magspec(windowed_frames, FLAGS.FFT_LEN)
    return phase, feats

def extract_mag_feats(item):
    """Extract features for one list entry and store them as a TFRecords file.

    Args:
        item: one line of the list file. It is split on whitespace; the first
            token is the path of the mixed (or noisy) utterance and the second
            token is the path of the auxiliary utterance. Any further tokens
            are ignored.

    Raises:
        IndexError: if ``item`` holds fewer than two whitespace-separated
            tokens.

    Side effects:
        Writes ``<FLAGS.output_dir>/<FLAGS.data_type>/<uttid>.tfrecords``,
        where ``uttid`` is the mixture file name without its extension.
    """
    tokens = item.strip().split()
    mix_path, aux_path = tokens[0], tokens[1]

    # The utterance id is the mixture file name stripped of its extension.
    uttid, _ = os.path.splitext(os.path.basename(mix_path))

    # Extract feats for the mixture; the phase is not needed here.
    _, feats = cal_phase_mag(mix_path)

    # Extract auxiliary feats for the auxiliary network.
    _, feats_aux = cal_phase_mag(aux_path)

    # tfrecords file holding the sequence of feats (no labels at test time).
    tfrecords_name = os.path.join(
        FLAGS.output_dir, FLAGS.data_type, uttid + ".tfrecords")

    # Use the writer as a context manager so the file is always flushed and
    # closed, even if serialization fails; the original never closed it.
    with tf.python_io.TFRecordWriter(tfrecords_name) as writer:
        writer.write(make_sequence(feats, feats_aux).SerializeToString())

def main(unused_argv):
    """Convert every entry of the list file to a TFRecords file in parallel.

    Reads ``FLAGS.list_path``, creates ``<FLAGS.output_dir>/<FLAGS.data_type>``
    if it does not exist, and dispatches ``extract_mag_feats`` for each
    non-blank line to a pool of ``FLAGS.num_threads`` worker processes.

    Args:
        unused_argv: remaining command-line arguments passed by ``tf.app.run``;
            not used.

    Raises:
        Any exception raised by ``extract_mag_feats`` in a worker is re-raised
        here once all workers have finished.
    """
    print('Extract starts ...')
    print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))

    target_dir = os.path.join(FLAGS.output_dir, FLAGS.data_type)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Close the list file deterministically and ignore blank lines, which
    # would otherwise fail with an IndexError inside extract_mag_feats.
    with open(FLAGS.list_path) as list_file:
        lists = [line for line in list_file if line.strip()]

    # NOTE(review): workers read the module-level FLAGS. With the 'fork' start
    # method they inherit it from the parent; under 'spawn' FLAGS (set in the
    # __main__ guard) would presumably be undefined in workers -- confirm the
    # target platform.
    pool = multiprocessing.Pool(FLAGS.num_threads)

    # Pass the callable and its argument tuple separately. The original wrote
    # apply_async(extract_mag_feats(item)), which ran the extraction serially
    # in the parent process and submitted its None return value as the task.
    workers = [pool.apply_async(extract_mag_feats, (item,)) for item in lists]

    pool.close()
    pool.join()

    # Surface worker failures instead of dropping them silently.
    for worker in workers:
        worker.get()

    print(time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()))
    print('Extract ends.')

if __name__ == '__main__':
    # Command-line options as (flag, type, default, help) rows, registered in
    # this order. The parsed values are exposed through the module-level FLAGS.
    option_table = [
        ('--data_type', str, 'tr',
         'tr, cv, tt.'),
        ('--list_path', str, 'lists/rm4_tr.lst',
         'List of the paired mix, aux, clean data'),
        ('--output_dir', str, 'data/tfrecords',
         'Directory to save the features into tfrecords format'),
        ('--FFT_LEN', int, 512,
         'The length of fft window.'),
        ('--FRAME_SHIFT', int, 256,
         'The shift of samples when calculating fft.'),
        ('--num_threads', int, 10,
         'The number of threads to convert tfrecords files.'),
    ]

    parser = argparse.ArgumentParser()
    for flag_name, value_type, default_value, help_text in option_table:
        parser.add_argument(
            flag_name,
            type=value_type,
            default=default_value,
            help=help_text)

    # Unrecognized arguments are forwarded to tf.app.run together with the
    # program name.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

extract_feats_test.py

extract_feats_test.py

Files

extract_feats_test.py

Latest commit

History

extract_feats_test.py

File metadata and controls