-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeaturizer.py
100 lines (84 loc) · 3.48 KB
/
featurizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import numpy as np
def compute_spectrogram_feature(samples, sample_rate, stride_ms=10.0,
                                window_ms=20.0, max_freq=None, eps=1e-14):
    """Compute the log power spectrogram of the input samples (waveform).

    The waveform is cut into overlapping frames, each frame is weighted with
    a Hann window, and the squared magnitude of its real FFT is scaled and
    log-compressed. More about spectrogram computation, please refer to:
    https://en.wikipedia.org/wiki/Short-time_Fourier_transform.

    Args:
      samples: array-like waveform; expected to be 1-D (only the first axis
        is framed).
      sample_rate: sample rate of the waveform, in Hz.
      stride_ms: hop between the starts of consecutive frames, in ms.
      window_ms: length of each frame, in ms.
      max_freq: highest frequency (Hz) to keep. Defaults to half the sample
        rate.
      eps: small constant added before the logarithm to avoid log(0).

    Returns:
      A 2-D array of shape (num_frames, num_freq_bins) holding the log of
      the scaled power in each kept frequency bin of each frame.

    Raises:
      ValueError: if max_freq exceeds half the sample rate, if the stride is
        longer than the window, if the stride spans less than one sample,
        or if the waveform is shorter than one window.
    """
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        raise ValueError(
            "max_freq must not be greater than half of sample rate.")
    if stride_ms > window_ms:
        raise ValueError("Stride size must not be greater than window size.")

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)
    if stride_size < 1:
        # A zero stride would otherwise surface as a ZeroDivisionError below.
        raise ValueError(
            "stride_ms is too small: the stride must span at least one "
            "sample at the given sample rate.")

    # Accept any array-like; the framing below needs ndarray strides.
    samples = np.asarray(samples)
    if len(samples) < window_size:
        raise ValueError(
            "Input has %d samples but one window needs %d." %
            (len(samples), window_size))

    # Extract strided windows: drop the tail that does not fill a full
    # stride, then view the buffer as (window_size, num_frames) without
    # copying.
    truncate_size = (len(samples) - window_size) % stride_size
    samples = samples[:len(samples) - truncate_size]
    num_frames = (len(samples) - window_size) // stride_size + 1
    item_stride = samples.strides[0]
    windows = np.lib.stride_tricks.as_strided(
        samples,
        shape=(window_size, num_frames),
        strides=(item_stride, item_stride * stride_size),
        # Frames overlap in memory, so the view must never be written to.
        writeable=False)
    if num_frames > 1:
        # Sanity check of the framing; only possible when a second frame
        # exists (a single-frame input is valid).
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    fft = np.fft.rfft(windows * weighting, axis=0)
    fft = np.absolute(fft)
    fft = fft**2
    scale = np.sum(weighting**2) * sample_rate
    # Interior bins are doubled relative to the first and last bin.
    fft[1:-1, :] *= (2.0 / scale)
    fft[(0, -1), :] /= scale

    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

    # Compute spectrogram feature: keep bins up to max_freq, log-compress,
    # and put frames on the first axis.
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    return np.transpose(specgram, (1, 0))
class AudioFeaturizer(object):
    """Holds the settings used to extract spectrogram features from audio."""

    def __init__(self, sample_rate=16000, window_ms=20.0, stride_ms=10.0):
        """Store the spectrogram configuration on the instance.

        Args:
          sample_rate: sample rate of the input waveform.
          window_ms: length of a spectrogram frame, in ms.
          stride_ms: frame stride, in ms.
        """
        # Framing parameters first, then the rate they are interpreted at.
        self.stride_ms = stride_ms
        self.window_ms = window_ms
        self.sample_rate = sample_rate
def compute_label_feature(text, token_to_idx):
    """Map each character of a transcript to its integer index.

    The text is stripped of surrounding whitespace and lower-cased before
    every remaining character is looked up in `token_to_idx`.

    Args:
      text: the transcript string.
      token_to_idx: mapping from a single-character token to its index.

    Returns:
      A list of integer indexes, one per character of the normalized text.
    """
    normalized = text.strip().lower()
    return [token_to_idx[char] for char in normalized]
class TextFeaturizer(object):
    """Extract text feature based on char-level granularity.

    Loads a vocabulary table whose mappings are meant for converting an
    input string (one line of transcript) into a sequence of integer
    indexes.

    Attributes:
      token_to_index: dict mapping each vocabulary token to its index.
      index_to_token: dict mapping each index back to its token.
      speech_labels: all vocabulary tokens concatenated in index order.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary table from a UTF-8 text file.

        Each line of the file is one token; indexes are assigned in file
        order starting from 0. Lines starting with "#" are comments and do
        not consume an index.

        Args:
          vocab_file: path to the vocabulary file.
        """
        with codecs.open(vocab_file, "r", "utf-8") as fin:
            # splitlines() removes only the line terminator, so a final line
            # without a trailing newline keeps its last character and CRLF
            # files do not leave a stray "\r" inside the token.
            lines = fin.read().splitlines()

        # Skip from reading comment lines.
        tokens = [line for line in lines if not line.startswith("#")]

        self.token_to_index = {
            token: index for index, token in enumerate(tokens)}
        self.index_to_token = dict(enumerate(tokens))
        self.speech_labels = "".join(tokens)