Module: TorchAudio::Functional

Defined in:
lib/torchaudio/functional.rb

Class Method Summary collapse

Class Method Details

.amplitude_to_DB(amp, multiplier, amin, db_multiplier, top_db: nil) ⇒ Object



242
243
244
245
246
247
248
249
# File 'lib/torchaudio/functional.rb', line 242

def amplitude_to_DB(amp, multiplier, amin, db_multiplier, top_db: nil)
  db = Torch.log10(Torch.clamp(amp, min: amin)) * multiplier
  db -= multiplier * db_multiplier

  db = db.clamp(min: db.max.item - top_db) if top_db

  db
end

.biquad(waveform, b0, b1, b2, a0, a1, a2) ⇒ Object



148
149
150
151
152
153
154
155
156
157
158
# File 'lib/torchaudio/functional.rb', line 148

def biquad(waveform, b0, b1, b2, a0, a1, a2)
  device = waveform.device
  dtype = waveform.dtype

  output_waveform = lfilter(
      waveform,
      Torch.tensor([a0, a1, a2], dtype: dtype, device: device),
      Torch.tensor([b0, b1, b2], dtype: dtype, device: device)
  )
  output_waveform
end

.complex_norm(complex_tensor, power: 1.0) ⇒ Object



66
67
68
# File 'lib/torchaudio/functional.rb', line 66

def complex_norm(complex_tensor, power: 1.0)
  complex_tensor.pow(2.0).sum(-1).pow(0.5 * power)
end

.compute_deltas(specgram, win_length: 5, mode: "replicate") ⇒ Object

Raises:

  • (ArgumentError)


104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/torchaudio/functional.rb', line 104

def compute_deltas(specgram, win_length: 5, mode: "replicate")
  device = specgram.device
  dtype = specgram.dtype

  # pack batch
  shape = specgram.size
  specgram = specgram.reshape(1, -1, shape[-1])

  raise ArgumentError, "win_length must be >= 3" unless win_length >= 3

  n = (win_length - 1).div(2)

  # twice sum of integer squared
  denom = n * (n + 1) * (2 * n + 1) / 3

  specgram = Torch::NN::Functional.pad(specgram, [n, n], mode: mode)

  kernel = Torch.arange(-n, n + 1, 1, device: device, dtype: dtype).repeat([specgram.shape[1], 1, 1])

  output = Torch::NN::Functional.conv1d(specgram, kernel, groups: specgram.shape[1]) / denom

  # unpack batch
  output = output.reshape(shape)
end

.create_dct(n_mfcc, n_mels, norm: nil) ⇒ Object



255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# File 'lib/torchaudio/functional.rb', line 255

def create_dct(n_mfcc, n_mels, norm: nil)
  n = Torch.arange(n_mels.to_f)
  k = Torch.arange(n_mfcc.to_f).unsqueeze!(1)
  dct = Torch.cos((n + 0.5) * k * Math::PI / n_mels.to_f)

  if norm.nil?
    dct *= 2.0
  else
    raise ArgumentError, "Invalid DCT norm value" unless norm == :ortho

    dct[0] *= 1.0 / Math.sqrt(2.0)
    dct *= Math.sqrt(2.0 / n_mels)
  end

  dct.t
end

.create_fb_matrix(n_freqs, f_min, f_max, n_mels, sample_rate, norm: nil) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/torchaudio/functional.rb', line 70

def create_fb_matrix(n_freqs, f_min, f_max, n_mels, sample_rate, norm: nil)
  if norm && norm != "slaney"
    raise ArgumentError, "norm must be one of None or 'slaney'"
  end

  # freq bins
  # Equivalent filterbank construction by Librosa
  all_freqs = Torch.linspace(0, sample_rate.div(2), n_freqs)

  # calculate mel freq bins
  # hertz to mel(f) is 2595. * math.log10(1. + (f / 700.))
  m_min = 2595.0 * Math.log10(1.0 + (f_min / 700.0))
  m_max = 2595.0 * Math.log10(1.0 + (f_max / 700.0))
  m_pts = Torch.linspace(m_min, m_max, n_mels + 2)
  # mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.)
  f_pts = (Torch.pow(10, m_pts / 2595.0) - 1.0) * 700.0
  # calculate the difference between each mel point and each stft freq point in hertz
  f_diff = f_pts[1..-1] - f_pts[0...-1]  # (n_mels + 1)
  slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1)  # (n_freqs, n_mels + 2)
  # create overlapping triangles
  zero = Torch.zeros(1)
  down_slopes = (slopes[0..-1, 0...-2] * -1.0) / f_diff[0...-1]  # (n_freqs, n_mels)
  up_slopes = slopes[0..-1, 2..-1] / f_diff[1..-1]  # (n_freqs, n_mels)
  fb = Torch.max(zero, Torch.min(down_slopes, up_slopes))

  if norm && norm == "slaney"
    # Slaney-style mel is scaled to be approx constant energy per channel
    enorm = 2.0 / (f_pts[2...(n_mels + 2)] - f_pts[:n_mels])
    fb *= enorm.unsqueeze(0)
  end

  fb
end

.DB_to_amplitude(db, ref, power) ⇒ Object



251
252
253
# File 'lib/torchaudio/functional.rb', line 251

def DB_to_amplitude(db, ref, power)
  Torch.pow(Torch.pow(10.0, db * 0.1), power) * ref
end

.dither(waveform, density_function: "TPDF", noise_shaping: false) ⇒ Object



137
138
139
140
141
142
143
144
145
146
# File 'lib/torchaudio/functional.rb', line 137

def dither(waveform, density_function: "TPDF", noise_shaping: false)
  dithered = _apply_probability_distribution(waveform, density_function: density_function)

  if noise_shaping
    raise "Not implemented yet"
    # _add_noise_shaping(dithered, waveform)
  else
    dithered
  end
end

.gain(waveform, gain_db: 1.0) ⇒ Object



129
130
131
132
133
134
135
# File 'lib/torchaudio/functional.rb', line 129

def gain(waveform, gain_db: 1.0)
  return waveform if gain_db == 0

  ratio = 10 ** (gain_db / 20)

  waveform * ratio
end

.highpass_biquad(waveform, sample_rate, cutoff_freq, q: 0.707) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/torchaudio/functional.rb', line 160

def highpass_biquad(waveform, sample_rate, cutoff_freq, q: 0.707)
  w0 = 2 * Math::PI * cutoff_freq / sample_rate
  alpha = Math.sin(w0) / 2.0 / q

  b0 = (1 + Math.cos(w0)) / 2
  b1 = -1 - Math.cos(w0)
  b2 = b0
  a0 = 1 + alpha
  a1 = -2 * Math.cos(w0)
  a2 = 1 - alpha
  biquad(waveform, b0, b1, b2, a0, a1, a2)
end

.lfilter(waveform, a_coeffs, b_coeffs, clamp: true) ⇒ Object

Raises:

  • (ArgumentError)


186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/torchaudio/functional.rb', line 186

def lfilter(waveform, a_coeffs, b_coeffs, clamp: true)
  # pack batch
  shape = waveform.size
  waveform = waveform.reshape(-1, shape[-1])

  raise ArgumentError unless a_coeffs.size(0) == b_coeffs.size(0)
  raise ArgumentError unless waveform.size.length == 2
  raise ArgumentError unless waveform.device == a_coeffs.device
  raise ArgumentError unless b_coeffs.device == a_coeffs.device

  device = waveform.device
  dtype = waveform.dtype
  n_channel, n_sample = waveform.size
  n_order = a_coeffs.size(0)
  n_sample_padded = n_sample + n_order - 1
  raise ArgumentError unless n_order > 0

  # Pad the input and create output
  padded_waveform = Torch.zeros(n_channel, n_sample_padded, dtype: dtype, device: device)
  padded_waveform[0..-1, (n_order - 1)..-1] = waveform
  padded_output_waveform = Torch.zeros(n_channel, n_sample_padded, dtype: dtype, device: device)

  # Set up the coefficients matrix
  # Flip coefficients' order
  a_coeffs_flipped = a_coeffs.flip([0])
  b_coeffs_flipped = b_coeffs.flip([0])

  # calculate windowed_input_signal in parallel
  # create indices of original with shape (n_channel, n_order, n_sample)
  window_idxs = Torch.arange(n_sample, device: device).unsqueeze(0) + Torch.arange(n_order, device: device).unsqueeze(1)
  window_idxs = window_idxs.repeat([n_channel, 1, 1])
  window_idxs += (Torch.arange(n_channel, device: device).unsqueeze(-1).unsqueeze(-1) * n_sample_padded)
  window_idxs = window_idxs.long
  # (n_order, ) matmul (n_channel, n_order, n_sample) -> (n_channel, n_sample)
  input_signal_windows = Torch.matmul(b_coeffs_flipped, Torch.take(padded_waveform, window_idxs))

  input_signal_windows.div!(a_coeffs[0])
  a_coeffs_flipped.div!(a_coeffs[0])
  input_signal_windows.t.each_with_index do |o0, i_sample|
    windowed_output_signal = padded_output_waveform[0..-1, i_sample...(i_sample + n_order)]
    o0.addmv!(windowed_output_signal, a_coeffs_flipped, alpha: -1)
    padded_output_waveform[0..-1, i_sample + n_order - 1] = o0
  end

  output = padded_output_waveform[0..-1, (n_order - 1)..-1]

  if clamp
    output = Torch.clamp(output, -1.0, 1.0)
  end

  # unpack batch
  output = output.reshape(shape[0...-1] + output.shape[-1..-1])

  output
end

.lowpass_biquad(waveform, sample_rate, cutoff_freq, q: 0.707) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/torchaudio/functional.rb', line 173

def lowpass_biquad(waveform, sample_rate, cutoff_freq, q: 0.707)
  w0 = 2 * Math::PI * cutoff_freq / sample_rate
  alpha = Math.sin(w0) / 2 / q

  b0 = (1 - Math.cos(w0)) / 2
  b1 = 1 - Math.cos(w0)
  b2 = b0
  a0 = 1 + alpha
  a1 = -2 * Math.cos(w0)
  a2 = 1 - alpha
  biquad(waveform, b0, b1, b2, a0, a1, a2)
end

.mu_law_decoding(x_mu, quantization_channels) ⇒ Object



55
56
57
58
59
60
61
62
63
64
# File 'lib/torchaudio/functional.rb', line 55

def mu_law_decoding(x_mu, quantization_channels)
  mu = quantization_channels - 1.0
  if !x_mu.floating_point?
    x_mu = x_mu.to(dtype: :float)
  end
  mu = Torch.tensor(mu, dtype: x_mu.dtype)
  x = ((x_mu) / mu) * 2 - 1.0
  x = Torch.sign(x) * (Torch.exp(Torch.abs(x) * Torch.log1p(mu)) - 1.0) / mu
  x
end

.mu_law_encoding(x, quantization_channels) ⇒ Object



44
45
46
47
48
49
50
51
52
53
# File 'lib/torchaudio/functional.rb', line 44

def mu_law_encoding(x, quantization_channels)
  mu = quantization_channels - 1.0
  if !x.floating_point?
    x = x.to(dtype: :float)
  end
  mu = Torch.tensor(mu, dtype: x.dtype)
  x_mu = Torch.sign(x) * Torch.log1p(mu * Torch.abs(x)) / Torch.log1p(mu)
  x_mu = ((x_mu + 1) / 2 * mu + 0.5).to(dtype: :int64)
  x_mu
end

.spectrogram(waveform, pad, window, n_fft, hop_length, win_length, power, normalized, center: true, pad_mode: "reflect", onesided: true) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/torchaudio/functional.rb', line 4

def spectrogram(waveform, pad, window, n_fft, hop_length, win_length, power, normalized, center: true, pad_mode: "reflect", onesided: true)
  if pad > 0
    # TODO add "with torch.no_grad():" back when JIT supports it
    waveform = Torch::NN::Functional.pad(waveform, [pad, pad], "constant")
  end

  # pack batch
  shape = waveform.size
  waveform = waveform.reshape(-1, shape[-1])

  # default values are consistent with librosa.core.spectrum._spectrogram
  spec_f =
    Torch.stft(
      waveform,
      n_fft,
      hop_length: hop_length,
      win_length: win_length,
      window: window,
      center: center,
      pad_mode: pad_mode,
      normalized: false,
      onesided: onesided,
      return_complex: true
    )

  # unpack batch
  spec_f = spec_f.reshape(shape[0..-2] + spec_f.shape[-2..-1])

  if normalized
    spec_f /= window.pow(2.0).sum.sqrt
  end
  if !power.nil?
    if power == 1
      return spec_f.abs
    end
    return spec_f.abs.pow(power)
  end
  spec_f
end