Caution
You're reading an old version of this documentation. If you want up-to-date information, please have a look at 0.10.2.
Source code for librosa.core.constantq
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Constant-Q transforms"""
from __future__ import division
import warnings
import numpy as np
from numba import jit
from . import audio
from .fft import get_fftlib
from .convert import cqt_frequencies, note_to_hz
from .spectrum import stft, istft
from .pitch import estimate_tuning
from .._cache import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]
# TODO: ivqt, griffinlim_vqt
[docs]@cache(level=20)
def cqt(
y,
sr=22050,
hop_length=512,
fmin=None,
n_bins=84,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
pad_mode="reflect",
res_type=None,
dtype=None,
):
"""Compute the constant-Q transform of an audio signal.
This implementation is based on the recursive sub-sampling method
described by [#]_.
.. [#] Schoerkhuber, Christian, and Anssi Klapuri.
"Constant-Q transform toolbox for music processing."
7th Sound and Music Computing Conference, Barcelona, Spain. 2010.
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string [optional]
The resampling mode for recursive downsampling.
By default, `cqt` will adaptively select a resampling mode
which trades off accuracy at high frequencies for efficiency at low frequencies.
You can override this by specifying a resampling mode as supported by
`librosa.resample`. For example, ``res_type='fft'`` will use a high-quality,
but potentially slow FFT-based down-sampling, while ``res_type='polyphase'`` will
use a fast, but potentially inaccurate down-sampling.
dtype : np.dtype
The (complex) data type of the output array. By default, this is inferred to match
the numerical precision of the input signal.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t)]
Constant-Q value each frequency at each time.
Raises
------
ParameterError
If ``hop_length`` is not an integer multiple of
``2**(n_bins / bins_per_octave)``
Or if ``y`` is too short to support the frequency range of the CQT.
See Also
--------
vqt
librosa.resample
librosa.util.normalize
Notes
-----
This function caches at level 20.
Examples
--------
Generate and plot a constant-Q power spectrum
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> C = np.abs(librosa.cqt(y, sr=sr))
>>> fig, ax = plt.subplots()
>>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
>>> ax.set_title('Constant-Q power spectrum')
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
Limit the frequency range
>>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60))
>>> C
array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
[5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
...,
[4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
[2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])
Using a higher frequency resolution
>>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60 * 2, bins_per_octave=12 * 2))
>>> C
array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
[4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
...,
[2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
[5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
"""
# CQT is the special case of VQT with gamma=0
return vqt(
y=y,
sr=sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
gamma=0,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
res_type=res_type,
dtype=dtype,
)
[docs]@cache(level=20)
def hybrid_cqt(
y,
sr=22050,
hop_length=512,
fmin=None,
n_bins=84,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
pad_mode="reflect",
res_type=None,
dtype=None,
):
"""Compute the hybrid constant-Q transform of an audio signal.
Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
the hop_length is longer than half the filter length and the full CQT
for lower frequencies.
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
Resampling mode. See `librosa.cqt` for details.
dtype : np.dtype, optional
The complex dtype to use for computing the CQT.
By default, this is inferred to match the precision of
the input signal.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
Constant-Q energy for each frequency at each time.
Raises
------
ParameterError
If ``hop_length`` is not an integer multiple of
``2**(n_bins / bins_per_octave)``
Or if ``y`` is too short to support the frequency range of the CQT.
See Also
--------
cqt
pseudo_cqt
Notes
-----
This function caches at level 20.
"""
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# Get all CQT frequencies
freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)
# Compute the length of each constant-Q basis function
lengths = filters.constant_q_lengths(
sr,
fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
window=window,
)
# Determine which filters to use with Pseudo CQT
# These are the ones that fit within 2 hop lengths after padding
pseudo_filters = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
n_bins_pseudo = int(np.sum(pseudo_filters))
n_bins_full = n_bins - n_bins_pseudo
cqt_resp = []
if n_bins_pseudo > 0:
fmin_pseudo = np.min(freqs[pseudo_filters])
cqt_resp.append(
pseudo_cqt(
y,
sr,
hop_length=hop_length,
fmin=fmin_pseudo,
n_bins=n_bins_pseudo,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
dtype=dtype,
)
)
if n_bins_full > 0:
cqt_resp.append(
np.abs(
cqt(
y,
sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins_full,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
res_type=res_type,
dtype=dtype,
)
)
)
# Propagate dtype from the last component
return __trim_stack(cqt_resp, n_bins, cqt_resp[-1].dtype)
[docs]@cache(level=20)
def pseudo_cqt(
y,
sr=22050,
hop_length=512,
fmin=None,
n_bins=84,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
pad_mode="reflect",
dtype=None,
):
"""Compute the pseudo constant-Q transform of an audio signal.
This uses a single fft size that is the smallest power of 2 that is greater
than or equal to the max of:
1. The longest CQT filter
2. 2x the hop_length
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
dtype : np.dtype, optional
The complex data type for CQT calculations.
By default, this is inferred to match the precision of the input signal.
Returns
-------
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
Pseudo Constant-Q energy for each frequency at each time.
Raises
------
ParameterError
If ``hop_length`` is not an integer multiple of
``2**(n_bins / bins_per_octave)``
Or if ``y`` is too short to support the frequency range of the CQT.
Notes
-----
This function caches at level 20.
"""
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
if dtype is None:
dtype = util.dtype_r2c(y.dtype)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
fft_basis, n_fft, _ = __cqt_filter_fft(
sr,
fmin,
n_bins,
bins_per_octave,
filter_scale,
norm,
sparsity,
hop_length=hop_length,
window=window,
dtype=dtype,
)
fft_basis = np.abs(fft_basis)
# Compute the magnitude STFT with Hann window
D = np.abs(
stft(y, n_fft=n_fft, hop_length=hop_length, pad_mode=pad_mode, dtype=dtype)
)
# Project onto the pseudo-cqt basis
C = fft_basis.dot(D)
if scale:
C /= np.sqrt(n_fft)
else:
lengths = filters.constant_q_lengths(
sr,
fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
window=window,
filter_scale=filter_scale,
)
C *= np.sqrt(lengths[:, np.newaxis] / n_fft)
return C
[docs]@cache(level=40)
def icqt(
C,
sr=22050,
hop_length=512,
fmin=None,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
length=None,
res_type="fft",
dtype=None,
):
"""Compute the inverse constant-Q transform.
Given a constant-Q transform representation ``C`` of an audio signal ``y``,
this function produces an approximation ``y_hat``.
Parameters
----------
C : np.ndarray, [shape=(n_bins, n_frames)]
Constant-Q representation as produced by `cqt`
hop_length : int > 0 [scalar]
number of samples between successive frames
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
tuning : float [scalar]
Tuning offset in fractions of a bin.
The minimum frequency of the CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0 [scalar]
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length
of each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to ``norm=None``
in FFT.
length : int > 0, optional
If provided, the output ``y`` is zero-padded or clipped to exactly
``length`` samples.
res_type : string
Resampling mode. By default, this uses ``'fft'`` mode for high-quality
reconstruction, but this may be slow depending on your signal duration.
See `librosa.resample` for supported modes.
dtype : numeric type
Real numeric type for ``y``. Default is inferred to match the numerical
precision of the input CQT.
Returns
-------
y : np.ndarray, [shape=(n_samples), dtype=np.float]
Audio time-series reconstructed from the CQT representation.
See Also
--------
cqt
librosa.resample
Notes
-----
This function caches at level 40.
Examples
--------
Using default parameters
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> C = librosa.cqt(y=y, sr=sr)
>>> y_hat = librosa.icqt(C=C, sr=sr)
Or with a different hop length and frequency resolution:
>>> hop_length = 256
>>> bins_per_octave = 12 * 3
>>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
... bins_per_octave=bins_per_octave)
>>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
... bins_per_octave=bins_per_octave)
"""
if fmin is None:
fmin = note_to_hz("C1")
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# Get the top octave of frequencies
n_bins = len(C)
freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[
-bins_per_octave:
]
n_filters = min(n_bins, bins_per_octave)
fft_basis, n_fft, lengths = __cqt_filter_fft(
sr,
np.min(freqs),
n_filters,
bins_per_octave,
filter_scale,
norm,
sparsity=sparsity,
window=window,
)
if hop_length > min(lengths):
warnings.warn(
"hop_length={} exceeds minimum CQT filter length={:.3f}.\n"
"This will probably cause unpleasant acoustic artifacts. "
"Consider decreasing your hop length or increasing the "
"frequency resolution of your CQT.".format(hop_length, min(lengths))
)
if length is not None:
n_frames = int(np.ceil((length + max(lengths)) / hop_length))
C = C[:, :n_frames]
# The basis gets renormalized by the effective window length above;
# This step undoes that
fft_basis = fft_basis.todense() * n_fft / lengths[:, np.newaxis]
# This step conjugate-transposes the filter
inv_basis = fft_basis.H
# How many octaves do we have?
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
y = None
for octave in range(n_octaves - 1, -1, -1):
slice_ = slice(
-(octave + 1) * bins_per_octave - 1, -(octave) * bins_per_octave - 1
)
# Slice this octave
C_oct = C[slice_]
inv_oct = inv_basis[:, -C_oct.shape[0] :]
oct_hop = hop_length // 2 ** octave
# Apply energy corrections
if scale:
C_scale = np.sqrt(lengths[-C_oct.shape[0] :, np.newaxis]) / n_fft
else:
C_scale = (
lengths[-C_oct.shape[0] :, np.newaxis] * np.sqrt(2 ** octave) / n_fft
)
# Inverse-project the basis for each octave
D_oct = inv_oct.dot(C_oct / C_scale)
# Inverse-STFT that response
y_oct = istft(D_oct, window="ones", hop_length=oct_hop, dtype=dtype)
# Up-sample that octave
if y is None:
y = y_oct
else:
# Up-sample the previous buffer and add in the new one
# Scipy-resampling is fast here, since it's a power-of-two relation
y = audio.resample(y, 1, 2, scale=True, res_type=res_type, fix=False)
y[: len(y_oct)] += y_oct
if length:
y = util.fix_length(y, length)
return y
[docs]@cache(level=20)
def vqt(
y,
sr=22050,
hop_length=512,
fmin=None,
n_bins=84,
gamma=None,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
pad_mode="reflect",
res_type=None,
dtype=None,
):
"""Compute the variable-Q transform of an audio signal.
This implementation is based on the recursive sub-sampling method
described by [#]_.
.. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
"A Matlab toolbox for efficient perfect reconstruction time-frequency
transforms with log-frequency resolution."
In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
Audio Engineering Society, 2014.
Parameters
----------
y : np.ndarray [shape=(n,)]
audio time series
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive VQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
gamma : number > 0 [scalar]
Bandwidth offset for determining filter lengths.
If ``gamma=0``, produces the constant-Q transform.
If 'gamma=None', gamma will be calculated such that filter bandwidths are equal to a
constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished
by solving for the gamma which gives::
B_k = alpha * f_k + gamma = C * ERB(f_k),
where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha
is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the
constant fraction across all filters.
Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived
from experimental data in [#]_.
.. [#] Glasberg, Brian R., and Brian CJ Moore.
"Derivation of auditory filter shapes from notched-noise data."
Hearing research 47.1-2 (1990): 103-138.
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting VQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the VQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the VQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the VQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string [optional]
The resampling mode for recursive downsampling.
By default, `vqt` will adaptively select a resampling mode
which trades off accuracy at high frequencies for efficiency at low frequencies.
You can override this by specifying a resampling mode as supported by
`librosa.resample`. For example, ``res_type='fft'`` will use a high-quality,
but potentially slow FFT-based down-sampling, while ``res_type='polyphase'`` will
use a fast, but potentially inaccurate down-sampling.
dtype : np.dtype
The dtype of the output array. By default, this is inferred to match the
numerical precision of the input signal.
Returns
-------
VQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float]
Variable-Q value each frequency at each time.
Raises
------
ParameterError
If ``hop_length`` is not an integer multiple of
``2**(n_bins / bins_per_octave)``
Or if ``y`` is too short to support the frequency range of the VQT.
See Also
--------
cqt
Notes
-----
This function caches at level 20.
Examples
--------
Generate and plot a variable-Q power spectrum
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
>>> C = np.abs(librosa.cqt(y, sr=sr))
>>> V = np.abs(librosa.vqt(y, sr=sr))
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
>>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
>>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
>>> ax[0].label_outer()
>>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
>>> ax[1].set_title('Variable-Q power spectrum')
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
"""
# How many octaves are we dealing with?
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
n_filters = min(bins_per_octave, n_bins)
len_orig = len(y)
# Relative difference in frequency between any two consecutive bands
alpha = 2.0 ** (1.0 / bins_per_octave) - 1
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
if gamma is None:
gamma = 24.7 * alpha / 0.108
if dtype is None:
dtype = util.dtype_r2c(y.dtype)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# First thing, get the freqs of the top octave
freqs = cqt_frequencies(n_bins, fmin, bins_per_octave=bins_per_octave)[
-bins_per_octave:
]
fmin_t = np.min(freqs)
fmax_t = np.max(freqs)
# Determine required resampling quality
Q = float(filter_scale) / alpha
filter_cutoff = (
fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q) + 0.5 * gamma
)
nyquist = sr / 2.0
auto_resample = False
if not res_type:
auto_resample = True
if filter_cutoff < audio.BW_FASTEST * nyquist:
res_type = "kaiser_fast"
else:
res_type = "kaiser_best"
y, sr, hop_length = __early_downsample(
y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
)
vqt_resp = []
# Skip this block for now
if auto_resample and res_type != "kaiser_fast":
# Do the top octave before resampling to allow for fast resampling
fft_basis, n_fft, _ = __cqt_filter_fft(
sr,
fmin_t,
n_filters,
bins_per_octave,
filter_scale,
norm,
sparsity,
window=window,
gamma=gamma,
dtype=dtype,
)
# Compute the VQT filter response and append it to the stack
vqt_resp.append(
__cqt_response(y, n_fft, hop_length, fft_basis, pad_mode, dtype=dtype)
)
fmin_t /= 2
fmax_t /= 2
n_octaves -= 1
filter_cutoff = fmax_t * (1 + 0.5 * filters.window_bandwidth(window) / Q)
res_type = "kaiser_fast"
# Make sure our hop is long enough to support the bottom octave
num_twos = __num_two_factors(hop_length)
if num_twos < n_octaves - 1:
raise ParameterError(
"hop_length must be a positive integer "
"multiple of 2^{0:d} for {1:d}-octave CQT/VQT".format(
n_octaves - 1, n_octaves
)
)
# Now do the recursive bit
my_y, my_sr, my_hop = y, sr, hop_length
# Iterate down the octaves
for i in range(n_octaves):
# Resample (except first time)
if i > 0:
if len(my_y) < 2:
raise ParameterError(
"Input signal length={} is too short for "
"{:d}-octave CQT/VQT".format(len_orig, n_octaves)
)
my_y = audio.resample(my_y, 2, 1, res_type=res_type, scale=True)
my_sr /= 2.0
my_hop //= 2
fft_basis, n_fft, _ = __cqt_filter_fft(
my_sr,
fmin_t * 2.0 ** -i,
n_filters,
bins_per_octave,
filter_scale,
norm,
sparsity,
window=window,
gamma=gamma,
dtype=dtype,
)
# Re-scale the filters to compensate for downsampling
fft_basis[:] *= np.sqrt(2 ** i)
# Compute the vqt filter response and append to the stack
vqt_resp.append(
__cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype)
)
V = __trim_stack(vqt_resp, n_bins, dtype)
if scale:
lengths = filters.constant_q_lengths(
sr,
fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
window=window,
filter_scale=filter_scale,
gamma=gamma,
)
V /= np.sqrt(lengths[:, np.newaxis])
return V
@cache(level=10)
def __cqt_filter_fft(
sr,
fmin,
n_bins,
bins_per_octave,
filter_scale,
norm,
sparsity,
hop_length=None,
window="hann",
gamma=0.0,
dtype=np.complex,
):
"""Generate the frequency domain constant-Q filter basis."""
basis, lengths = filters.constant_q(
sr,
fmin=fmin,
n_bins=n_bins,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
norm=norm,
pad_fft=True,
window=window,
gamma=gamma,
)
# Filters are padded up to the nearest integral power of 2
n_fft = basis.shape[1]
if hop_length is not None and n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))
# re-normalize bases with respect to the FFT window length
basis *= lengths[:, np.newaxis] / float(n_fft)
# FFT and retain only the non-negative frequencies
fft = get_fftlib()
fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, : (n_fft // 2) + 1]
# sparsify the basis
fft_basis = util.sparsify_rows(fft_basis, quantile=sparsity, dtype=dtype)
return fft_basis, n_fft, lengths
def __trim_stack(cqt_resp, n_bins, dtype):
"""Helper function to trim and stack a collection of CQT responses"""
max_col = min(c_i.shape[-1] for c_i in cqt_resp)
cqt_out = np.empty((n_bins, max_col), dtype=dtype, order="F")
# Copy per-octave data into output array
end = n_bins
for c_i in cqt_resp:
# By default, take the whole octave
n_oct = c_i.shape[0]
# If the whole octave is more than we can fit,
# take the highest bins from c_i
if end < n_oct:
cqt_out[:end] = c_i[-end:, :max_col]
else:
cqt_out[end - n_oct : end] = c_i[:, :max_col]
end -= n_oct
return cqt_out
def __cqt_response(y, n_fft, hop_length, fft_basis, mode, dtype=None):
"""Compute the filter response with a target STFT hop."""
# Compute the STFT matrix
D = stft(
y, n_fft=n_fft, hop_length=hop_length, window="ones", pad_mode=mode, dtype=dtype
)
# And filter response energy
return fft_basis.dot(D)
def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
"""Compute the number of early downsampling operations"""
downsample_count1 = max(
0, int(np.ceil(np.log2(audio.BW_FASTEST * nyquist / filter_cutoff)) - 1) - 1
)
num_twos = __num_two_factors(hop_length)
downsample_count2 = max(0, num_twos - n_octaves + 1)
return min(downsample_count1, downsample_count2)
def __early_downsample(
y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
):
"""Perform early downsampling on an audio signal, if it applies."""
downsample_count = __early_downsample_count(
nyquist, filter_cutoff, hop_length, n_octaves
)
if downsample_count > 0 and res_type == "kaiser_fast":
downsample_factor = 2 ** (downsample_count)
hop_length //= downsample_factor
if len(y) < downsample_factor:
raise ParameterError(
"Input signal length={:d} is too short for "
"{:d}-octave CQT".format(len(y), n_octaves)
)
new_sr = sr / float(downsample_factor)
y = audio.resample(y, sr, new_sr, res_type=res_type, scale=True)
# If we're not going to length-scale after CQT, we
# need to compensate for the downsampling factor here
if not scale:
y *= np.sqrt(downsample_factor)
sr = new_sr
return y, sr, hop_length
@jit(nopython=True, cache=True)
def __num_two_factors(x):
"""Return how many times integer x can be evenly divided by 2.
Returns 0 for non-positive integers.
"""
if x <= 0:
return 0
num_twos = 0
while x % 2 == 0:
num_twos += 1
x //= 2
return num_twos
[docs]def griffinlim_cqt(
C,
n_iter=32,
sr=22050,
hop_length=512,
fmin=None,
bins_per_octave=12,
tuning=0.0,
filter_scale=1,
norm=1,
sparsity=0.01,
window="hann",
scale=True,
pad_mode="reflect",
res_type="kaiser_fast",
dtype=None,
length=None,
momentum=0.99,
init="random",
random_state=None,
):
"""Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim
algorithm.
Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes
phase estimates, and then alternates forward- and inverse-CQT operations. [#]_
This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_
but adapted for use with constant-Q spectrograms.
.. [#] D. W. Griffin and J. S. Lim,
"Signal estimation from modified short-time Fourier transform,"
IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
.. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
"A fast Griffin-Lim algorithm,"
IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
Oct. 2013.
Parameters
----------
C : np.ndarray [shape=(n_bins, n_frames)]
The constant-Q magnitude spectrogram
n_iter : int > 0
The number of iterations to run
sr : number > 0
Audio sampling rate
hop_length : int > 0
The hop length of the CQT
fmin : number > 0
Minimum frequency for the CQT.
If not provided, it defaults to `C1`.
bins_per_octave : int > 0
Number of bins per octave
tuning : float
Tuning deviation from A440, in fractions of a bin
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length
of each channel's filter. This is analogous to ``norm='ortho'``
in FFT.
If ``False``, do not scale the CQT. This is analogous to ``norm=None``
in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
The resampling mode for recursive downsampling.
By default, CQT uses an adaptive mode selection to
trade accuracy at high frequencies for efficiency at low
frequencies.
Griffin-Lim uses the efficient (fast) resampling mode by default.
See ``librosa.resample`` for a list of available options.
dtype : numeric type
Real numeric type for ``y``. Default is inferred to match the precision
of the input CQT.
length : int > 0, optional
If provided, the output ``y`` is zero-padded or clipped to exactly
``length`` samples.
momentum : float > 0
The momentum parameter for fast Griffin-Lim.
Setting this to 0 recovers the original Griffin-Lim method.
Values near 1 can lead to faster convergence, but above 1 may not converge.
init : None or 'random' [default]
If 'random' (the default), then phase values are initialized randomly
according to ``random_state``. This is recommended when the input ``C`` is
a magnitude spectrogram with no initial phase estimates.
If ``None``, then the phase is initialized from ``C``. This is useful when
an initial guess for phase can be provided, or when you want to resume
Griffin-Lim from a previous output.
random_state : None, int, or np.random.RandomState
If int, random_state is the seed used by the random number generator
for phase initialization.
If `np.random.RandomState` instance, the random number generator itself.
If ``None``, defaults to the current `np.random` object.
Returns
-------
y : np.ndarray [shape=(n,)]
time-domain signal reconstructed from ``C``
See Also
--------
cqt
icqt
griffinlim
filters.get_window
resample
Examples
--------
A basis CQT inverse example
>>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
>>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
>>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
>>> # Invert using Griffin-Lim
>>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
>>> # And invert without estimating phase
>>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)
Wave-plot the results
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
>>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
>>> ax[0].set(title='Original', xlabel=None)
>>> ax[0].label_outer()
>>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
>>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
>>> ax[1].label_outer()
>>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
>>> ax[2].set(title='Magnitude-only icqt reconstruction')
"""
if fmin is None:
fmin = note_to_hz("C1")
if random_state is None:
rng = np.random
elif isinstance(random_state, int):
rng = np.random.RandomState(seed=random_state)
elif isinstance(random_state, np.random.RandomState):
rng = random_state
if momentum > 1:
warnings.warn(
"Griffin-Lim with momentum={} > 1 can be unstable. "
"Proceed with caution!".format(momentum)
)
elif momentum < 0:
raise ParameterError(
"griffinlim_cqt() called with momentum={} < 0".format(momentum)
)
# using complex64 will keep the result to minimal necessary precision
angles = np.empty(C.shape, dtype=np.complex64)
if init == "random":
# randomly initialize the phase
angles[:] = np.exp(2j * np.pi * rng.rand(*C.shape))
elif init is None:
# Initialize an all ones complex matrix
angles[:] = 1.0
else:
raise ParameterError("init={} must either None or 'random'".format(init))
# And initialize the previous iterate to 0
rebuilt = 0.0
for _ in range(n_iter):
# Store the previous iterate
tprev = rebuilt
# Invert with our current estimate of the phases
inverse = icqt(
C * angles,
sr=sr,
hop_length=hop_length,
bins_per_octave=bins_per_octave,
fmin=fmin,
tuning=tuning,
filter_scale=filter_scale,
window=window,
length=length,
res_type=res_type,
dtype=dtype,
)
# Rebuild the spectrogram
rebuilt = cqt(
inverse,
sr=sr,
bins_per_octave=bins_per_octave,
n_bins=C.shape[0],
hop_length=hop_length,
fmin=fmin,
tuning=tuning,
filter_scale=filter_scale,
window=window,
res_type=res_type,
)
# Update our phase estimates
angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
angles[:] /= np.abs(angles) + 1e-16
# Return the final phase estimates
return icqt(
C * angles,
sr=sr,
hop_length=hop_length,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
fmin=fmin,
window=window,
length=length,
res_type=res_type,
dtype=dtype,
)