Source code for librosa.core.constantq

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Constant-Q transforms"""
import warnings
import numpy as np
from numba import jit

from . import audio
from .intervals import interval_frequencies
from .fft import get_fftlib
from .convert import cqt_frequencies, note_to_hz
from .spectrum import stft, istft
from .pitch import estimate_tuning
from .._cache import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
from numpy.typing import DTypeLike
from typing import Optional, Union, Collection, List
from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable

__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]

# TODO: ivqt, griffinlim_vqt


[docs]@cache(level=20) def cqt( y: np.ndarray, *, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, n_bins: int = 84, bins_per_octave: int = 12, tuning: Optional[float] = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, pad_mode: _PadMode = "constant", res_type: Optional[str] = "soxr_hq", dtype: Optional[DTypeLike] = None, ) -> np.ndarray: """Compute the constant-Q transform of an audio signal. This implementation is based on the recursive sub-sampling method described by [#]_. .. [#] Schoerkhuber, Christian, and Anssi Klapuri. "Constant-Q transform toolbox for music processing." 7th Sound and Music Computing Conference, Barcelona, Spain. 2010. Parameters ---------- y : np.ndarray [shape=(..., n)] audio time series. Multi-channel is supported. sr : number > 0 [scalar] sampling rate of ``y`` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to `C1 ~= 32.70 Hz` n_bins : int > 0 [scalar] Number of frequency bins, starting at ``fmin`` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float Tuning offset in fractions of a bin. If ``None``, tuning will be automatically estimated from the signal. The minimum frequency of the resulting CQT will be modified to ``fmin * 2**(tuning / bins_per_octave)``. filter_scale : float > 0 Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the CQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the CQT. This is analogous to ``norm=None`` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.stft` and `numpy.pad`. res_type : string The resampling mode for recursive downsampling. dtype : np.dtype The (complex) data type of the output array. By default, this is inferred to match the numerical precision of the input signal. Returns ------- CQT : np.ndarray [shape=(..., n_bins, t)] Constant-Q value each frequency at each time. See Also -------- vqt librosa.resample librosa.util.normalize Notes ----- This function caches at level 20. Examples -------- Generate and plot a constant-Q power spectrum >>> import matplotlib.pyplot as plt >>> y, sr = librosa.load(librosa.ex('trumpet')) >>> C = np.abs(librosa.cqt(y, sr=sr)) >>> fig, ax = plt.subplots() >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max), ... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax) >>> ax.set_title('Constant-Q power spectrum') >>> fig.colorbar(img, ax=ax, format="%+2.0f dB") Limit the frequency range >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60)) >>> C array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09], [5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08], ..., [4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06], [2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]]) Using a higher frequency resolution >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'), ... n_bins=60 * 2, bins_per_octave=12 * 2)) >>> C array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09], [4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09], ..., [2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05], [5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]]) """ # CQT is the special case of VQT with gamma=0 return vqt( y=y, sr=sr, hop_length=hop_length, fmin=fmin, n_bins=n_bins, intervals="equal", gamma=0, bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale, norm=norm, sparsity=sparsity, window=window, scale=scale, pad_mode=pad_mode, res_type=res_type, dtype=dtype, )
[docs]@cache(level=20) def hybrid_cqt( y: np.ndarray, *, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, n_bins: int = 84, bins_per_octave: int = 12, tuning: Optional[float] = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, pad_mode: _PadMode = "constant", res_type: str = "soxr_hq", dtype: Optional[DTypeLike] = None, ) -> np.ndarray: """Compute the hybrid constant-Q transform of an audio signal. Here, the hybrid CQT uses the pseudo CQT for higher frequencies where the hop_length is longer than half the filter length and the full CQT for lower frequencies. Parameters ---------- y : np.ndarray [shape=(..., n)] audio time series. Multi-channel is supported. sr : number > 0 [scalar] sampling rate of ``y`` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to `C1 ~= 32.70 Hz` n_bins : int > 0 [scalar] Number of frequency bins, starting at ``fmin`` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float Tuning offset in fractions of a bin. If ``None``, tuning will be automatically estimated from the signal. The minimum frequency of the resulting CQT will be modified to ``fmin * 2**(tuning / bins_per_octave)``. filter_scale : float > 0 Filter filter_scale factor. Larger values use longer windows. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the CQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the CQT. This is analogous to ``norm=None`` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.stft` and `numpy.pad`. res_type : string Resampling mode. See `librosa.cqt` for details. dtype : np.dtype, optional The complex dtype to use for computing the CQT. By default, this is inferred to match the precision of the input signal. Returns ------- CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float] Constant-Q energy for each frequency at each time. See Also -------- cqt pseudo_cqt Notes ----- This function caches at level 20. """ if fmin is None: # C1 by default fmin = note_to_hz("C1") if tuning is None: tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave) # Apply tuning correction fmin = fmin * 2.0 ** (tuning / bins_per_octave) # Get all CQT frequencies freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave) # Pre-compute alpha if n_bins == 1: alpha = __et_relative_bw(bins_per_octave) else: alpha = filters._relative_bandwidth(freqs=freqs) # Compute the length of each constant-Q basis function lengths, _ = filters.wavelet_lengths( freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha ) # Determine which filters to use with Pseudo CQT # These are the ones that fit within 2 hop lengths after padding pseudo_filters = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length n_bins_pseudo = int(np.sum(pseudo_filters)) n_bins_full = n_bins - n_bins_pseudo cqt_resp = [] if n_bins_pseudo > 0: fmin_pseudo = np.min(freqs[pseudo_filters]) cqt_resp.append( pseudo_cqt( y, sr=sr, hop_length=hop_length, fmin=fmin_pseudo, n_bins=n_bins_pseudo, bins_per_octave=bins_per_octave, filter_scale=filter_scale, norm=norm, sparsity=sparsity, window=window, scale=scale, pad_mode=pad_mode, dtype=dtype, ) ) if n_bins_full > 0: cqt_resp.append( np.abs( cqt( y, sr=sr, hop_length=hop_length, fmin=fmin, n_bins=n_bins_full, bins_per_octave=bins_per_octave, filter_scale=filter_scale, norm=norm, sparsity=sparsity, window=window, scale=scale, pad_mode=pad_mode, res_type=res_type, dtype=dtype, ) ) ) # Propagate dtype from the last component return __trim_stack(cqt_resp, n_bins, cqt_resp[-1].dtype)
[docs]@cache(level=20) def pseudo_cqt( y: np.ndarray, *, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, n_bins: int = 84, bins_per_octave: int = 12, tuning: Optional[float] = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, pad_mode: _PadMode = "constant", dtype: Optional[DTypeLike] = None, ) -> np.ndarray: """Compute the pseudo constant-Q transform of an audio signal. This uses a single fft size that is the smallest power of 2 that is greater than or equal to the max of: 1. The longest CQT filter 2. 2x the hop_length Parameters ---------- y : np.ndarray [shape=(..., n)] audio time series. Multi-channel is supported. sr : number > 0 [scalar] sampling rate of ``y`` hop_length : int > 0 [scalar] number of samples between successive CQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to `C1 ~= 32.70 Hz` n_bins : int > 0 [scalar] Number of frequency bins, starting at ``fmin`` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float Tuning offset in fractions of a bin. If ``None``, tuning will be automatically estimated from the signal. The minimum frequency of the resulting CQT will be modified to ``fmin * 2**(tuning / bins_per_octave)``. filter_scale : float > 0 Filter filter_scale factor. Larger values use longer windows. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the CQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the CQT. This is analogous to ``norm=None`` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.stft` and `numpy.pad`. dtype : np.dtype, optional The complex data type for CQT calculations. By default, this is inferred to match the precision of the input signal. Returns ------- CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float] Pseudo Constant-Q energy for each frequency at each time. Notes ----- This function caches at level 20. """ if fmin is None: # C1 by default fmin = note_to_hz("C1") if tuning is None: tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave) if dtype is None: dtype = util.dtype_r2c(y.dtype) # Apply tuning correction fmin = fmin * 2.0 ** (tuning / bins_per_octave) freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave) if n_bins == 1: alpha = __et_relative_bw(bins_per_octave) else: alpha = filters._relative_bandwidth(freqs=freqs) lengths, _ = filters.wavelet_lengths( freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha ) fft_basis, n_fft, _ = __vqt_filter_fft( sr, freqs, filter_scale, norm, sparsity, hop_length=hop_length, window=window, dtype=dtype, alpha=alpha, ) fft_basis = np.abs(fft_basis) # Compute the magnitude-only CQT response C: np.ndarray = __cqt_response( y, n_fft, hop_length, fft_basis, pad_mode, window="hann", dtype=dtype, phase=False, ) if scale: C /= np.sqrt(n_fft) else: # reshape lengths to match dimension properly lengths = util.expand_to(lengths, ndim=C.ndim, axes=-2) C *= np.sqrt(lengths / n_fft) return C
[docs]@cache(level=40) def icqt( C: np.ndarray, *, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, bins_per_octave: int = 12, tuning: float = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, length: Optional[int] = None, res_type: str = "soxr_hq", dtype: Optional[DTypeLike] = None, ) -> np.ndarray: """Compute the inverse constant-Q transform. Given a constant-Q transform representation ``C`` of an audio signal ``y``, this function produces an approximation ``y_hat``. Parameters ---------- C : np.ndarray, [shape=(..., n_bins, n_frames)] Constant-Q representation as produced by `cqt` sr : number > 0 [scalar] sampling rate of the signal hop_length : int > 0 [scalar] number of samples between successive frames fmin : float > 0 [scalar] Minimum frequency. Defaults to `C1 ~= 32.70 Hz` bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : float [scalar] Tuning offset in fractions of a bin. The minimum frequency of the CQT will be modified to ``fmin * 2**(tuning / bins_per_octave)``. filter_scale : float > 0 [scalar] Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the CQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the CQT. This is analogous to ``norm=None`` in FFT. length : int > 0, optional If provided, the output ``y`` is zero-padded or clipped to exactly ``length`` samples. res_type : string Resampling mode. See `librosa.resample` for supported modes. dtype : numeric type Real numeric type for ``y``. Default is inferred to match the numerical precision of the input CQT. Returns ------- y : np.ndarray, [shape=(..., n_samples), dtype=np.float] Audio time-series reconstructed from the CQT representation. See Also -------- cqt librosa.resample Notes ----- This function caches at level 40. Examples -------- Using default parameters >>> y, sr = librosa.load(librosa.ex('trumpet')) >>> C = librosa.cqt(y=y, sr=sr) >>> y_hat = librosa.icqt(C=C, sr=sr) Or with a different hop length and frequency resolution: >>> hop_length = 256 >>> bins_per_octave = 12 * 3 >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave, ... bins_per_octave=bins_per_octave) >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length, ... bins_per_octave=bins_per_octave) """ if fmin is None: fmin = note_to_hz("C1") # Apply tuning correction fmin = fmin * 2.0 ** (tuning / bins_per_octave) # Get the top octave of frequencies n_bins = C.shape[-2] n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave) if n_bins == 1: alpha = __et_relative_bw(bins_per_octave) else: alpha = filters._relative_bandwidth(freqs=freqs) lengths, f_cutoff = filters.wavelet_lengths( freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha ) # Trim the CQT to only what's necessary for reconstruction if length is not None: n_frames = int(np.ceil((length + max(lengths)) / hop_length)) C = C[..., :n_frames] C_scale = np.sqrt(lengths) # This shape array will be used for broadcasting the basis scale # we'll have to adapt this per octave within the loop y: Optional[np.ndarray] = None # Assume the top octave is at the full rate srs = [sr] hops = [hop_length] for i in range(n_octaves - 1): if hops[0] % 2 == 0: # We can downsample: srs.insert(0, srs[0] * 0.5) hops.insert(0, hops[0] // 2) else: # We're out of downsamplings, carry forward srs.insert(0, srs[0]) hops.insert(0, hops[0]) for i, (my_sr, my_hop) in enumerate(zip(srs, hops)): # How many filters are in this octave? n_filters = min(bins_per_octave, n_bins - bins_per_octave * i) # Slice out the current octave sl = slice(bins_per_octave * i, bins_per_octave * i + n_filters) fft_basis, n_fft, _ = __vqt_filter_fft( my_sr, freqs[sl], filter_scale, norm, sparsity, window=window, alpha=alpha[sl], ) # Transpose the basis inv_basis = fft_basis.H.todense() # Compute each filter's frequency-domain power freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0) # Compensate for length normalization in the forward transform freq_power *= n_fft / lengths[sl] # Inverse-project the basis for each octave if scale: # scale=True ==> re-scale by sqrt(lengths) D_oct = np.einsum( "fc,c,c,...ct->...ft", inv_basis, C_scale[sl], freq_power, C[..., sl, :], optimize=True, ) else: D_oct = np.einsum( "fc,c,...ct->...ft", inv_basis, freq_power, C[..., sl, :], optimize=True ) y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype) y_oct = audio.resample( y_oct, orig_sr=1, target_sr=sr // my_sr, res_type=res_type, scale=False, fix=False, ) if y is None: y = y_oct else: y[..., : y_oct.shape[-1]] += y_oct # make mypy happy assert y is not None if length: y = util.fix_length(y, size=length) return y
[docs]@cache(level=20) def vqt( y: np.ndarray, *, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, n_bins: int = 84, intervals: Union[str, Collection[float]] = "equal", gamma: Optional[float] = None, bins_per_octave: int = 12, tuning: Optional[float] = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, pad_mode: _PadMode = "constant", res_type: Optional[str] = "soxr_hq", dtype: Optional[DTypeLike] = None, ) -> np.ndarray: """Compute the variable-Q transform of an audio signal. This implementation is based on the recursive sub-sampling method described by [#]_. .. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler. "A Matlab toolbox for efficient perfect reconstruction time-frequency transforms with log-frequency resolution." In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio. Audio Engineering Society, 2014. Parameters ---------- y : np.ndarray [shape=(..., n)] audio time series. Multi-channel is supported. sr : number > 0 [scalar] sampling rate of ``y`` hop_length : int > 0 [scalar] number of samples between successive VQT columns. fmin : float > 0 [scalar] Minimum frequency. Defaults to `C1 ~= 32.70 Hz` n_bins : int > 0 [scalar] Number of frequency bins, starting at ``fmin`` intervals : str or array of floats in [1, 2) Either a string specification for an interval set, e.g., `'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of intervals expressed as numbers between 1 and 2. .. see also:: librosa.interval_frequencies gamma : number > 0 [scalar] Bandwidth offset for determining filter lengths. If ``gamma=0``, produces the constant-Q transform. If 'gamma=None', gamma will be calculated such that filter bandwidths are equal to a constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished by solving for the gamma which gives:: B_k = alpha * f_k + gamma = C * ERB(f_k), where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the constant fraction across all filters. Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived from experimental data in [#]_. .. [#] Glasberg, Brian R., and Brian CJ Moore. "Derivation of auditory filter shapes from notched-noise data." Hearing research 47.1-2 (1990): 103-138. bins_per_octave : int > 0 [scalar] Number of bins per octave tuning : None or float Tuning offset in fractions of a bin. If ``None``, tuning will be automatically estimated from the signal. The minimum frequency of the resulting VQT will be modified to ``fmin * 2**(tuning / bins_per_octave)``. filter_scale : float > 0 Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the VQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, number, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the VQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the VQT. This is analogous to ``norm=None`` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.stft` and `numpy.pad`. res_type : string The resampling mode for recursive downsampling. dtype : np.dtype The dtype of the output array. By default, this is inferred to match the numerical precision of the input signal. Returns ------- VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex] Variable-Q value each frequency at each time. See Also -------- cqt Notes ----- This function caches at level 20. Examples -------- Generate and plot a variable-Q power spectrum >>> import matplotlib.pyplot as plt >>> y, sr = librosa.load(librosa.ex('choice'), duration=5) >>> C = np.abs(librosa.cqt(y, sr=sr)) >>> V = np.abs(librosa.vqt(y, sr=sr)) >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True) >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max), ... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0]) >>> ax[0].set(title='Constant-Q power spectrum', xlabel=None) >>> ax[0].label_outer() >>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max), ... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1]) >>> ax[1].set_title('Variable-Q power spectrum') >>> fig.colorbar(img, ax=ax, format="%+2.0f dB") """ # If intervals are provided as an array, override BPO if not isinstance(intervals, str): bins_per_octave = len(intervals) # How many octaves are we dealing with? n_octaves = int(np.ceil(float(n_bins) / bins_per_octave)) n_filters = min(bins_per_octave, n_bins) if fmin is None: # C1 by default fmin = note_to_hz("C1") if tuning is None: tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave) if dtype is None: dtype = util.dtype_r2c(y.dtype) # Apply tuning correction fmin = fmin * 2.0 ** (tuning / bins_per_octave) # First thing, get the freqs of the top octave freqs = interval_frequencies( n_bins=n_bins, fmin=fmin, intervals=intervals, bins_per_octave=bins_per_octave, sort=True, ) freqs_top = freqs[-bins_per_octave:] fmax_t: float = np.max(freqs_top) if n_bins == 1: alpha = __et_relative_bw(bins_per_octave) else: alpha = filters._relative_bandwidth(freqs=freqs) lengths, filter_cutoff = filters.wavelet_lengths( freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, gamma=gamma, alpha=alpha, ) # Determine required resampling quality nyquist = sr / 2.0 if filter_cutoff > nyquist: raise ParameterError( f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. " "Try reducing the number of frequency bins." ) if res_type is None: warnings.warn( "Support for VQT with res_type=None is deprecated in librosa 0.10\n" "and will be removed in version 1.0.", category=FutureWarning, stacklevel=2, ) res_type = "soxr_hq" y, sr, hop_length = __early_downsample( y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale ) vqt_resp = [] # Iterate down the octaves my_y, my_sr, my_hop = y, sr, hop_length for i in range(n_octaves): # Slice out the current octave of filters if i == 0: sl = slice(-n_filters, None) else: sl = slice(-n_filters * (i + 1), -n_filters * i) # This may be incorrect with early downsampling freqs_oct = freqs[sl] alpha_oct = alpha[sl] fft_basis, n_fft, _ = __vqt_filter_fft( my_sr, freqs_oct, filter_scale, norm, sparsity, window=window, gamma=gamma, dtype=dtype, alpha=alpha_oct, ) # Re-scale the filters to compensate for downsampling fft_basis[:] *= np.sqrt(sr / my_sr) # Compute the vqt filter response and append to the stack vqt_resp.append( __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype) ) if my_hop % 2 == 0: my_hop //= 2 my_sr /= 2.0 my_y = audio.resample( my_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True ) V = __trim_stack(vqt_resp, n_bins, dtype) if scale: # Recompute lengths here because early downsampling may have changed # our sampling rate lengths, _ = filters.wavelet_lengths( freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, gamma=gamma, alpha=alpha, ) # reshape lengths to match V shape lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2) V /= np.sqrt(lengths) return V
@cache(level=10) def __vqt_filter_fft( sr, freqs, filter_scale, norm, sparsity, hop_length=None, window="hann", gamma=0.0, dtype=np.complex64, alpha=None, ): """Generate the frequency domain variable-Q filter basis.""" basis, lengths = filters.wavelet( freqs=freqs, sr=sr, filter_scale=filter_scale, norm=norm, pad_fft=True, window=window, gamma=gamma, alpha=alpha, ) # Filters are padded up to the nearest integral power of 2 n_fft = basis.shape[1] if hop_length is not None and n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))): n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length)))) # re-normalize bases with respect to the FFT window length basis *= lengths[:, np.newaxis] / float(n_fft) # FFT and retain only the non-negative frequencies fft = get_fftlib() fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, : (n_fft // 2) + 1] # sparsify the basis fft_basis = util.sparsify_rows(fft_basis, quantile=sparsity, dtype=dtype) return fft_basis, n_fft, lengths def __trim_stack( cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike ) -> np.ndarray: """Trim and stack a collection of CQT responses""" max_col = min(c_i.shape[-1] for c_i in cqt_resp) # Grab any leading dimensions shape = list(cqt_resp[0].shape) shape[-2] = n_bins shape[-1] = max_col cqt_out = np.empty(shape, dtype=dtype, order="F") # Copy per-octave data into output array end = n_bins for c_i in cqt_resp: # By default, take the whole octave n_oct = c_i.shape[-2] # If the whole octave is more than we can fit, # take the highest bins from c_i if end < n_oct: cqt_out[..., :end, :] = c_i[..., -end:, :max_col] else: cqt_out[..., end - n_oct : end, :] = c_i[..., :max_col] end -= n_oct return cqt_out def __cqt_response( y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None ): """Compute the filter response with a target STFT hop.""" # Compute the STFT matrix D = stft( y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype ) if not phase: D = np.abs(D) # Reshape D to Dr Dr = D.reshape((-1, D.shape[-2], D.shape[-1])) output_flat = np.empty( (Dr.shape[0], fft_basis.shape[0], Dr.shape[-1]), dtype=D.dtype ) # iterate over channels # project fft_basis.dot(Dr[i]) for i in range(Dr.shape[0]): output_flat[i] = fft_basis.dot(Dr[i]) # reshape Dr to match D's leading dimensions again shape = list(D.shape) shape[-2] = fft_basis.shape[0] return output_flat.reshape(shape) def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves): """Compute the number of early downsampling operations""" downsample_count1 = max(0, int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1) - 1) num_twos = __num_two_factors(hop_length) downsample_count2 = max(0, num_twos - n_octaves + 1) return min(downsample_count1, downsample_count2) def __early_downsample( y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale ): """Perform early downsampling on an audio signal, if it applies.""" downsample_count = __early_downsample_count( nyquist, filter_cutoff, hop_length, n_octaves ) if downsample_count > 0: downsample_factor = 2 ** (downsample_count) hop_length //= downsample_factor if y.shape[-1] < downsample_factor: raise ParameterError( f"Input signal length={len(y):d} is too short for " f"{n_octaves:d}-octave CQT" ) new_sr = sr / float(downsample_factor) y = audio.resample( y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True ) # If we're not going to length-scale after CQT, we # need to compensate for the downsampling factor here if not scale: y *= np.sqrt(downsample_factor) sr = new_sr return y, sr, hop_length @jit(nopython=True, cache=True) def __num_two_factors(x): """Return how many times integer x can be evenly divided by 2. Returns 0 for non-positive integers. """ if x <= 0: return 0 num_twos = 0 while x % 2 == 0: num_twos += 1 x //= 2 return num_twos
[docs]def griffinlim_cqt( C: np.ndarray, *, n_iter: int = 32, sr: float = 22050, hop_length: int = 512, fmin: Optional[_FloatLike_co] = None, bins_per_octave: int = 12, tuning: float = 0.0, filter_scale: float = 1, norm: Optional[float] = 1, sparsity: float = 0.01, window: _WindowSpec = "hann", scale: bool = True, pad_mode: _PadMode = "constant", res_type: str = "soxr_hq", dtype: Optional[DTypeLike] = None, length: Optional[int] = None, momentum: float = 0.99, init: Optional[str] = "random", random_state: Optional[ Union[int, np.random.RandomState, np.random.Generator] ] = None, ) -> np.ndarray: """Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim algorithm. Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes phase estimates, and then alternates forward- and inverse-CQT operations. [#]_ This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_ but adapted for use with constant-Q spectrograms. .. [#] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L. "A fast Griffin-Lim algorithm," IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4), Oct. 2013. Parameters ---------- C : np.ndarray [shape=(..., n_bins, n_frames)] The constant-Q magnitude spectrogram n_iter : int > 0 The number of iterations to run sr : number > 0 Audio sampling rate hop_length : int > 0 The hop length of the CQT fmin : number > 0 Minimum frequency for the CQT. If not provided, it defaults to `C1`. bins_per_octave : int > 0 Number of bins per octave tuning : float Tuning deviation from A440, in fractions of a bin filter_scale : float > 0 Filter scale factor. Small values (<1) use shorter windows for improved time resolution. norm : {inf, -inf, 0, float > 0} Type of norm to use for basis function normalization. See `librosa.util.normalize`. sparsity : float in [0, 1) Sparsify the CQT basis by discarding up to ``sparsity`` fraction of the energy in each basis. Set ``sparsity=0`` to disable sparsification. window : str, tuple, or function Window specification for the basis filters. See `filters.get_window` for details. scale : bool If ``True``, scale the CQT response by square-root the length of each channel's filter. This is analogous to ``norm='ortho'`` in FFT. If ``False``, do not scale the CQT. This is analogous to ``norm=None`` in FFT. pad_mode : string Padding mode for centered frame analysis. See also: `librosa.stft` and `numpy.pad`. res_type : string The resampling mode for recursive downsampling. See ``librosa.resample`` for a list of available options. dtype : numeric type Real numeric type for ``y``. Default is inferred to match the precision of the input CQT. length : int > 0, optional If provided, the output ``y`` is zero-padded or clipped to exactly ``length`` samples. momentum : float > 0 The momentum parameter for fast Griffin-Lim. Setting this to 0 recovers the original Griffin-Lim method. Values near 1 can lead to faster convergence, but above 1 may not converge. init : None or 'random' [default] If 'random' (the default), then phase values are initialized randomly according to ``random_state``. This is recommended when the input ``C`` is a magnitude spectrogram with no initial phase estimates. If ``None``, then the phase is initialized from ``C``. This is useful when an initial guess for phase can be provided, or when you want to resume Griffin-Lim from a previous output. random_state : None, int, np.random.RandomState, or np.random.Generator If int, random_state is the seed used by the random number generator for phase initialization. If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself. If ``None``, defaults to the `np.random.default_rng()` object. Returns ------- y : np.ndarray [shape=(..., n)] time-domain signal reconstructed from ``C`` See Also -------- cqt icqt griffinlim filters.get_window resample Examples -------- A basis CQT inverse example >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None) >>> # Get the CQT magnitude, 7 octaves at 36 bins per octave >>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36)) >>> # Invert using Griffin-Lim >>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36) >>> # And invert without estimating phase >>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36) Wave-plot the results >>> import matplotlib.pyplot as plt >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True) >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0]) >>> ax[0].set(title='Original', xlabel=None) >>> ax[0].label_outer() >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1]) >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None) >>> ax[1].label_outer() >>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2]) >>> ax[2].set(title='Magnitude-only icqt reconstruction') """ if fmin is None: fmin = note_to_hz("C1") if random_state is None: rng = np.random.default_rng() elif isinstance(random_state, int): rng = np.random.RandomState(seed=random_state) # type: ignore elif isinstance(random_state, (np.random.RandomState, np.random.Generator)): rng = random_state # type: ignore else: _ensure_not_reachable(random_state) raise ParameterError(f"Unsupported random_state={random_state!r}") if momentum > 1: warnings.warn( f"Griffin-Lim with momentum={momentum} > 1 can be unstable. " "Proceed with caution!", stacklevel=2, ) elif momentum < 0: raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0") # using complex64 will keep the result to minimal necessary precision angles = np.empty(C.shape, dtype=np.complex64) eps = util.tiny(angles) if init == "random": # randomly initialize the phase angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape)) elif init is None: # Initialize an all ones complex matrix angles[:] = 1.0 else: raise ParameterError(f"init={init} must either None or 'random'") # And initialize the previous iterate to 0 rebuilt: np.ndarray = np.array(0.0) for _ in range(n_iter): # Store the previous iterate tprev = rebuilt # Invert with our current estimate of the phases inverse = icqt( C * angles, sr=sr, hop_length=hop_length, bins_per_octave=bins_per_octave, fmin=fmin, tuning=tuning, filter_scale=filter_scale, window=window, length=length, res_type=res_type, norm=norm, scale=scale, sparsity=sparsity, dtype=dtype, ) # Rebuild the spectrogram rebuilt = cqt( inverse, sr=sr, bins_per_octave=bins_per_octave, n_bins=C.shape[-2], hop_length=hop_length, fmin=fmin, tuning=tuning, filter_scale=filter_scale, window=window, norm=norm, scale=scale, sparsity=sparsity, pad_mode=pad_mode, res_type=res_type, ) # Update our phase estimates angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev angles[:] /= np.abs(angles) + eps # Return the final phase estimates return icqt( C * angles, sr=sr, hop_length=hop_length, bins_per_octave=bins_per_octave, tuning=tuning, filter_scale=filter_scale, fmin=fmin, window=window, length=length, res_type=res_type, norm=norm, scale=scale, sparsity=sparsity, dtype=dtype, )
def __et_relative_bw(bins_per_octave: int) -> np.ndarray: """Compute the relative bandwidth coefficient for equal (geometric) freuqency spacing and a give number of bins per octave. This is a special case of the more general `relative_bandwidth` calculation that can be used when only a single basis frequency is used. Parameters ---------- bins_per_octave : int Returns ------- alpha : np.ndarray > 0 Value is cast up to a 1d array to allow slicing """ r = 2 ** (1 / bins_per_octave) return np.atleast_1d((r**2 - 1) / (r**2 + 1))