Caution
You're reading an old version of this documentation. If you want up-to-date information, please have a look at 0.9.1.
Source code for librosa.decompose

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Spectrogram decomposition
=========================
.. autosummary::
    :toctree: generated/

    decompose
    hpss
    nn_filter
"""

import numpy as np

import scipy.sparse
from scipy.ndimage import median_filter

import sklearn.decomposition

from . import core
from ._cache import cache
from . import segment
from . import util
from .util.exceptions import ParameterError

__all__ = ['decompose', 'hpss', 'nn_filter']


[docs]def decompose(S, n_components=None, transformer=None, sort=False, fit=True, **kwargs):
    """Decompose a feature matrix.

    Given a spectrogram `S`, produce a decomposition into `components`
    and `activations` such that `S ~= components.dot(activations)`.

    By default, this is done with with non-negative matrix factorization (NMF),
    but any `sklearn.decomposition`-type object will work.


    Parameters
    ----------
    S : np.ndarray [shape=(n_features, n_samples), dtype=float]
        The input feature matrix (e.g., magnitude spectrogram)

    n_components : int > 0 [scalar] or None
        number of desired components

        if None, then `n_features` components are used

    transformer : None or object
        If None, use `sklearn.decomposition.NMF`

        Otherwise, any object with a similar interface to NMF should work.
        `transformer` must follow the scikit-learn convention, where
        input data is `(n_samples, n_features)`.

        `transformer.fit_transform()` will be run on `S.T` (not `S`),
        the return value of which is stored (transposed) as `activations`

        The components will be retrieved as `transformer.components_.T`

        `S ~= np.dot(activations, transformer.components_).T`

        or equivalently:
        `S ~= np.dot(transformer.components_.T, activations.T)`

    sort : bool
        If `True`, components are sorted by ascending peak frequency.

        .. note:: If used with `transformer`, sorting is applied to copies
            of the decomposition parameters, and not to `transformer`'s
            internal parameters.

    fit : bool
        If `True`, components are estimated from the input ``S``.

        If `False`, components are assumed to be pre-computed and stored
        in ``transformer``, and are not changed.

    kwargs : Additional keyword arguments to the default transformer
        `sklearn.decomposition.NMF`


    Returns
    -------
    components: np.ndarray [shape=(n_features, n_components)]
        matrix of components (basis elements).

    activations: np.ndarray [shape=(n_components, n_samples)]
        transformed matrix/activation matrix


    Raises
    ------
    ParameterError
        if `fit` is False and no `transformer` object is provided.


    See Also
    --------
    sklearn.decomposition : SciKit-Learn matrix decomposition modules


    Examples
    --------
    Decompose a magnitude spectrogram into 32 components with NMF

    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> S = np.abs(librosa.stft(y))
    >>> comps, acts = librosa.decompose.decompose(S, n_components=8)
    >>> comps
    array([[  1.876e-01,   5.559e-02, ...,   1.687e-01,   4.907e-02],
           [  3.148e-01,   1.719e-01, ...,   2.314e-01,   9.493e-02],
           ...,
           [  1.561e-07,   8.564e-08, ...,   7.167e-08,   4.997e-08],
           [  1.531e-07,   7.880e-08, ...,   5.632e-08,   4.028e-08]])
    >>> acts
    array([[  4.197e-05,   8.512e-03, ...,   3.056e-05,   9.159e-06],
           [  9.568e-06,   1.718e-02, ...,   3.322e-05,   7.869e-06],
           ...,
           [  5.982e-05,   1.311e-02, ...,  -0.000e+00,   6.323e-06],
           [  3.782e-05,   7.056e-03, ...,   3.290e-05,  -0.000e+00]])


    Sort components by ascending peak frequency

    >>> comps, acts = librosa.decompose.decompose(S, n_components=16,
    ...                                           sort=True)


    Or with sparse dictionary learning

    >>> import sklearn.decomposition
    >>> T = sklearn.decomposition.MiniBatchDictionaryLearning(n_components=16)
    >>> scomps, sacts = librosa.decompose.decompose(S, transformer=T, sort=True)

    >>> import matplotlib.pyplot as plt
    >>> plt.figure(figsize=(10,8))
    >>> plt.subplot(3, 1, 1)
    >>> librosa.display.specshow(librosa.amplitude_to_db(S,
    ...                                                  ref=np.max),
    ...                          y_axis='log', x_axis='time')
    >>> plt.title('Input spectrogram')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.subplot(3, 2, 3)
    >>> librosa.display.specshow(librosa.amplitude_to_db(comps,
    ...                                                  ref=np.max),
    ...                          y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Components')
    >>> plt.subplot(3, 2, 4)
    >>> librosa.display.specshow(acts, x_axis='time')
    >>> plt.ylabel('Components')
    >>> plt.title('Activations')
    >>> plt.colorbar()
    >>> plt.subplot(3, 1, 3)
    >>> S_approx = comps.dot(acts)
    >>> librosa.display.specshow(librosa.amplitude_to_db(S_approx,
    ...                                                  ref=np.max),
    ...                          y_axis='log', x_axis='time')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Reconstructed spectrogram')
    >>> plt.tight_layout()
    >>> plt.show()
    """

    if transformer is None:
        if fit is False:
            raise ParameterError('fit must be True if transformer is None')

        transformer = sklearn.decomposition.NMF(n_components=n_components,
                                                **kwargs)

    if n_components is None:
        n_components = S.shape[0]

    if fit:
        activations = transformer.fit_transform(S.T).T
    else:
        activations = transformer.transform(S.T).T

    components = transformer.components_.T

    if sort:
        components, idx = util.axis_sort(components, index=True)
        activations = activations[idx]

    return components, activations


[docs]@cache(level=30)
def hpss(S, kernel_size=31, power=2.0, mask=False, margin=1.0):
    """Median-filtering harmonic percussive source separation (HPSS).

    If `margin = 1.0`, decomposes an input spectrogram `S = H + P`
    where `H` contains the harmonic components,
    and `P` contains the percussive components.

    If `margin > 1.0`, decomposes an input spectrogram `S = H + P + R`
    where `R` contains residual components not included in `H` or `P`.

    This implementation is based upon the algorithm described by [1]_ and [2]_.

    .. [1] Fitzgerald, Derry.
        "Harmonic/percussive separation using median filtering."
        13th International Conference on Digital Audio Effects (DAFX10),
        Graz, Austria, 2010.

    .. [2] Driedger, Müller, Disch.
        "Extending harmonic-percussive separation of audio."
        15th International Society for Music Information Retrieval Conference (ISMIR 2014),
        Taipei, Taiwan, 2014.

    Parameters
    ----------
    S : np.ndarray [shape=(d, n)]
        input spectrogram. May be real (magnitude) or complex.

    kernel_size : int or tuple (kernel_harmonic, kernel_percussive)
        kernel size(s) for the median filters.

        - If scalar, the same size is used for both harmonic and percussive.
        - If tuple, the first value specifies the width of the
          harmonic filter, and the second value specifies the width
          of the percussive filter.

    power : float > 0 [scalar]
        Exponent for the Wiener filter when constructing soft mask matrices.

    mask : bool
        Return the masking matrices instead of components.

        Masking matrices contain non-negative real values that
        can be used to measure the assignment of energy from `S`
        into harmonic or percussive components.

        Components can be recovered by multiplying `S * mask_H`
        or `S * mask_P`.


    margin : float or tuple (margin_harmonic, margin_percussive)
        margin size(s) for the masks (as described in [2]_)

        - If scalar, the same size is used for both harmonic and percussive.
        - If tuple, the first value specifies the margin of the
          harmonic mask, and the second value specifies the margin
          of the percussive mask.

    Returns
    -------
    harmonic : np.ndarray [shape=(d, n)]
        harmonic component (or mask)

    percussive : np.ndarray [shape=(d, n)]
        percussive component (or mask)


    See Also
    --------
    util.softmask

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Separate into harmonic and percussive

    >>> y, sr = librosa.load(librosa.util.example_audio_file(), duration=15)
    >>> D = librosa.stft(y)
    >>> H, P = librosa.decompose.hpss(D)

    >>> import matplotlib.pyplot as plt
    >>> plt.figure()
    >>> plt.subplot(3, 1, 1)
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(D),
    ...                                                  ref=np.max),
    ...                          y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Full power spectrogram')
    >>> plt.subplot(3, 1, 2)
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(H),
    ...                                                  ref=np.max),
    ...                          y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Harmonic power spectrogram')
    >>> plt.subplot(3, 1, 3)
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(P),
    ...                                                  ref=np.max),
    ...                          y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Percussive power spectrogram')
    >>> plt.tight_layout()
    >>> plt.show()


    Or with a narrower horizontal filter

    >>> H, P = librosa.decompose.hpss(D, kernel_size=(13, 31))

    Just get harmonic/percussive masks, not the spectra

    >>> mask_H, mask_P = librosa.decompose.hpss(D, mask=True)
    >>> mask_H
    array([[  1.000e+00,   1.469e-01, ...,   2.648e-03,   2.164e-03],
           [  1.000e+00,   2.368e-01, ...,   9.413e-03,   7.703e-03],
           ...,
           [  8.869e-01,   5.673e-02, ...,   4.603e-02,   1.247e-05],
           [  7.068e-01,   2.194e-02, ...,   4.453e-02,   1.205e-05]], dtype=float32)
    >>> mask_P
    array([[  2.858e-05,   8.531e-01, ...,   9.974e-01,   9.978e-01],
           [  1.586e-05,   7.632e-01, ...,   9.906e-01,   9.923e-01],
           ...,
           [  1.131e-01,   9.433e-01, ...,   9.540e-01,   1.000e+00],
           [  2.932e-01,   9.781e-01, ...,   9.555e-01,   1.000e+00]], dtype=float32)

    Separate into harmonic/percussive/residual components by using a margin > 1.0

    >>> H, P = librosa.decompose.hpss(D, margin=3.0)
    >>> R = D - (H+P)
    >>> y_harm = librosa.core.istft(H)
    >>> y_perc = librosa.core.istft(P)
    >>> y_resi = librosa.core.istft(R)


    Get a more isolated percussive component by widening its margin

    >>> H, P = librosa.decompose.hpss(D, margin=(1.0,5.0))

    """

    if np.iscomplexobj(S):
        S, phase = core.magphase(S)
    else:
        phase = 1

    if np.isscalar(kernel_size):
        win_harm = kernel_size
        win_perc = kernel_size
    else:
        win_harm = kernel_size[0]
        win_perc = kernel_size[1]

    if np.isscalar(margin):
        margin_harm = margin
        margin_perc = margin
    else:
        margin_harm = margin[0]
        margin_perc = margin[1]

    # margin minimum is 1.0
    if margin_harm < 1 or margin_perc < 1:
        raise ParameterError("Margins must be >= 1.0. "
                             "A typical range is between 1 and 10.")

    # Compute median filters. Pre-allocation here preserves memory layout.
    harm = np.empty_like(S)
    harm[:] = median_filter(S, size=(1, win_harm), mode='reflect')

    perc = np.empty_like(S)
    perc[:] = median_filter(S, size=(win_perc, 1), mode='reflect')

    split_zeros = (margin_harm == 1 and margin_perc == 1)

    mask_harm = util.softmask(harm, perc * margin_harm,
                              power=power,
                              split_zeros=split_zeros)

    mask_perc = util.softmask(perc, harm * margin_perc,
                              power=power,
                              split_zeros=split_zeros)

    if mask:
        return mask_harm, mask_perc

    return ((S * mask_harm) * phase, (S * mask_perc) * phase)


[docs]@cache(level=30)
def nn_filter(S, rec=None, aggregate=None, axis=-1, **kwargs):
    '''Filtering by nearest-neighbors.

    Each data point (e.g, spectrogram column) is replaced
    by aggregating its nearest neighbors in feature space.

    This can be useful for de-noising a spectrogram or feature matrix.

    The non-local means method [1]_ can be recovered by providing a
    weighted recurrence matrix as input and specifying `aggregate=np.average`.

    Similarly, setting `aggregate=np.median` produces sparse de-noising
    as in REPET-SIM [2]_.

    .. [1] Buades, A., Coll, B., & Morel, J. M.
        (2005, June). A non-local algorithm for image denoising.
        In Computer Vision and Pattern Recognition, 2005.
        CVPR 2005. IEEE Computer Society Conference on (Vol. 2, pp. 60-65). IEEE.

    .. [2] Rafii, Z., & Pardo, B.
        (2012, October).  "Music/Voice Separation Using the Similarity Matrix."
        International Society for Music Information Retrieval Conference, 2012.

    Parameters
    ----------
    S : np.ndarray
        The input data (spectrogram) to filter

    rec : (optional) scipy.sparse.spmatrix or np.ndarray
        Optionally, a pre-computed nearest-neighbor matrix
        as provided by `librosa.segment.recurrence_matrix`

    aggregate : function
        aggregation function (default: `np.mean`)

        If `aggregate=np.average`, then a weighted average is
        computed according to the (per-row) weights in `rec`.

        For all other aggregation functions, all neighbors
        are treated equally.


    axis : int
        The axis along which to filter (by default, columns)

    kwargs
        Additional keyword arguments provided to
        `librosa.segment.recurrence_matrix` if `rec` is not provided

    Returns
    -------
    S_filtered : np.ndarray
        The filtered data

    Raises
    ------
    ParameterError
        if `rec` is provided and its shape is incompatible with `S`.

    See also
    --------
    decompose
    hpss
    librosa.segment.recurrence_matrix


    Notes
    -----
    This function caches at level 30.


    Examples
    --------

    De-noise a chromagram by non-local median filtering.
    By default this would use euclidean distance to select neighbors,
    but this can be overridden directly by setting the `metric` parameter.

    >>> y, sr = librosa.load(librosa.util.example_audio_file(),
    ...                      offset=30, duration=10)
    >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    >>> chroma_med = librosa.decompose.nn_filter(chroma,
    ...                                          aggregate=np.median,
    ...                                          metric='cosine')

    To use non-local means, provide an affinity matrix and `aggregate=np.average`.

    >>> rec = librosa.segment.recurrence_matrix(chroma, mode='affinity',
    ...                                         metric='cosine', sparse=True)
    >>> chroma_nlm = librosa.decompose.nn_filter(chroma, rec=rec,
    ...                                          aggregate=np.average)

    >>> import matplotlib.pyplot as plt
    >>> plt.figure(figsize=(10, 8))
    >>> plt.subplot(5, 1, 1)
    >>> librosa.display.specshow(chroma, y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Unfiltered')
    >>> plt.subplot(5, 1, 2)
    >>> librosa.display.specshow(chroma_med, y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Median-filtered')
    >>> plt.subplot(5, 1, 3)
    >>> librosa.display.specshow(chroma_nlm, y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Non-local means')
    >>> plt.subplot(5, 1, 4)
    >>> librosa.display.specshow(chroma - chroma_med,
    ...                          y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Original - median')
    >>> plt.subplot(5, 1, 5)
    >>> librosa.display.specshow(chroma - chroma_nlm,
    ...                          y_axis='chroma', x_axis='time')
    >>> plt.colorbar()
    >>> plt.title('Original - NLM')
    >>> plt.tight_layout()
    >>> plt.show()
    '''
    if aggregate is None:
        aggregate = np.mean

    if rec is None:
        kwargs = dict(kwargs)
        kwargs['sparse'] = True
        rec = segment.recurrence_matrix(S, axis=axis, **kwargs)
    elif not scipy.sparse.issparse(rec):
        rec = scipy.sparse.csc_matrix(rec)

    if rec.shape[0] != S.shape[axis] or rec.shape[0] != rec.shape[1]:
        raise ParameterError('Invalid self-similarity matrix shape '
                             'rec.shape={} for S.shape={}'.format(rec.shape,
                                                                  S.shape))

    return __nn_filter_helper(rec.data, rec.indices, rec.indptr,
                              S.swapaxes(0, axis), aggregate).swapaxes(0, axis)


def __nn_filter_helper(R_data, R_indices, R_ptr, S, aggregate):
    '''Nearest-neighbor filter helper function.

    This is an internal function, not for use outside of the decompose module.

    It applies the nearest-neighbor filter to S, assuming that the first index
    corresponds to observations.

    Parameters
    ----------
    R_data, R_indices, R_ptr : np.ndarrays
        The `data`, `indices`, and `indptr` of a scipy.sparse matrix

    S : np.ndarray
        The observation data to filter

    aggregate : callable
        The aggregation operator


    Returns
    -------
    S_out : np.ndarray like S
        The filtered data array
    '''
    s_out = np.empty_like(S)

    for i in range(len(R_ptr)-1):

        # Get the non-zeros out of the recurrence matrix
        targets = R_indices[R_ptr[i]:R_ptr[i+1]]

        if not len(targets):
            s_out[i] = S[i]
            continue

        neighbors = np.take(S, targets, axis=0)

        if aggregate is np.average:
            weights = R_data[R_ptr[i]:R_ptr[i+1]]
            s_out[i] = aggregate(neighbors, axis=0, weights=weights)
        else:
            s_out[i] = aggregate(neighbors, axis=0)

    return s_out