Caution

You're reading the documentation for a development version. For the latest released version, please have a look at 0.10.2.

Source code for librosa.onset

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Onset detection
===============
.. autosummary::
    :toctree: generated/

    onset_detect
    onset_backtrack
    onset_strength
    onset_strength_multi
"""

import numpy as np
import scipy

from ._cache import cache
from . import core
from . import util
from .util.exceptions import ParameterError

from .feature.spectral import melspectrogram
from typing import Any, Callable, Iterable, Optional, Union, Sequence

# Public names of this module; all four are defined as functions below.
__all__ = ["onset_detect", "onset_strength", "onset_strength_multi", "onset_backtrack"]


def onset_detect(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    backtrack: bool = False,
    energy: Optional[np.ndarray] = None,
    units: str = "frames",
    normalize: bool = True,
    sparse: bool = True,
    **kwargs: Any,
) -> np.ndarray:
    """Locate note onset events by picking peaks in an onset strength envelope.

    The `peak_pick` parameters were chosen by large-scale hyper-parameter
    optimization over the dataset provided by [#]_.

    .. [#] https://github.com/CPJKU/onset_db

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time-series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., m)]
        (optional) pre-computed onset strength envelope.
        Integer-valued envelopes are accepted; normalization promotes
        them to floating point.
    hop_length : int > 0 [scalar]
        hop length (in samples)
    units : {'frames', 'samples', 'time'}
        The units to encode detected onset events in.
        By default, 'frames' are used.
        Only consulted when ``sparse=True``.
    backtrack : bool
        If ``True``, detected onset events are backtracked to the nearest
        preceding minimum of ``energy``.

        This is primarily useful when using onsets as slice points for segmentation.

        .. note:: backtracking is only supported if ``sparse=True``.
    energy : np.ndarray [shape=(m,)] (optional)
        An energy function to use for backtracking detected onset events.
        If none is provided, then ``onset_envelope`` is used.
    normalize : bool
        If ``True`` (default), normalize the onset envelope to have minimum of 0 and
        maximum of 1 prior to detection.  This is helpful for standardizing the
        parameters of `librosa.util.peak_pick`.

        Otherwise, the onset envelope is left unnormalized.
    sparse : bool
        If ``True`` (default), detections are returned as an array of frames,
        samples, or time indices (as specified by ``units=``).

        If ``False``, detections are encoded as a dense boolean array where
        ``onsets[n]`` is True if there's an onset at frame index ``n``.

        .. note:: multi-channel input is only supported if ``sparse=False``.
    **kwargs : additional keyword arguments
        Additional parameters for peak picking.

        See `librosa.util.peak_pick` for details.

    Returns
    -------
    onsets : np.ndarray [shape=(n_onsets,) or onset_envelope.shape]
        estimated positions of detected onsets, in whichever units
        are specified.  By default, frame indices.

        If `sparse=False`, `onsets[..., n]` indicates an onset
        detection at frame index `n`.

        .. note::
            If no onset strength could be detected, onset_detect returns
            an empty array (sparse=True) or all-False array (sparse=False).

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided

        or if ``backtrack=True`` is combined with ``sparse=False``

        or if ``units`` is not one of 'frames', 'samples', or 'time'

    See Also
    --------
    onset_strength : compute onset strength per-frame
    onset_backtrack : backtracking onset events
    librosa.util.peak_pick : pick peaks from a time series

    Examples
    --------
    Get onset times from a signal

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> librosa.onset.onset_detect(y=y, sr=sr, units='time')
    array([0.07 , 0.232, 0.395, 0.604, 0.743, 0.929, 1.045, 1.115,
           1.416, 1.672, 1.881, 2.043, 2.206, 2.368, 2.554, 3.019])

    Or use a pre-computed onset envelope

    >>> o_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> times = librosa.times_like(o_env, sr=sr)
    >>> onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)

    >>> import matplotlib.pyplot as plt
    >>> D = np.abs(librosa.stft(y))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                          x_axis='time', y_axis='log', ax=ax[0], sr=sr)
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()
    >>> ax[1].plot(times, o_env, label='Onset strength')
    >>> ax[1].vlines(times[onset_frames], 0, o_env.max(), color='r', alpha=0.9,
    ...            linestyle='--', label='Onsets')
    >>> ax[1].legend()
    """
    # First, get the frame->onset strength profile if we don't already have one
    if onset_envelope is None:
        if y is None:
            raise ParameterError("y or onset_envelope must be provided")

        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Shift onset envelope up to be non-negative
    # (a common normalization step to make the threshold more consistent)
    if normalize:
        # Normalize onset strength function to [0, 1] range
        # Normalization is performed over the trailing axis
        onset_envelope = onset_envelope - np.min(onset_envelope, keepdims=True, axis=-1)
        # Max-scale with safe division.
        # The division is deliberately out-of-place: an in-place `/=` cannot
        # store a floating-point quotient into an integer-dtype envelope and
        # raises a casting error, whereas this form promotes to float.
        onset_envelope = onset_envelope / (
            np.max(onset_envelope, keepdims=True, axis=-1) + util.tiny(onset_envelope)
        )

    # help out mypy
    assert onset_envelope is not None

    # Do we have any onsets to grab?
    # An all-zero envelope, or one containing NaN/inf, yields no detections.
    if not onset_envelope.any() or not np.all(np.isfinite(onset_envelope)):
        if sparse:
            onsets = np.array([], dtype=int)
        else:
            onsets = np.zeros_like(onset_envelope, dtype=bool)

    else:
        # These parameter settings found by large-scale search.
        # Durations are converted from seconds to frames; caller-supplied
        # values in kwargs take precedence over these defaults.
        kwargs.setdefault("pre_max", 0.03 * sr // hop_length)  # 30ms
        kwargs.setdefault("post_max", 0.00 * sr // hop_length + 1)  # 0ms
        kwargs.setdefault("pre_avg", 0.10 * sr // hop_length)  # 100ms
        kwargs.setdefault("post_avg", 0.10 * sr // hop_length + 1)  # 100ms
        kwargs.setdefault("wait", 0.03 * sr // hop_length)  # 30ms
        kwargs.setdefault("delta", 0.07)

        # Peak pick the onset envelope
        onsets = util.peak_pick(onset_envelope, sparse=sparse, axis=-1, **kwargs)

        # Optionally backtrack the events
        if backtrack:
            if not sparse:
                raise ParameterError("onset backtracking is only supported if sparse=True")
            if energy is None:
                energy = onset_envelope

            assert energy is not None
            onsets = onset_backtrack(onsets, energy)

    # Unit conversion only makes sense for index-valued (sparse) output
    if sparse:
        if units == "frames":
            pass
        elif units == "samples":
            onsets = core.frames_to_samples(onsets, hop_length=hop_length)
        elif units == "time":
            onsets = core.frames_to_time(onsets, hop_length=hop_length, sr=sr)
        else:
            raise ParameterError(f"Invalid unit type: {units}")

    return onsets
def onset_strength(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    lag: int = 1,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    detrend: bool = False,
    center: bool = True,
    feature: Optional[Callable] = None,
    aggregate: Optional[Union[Callable, bool]] = None,
    **kwargs: Any,
) -> np.ndarray:
    """Compute a spectral flux onset strength envelope.

    Onset strength at time ``t`` is determined by::

        mean_f max(0, S[f, t] - ref[f, t - lag])

    where ``ref`` is ``S`` after local max filtering along the frequency
    axis [#]_.

    By default, if a time series ``y`` is provided, S will be the
    log-power Mel spectrogram.

    This is a single-band convenience wrapper: all work is delegated to
    `onset_strength_multi` with ``channels=None``, and the one resulting
    band is returned.

    .. [#] Böck, Sebastian, and Gerhard Widmer.
           "Maximum filter vibrato suppression for onset detection."
           16th International Conference on Digital Audio Effects,
           Maynooth, Ireland. 2013.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time-series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    S : np.ndarray [shape=(..., d, m)]
        pre-computed (log-power) spectrogram
    lag : int > 0
        time lag for computing differences
    max_size : int > 0
        size (in frequency bins) of the local max filter.
        set to `1` to disable filtering.
    ref : None or np.ndarray [shape=(..., d, m)]
        An optional pre-computed reference spectrum, of the same shape as ``S``.
        If not provided, it will be computed from ``S``.
        If provided, it will override any local max filtering governed by ``max_size``.
    detrend : bool [scalar]
        Filter the onset strength to remove the DC component
    center : bool [scalar]
        Shift the onset function by ``n_fft // (2 * hop_length)`` frames.
        This corresponds to using a centered frame analysis in the short-time Fourier
        transform.
    feature : function
        Function for computing time-series features, eg, scaled spectrograms.
        By default, uses `librosa.feature.melspectrogram` with ``fmax=sr/2``
    aggregate : function
        Aggregation function to use when combining onsets
        at different frequency bins.
        ``False`` is not permitted here.

        Default: `np.mean`
    **kwargs : additional keyword arguments
        Forwarded unchanged to `onset_strength_multi`
        (and from there to ``feature()``, if ``S`` is not provided).

    Returns
    -------
    onset_envelope : np.ndarray [shape=(..., m,)]
        vector containing the onset strength envelope.
        If the input contains multiple channels, then onset envelope is computed for each channel.

    Raises
    ------
    ParameterError
        if ``aggregate`` is ``False``

        or if ``lag`` or ``max_size`` are not positive integers
        (raised by `onset_strength_multi`)

    See Also
    --------
    onset_detect
    onset_strength_multi

    Examples
    --------
    First, load some audio and plot the spectrogram

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> D = np.abs(librosa.stft(y))
    >>> times = librosa.times_like(D, sr=sr)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                          y_axis='log', x_axis='time', ax=ax[0], sr=sr)
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()

    Construct a standard onset function

    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> ax[1].plot(times, 2 + onset_env / onset_env.max(), alpha=0.8,
    ...            label='Mean (mel)')

    Median aggregation, and custom mel options

    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr,
    ...                                          aggregate=np.median,
    ...                                          fmax=8000, n_mels=256)
    >>> ax[1].plot(times, 1 + onset_env / onset_env.max(), alpha=0.8,
    ...            label='Median (custom mel)')

    Constant-Q spectrogram instead of Mel

    >>> C = np.abs(librosa.cqt(y=y, sr=sr))
    >>> onset_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
    >>> ax[1].plot(times, onset_env / onset_env.max(), alpha=0.8,
    ...          label='Mean (CQT)')
    >>> ax[1].legend()
    >>> ax[1].set(ylabel='Normalized strength', yticks=[])
    """
    # Without aggregation there is no single full-spectrum envelope to return.
    if aggregate is False:
        message = "aggregate parameter cannot be False when computing full-spectrum onset strength."
        raise ParameterError(message)

    # channels=None makes the multi-band routine emit exactly one band
    # that spans every frequency bin.
    per_band = onset_strength_multi(
        y=y,
        sr=sr,
        S=S,
        lag=lag,
        max_size=max_size,
        ref=ref,
        detrend=detrend,
        center=center,
        feature=feature,
        aggregate=aggregate,
        channels=None,
        **kwargs,
    )

    # Drop the singleton band axis (second to last).
    return per_band[..., 0, :]
def onset_backtrack(events: np.ndarray, energy: np.ndarray) -> np.ndarray:
    """Backtrack detected onset events to the nearest preceding local
    minimum of an energy function.

    This function can be used to roll back the timing of detected onsets
    from a detected peak amplitude to the preceding minimum.

    This is most useful when using onsets to determine slice points for
    segmentation, as described by [#]_.

    .. [#] Jehan, Tristan.
           "Creating music by listening"
           Doctoral dissertation
           Massachusetts Institute of Technology, 2005.

    Parameters
    ----------
    events : np.ndarray, dtype=int
        List of onset event frame indices, as computed by `onset_detect`.
        May be empty, in which case an empty integer array is returned.
    energy : np.ndarray, shape=(m,)
        An energy function

    Returns
    -------
    events_backtracked : np.ndarray, shape=events.shape
        The input events matched to nearest preceding minima of ``energy``.

    Examples
    --------
    Backtrack the events using the onset envelope

    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr)
    >>> times = librosa.times_like(oenv, sr=sr)
    >>> # Detect events without backtracking
    >>> onset_raw = librosa.onset.onset_detect(onset_envelope=oenv,
    ...                                        backtrack=False)
    >>> onset_bt = librosa.onset.onset_backtrack(onset_raw, oenv)

    Backtrack the events using the RMS values

    >>> S = np.abs(librosa.stft(y=y))
    >>> rms = librosa.feature.rms(S=S)
    >>> onset_bt_rms = librosa.onset.onset_backtrack(onset_raw, rms[0])

    Plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
    ...                          y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].label_outer()
    >>> ax[1].plot(times, oenv, label='Onset strength')
    >>> ax[1].vlines(librosa.frames_to_time(onset_raw), 0, oenv.max(), label='Raw onsets')
    >>> ax[1].vlines(librosa.frames_to_time(onset_bt), 0, oenv.max(), label='Backtracked', color='r')
    >>> ax[1].legend()
    >>> ax[1].label_outer()
    >>> ax[2].plot(times, rms[0], label='RMS')
    >>> ax[2].vlines(librosa.frames_to_time(onset_bt_rms), 0, rms.max(), label='Backtracked (RMS)', color='r')
    >>> ax[2].legend()
    """
    # Nothing to backtrack: short-circuit before the event matcher is involved.
    # NOTE(review): util.match_events is believed to reject empty event
    # lists, which would turn "no onsets detected" into an error - confirm.
    if np.size(events) == 0:
        return np.empty(np.shape(events), dtype=int)

    # Find interior local minima of the energy curve, i.e. points with
    #   energy[i] <= energy[i-1]   (not rising on arrival)
    #   energy[i] <  energy[i+1]   (strictly rising on departure)
    minima = np.flatnonzero((energy[1:-1] <= energy[:-2]) & (energy[1:-1] < energy[2:]))

    # Pad on a 0, just in case we have onsets with no preceding minimum
    # Shift by one to account for slicing in minima detection
    minima = util.fix_frames(1 + minima, x_min=0)

    # Only match going left from the detected events
    results: np.ndarray = minima[util.match_events(events, minima, right=False)]
    return results
@cache(level=30)
def onset_strength_multi(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: int = 512,
    lag: int = 1,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    detrend: bool = False,
    center: bool = True,
    feature: Optional[Callable] = None,
    aggregate: Optional[Union[Callable, bool]] = None,
    channels: Optional[Union[Sequence[int], Sequence[slice]]] = None,
    **kwargs: Any,
) -> np.ndarray:
    """Compute a spectral flux onset strength envelope across multiple channels.

    Onset strength for channel ``i`` at time ``t`` is determined by::

        mean_{f in channels[i]} max(0, S[f, t+1] - S[f, t])

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)]
        audio time-series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    S : np.ndarray [shape=(..., d, m)]
        pre-computed (log-power) spectrogram.
        It is promoted to at least two dimensions before use.
    n_fft : int > 0 [scalar]
        FFT window size for use in ``feature()`` if ``S`` is not provided.
        Also determines the frame shift applied when ``center=True``.
    hop_length : int > 0 [scalar]
        hop length for use in ``feature()`` if ``S`` is not provided.
        Also determines the frame shift applied when ``center=True``.
    lag : int > 0
        time lag for computing differences
    max_size : int > 0
        size (in frequency bins) of the local max filter.
        set to `1` to disable filtering.
    ref : None or np.ndarray [shape=(..., d, m)]
        An optional pre-computed reference spectrum, of the same shape as ``S``.
        If not provided, it will be computed from ``S``.
        If provided, it will override any local max filtering governed by ``max_size``.
    detrend : bool [scalar]
        Filter the onset strength to remove the DC component
    center : bool [scalar]
        Shift the onset function by ``n_fft // (2 * hop_length)`` frames.
        This corresponds to using a centered frame analysis in the short-time Fourier
        transform.
    feature : function
        Function for computing time-series features, eg, scaled spectrograms.
        By default, uses `librosa.feature.melspectrogram` with ``fmax=sr/2``

        Must support arguments: ``y, sr, n_fft, hop_length``
    aggregate : function or False
        Aggregation function to use when combining onsets
        at different frequency bins.

        If ``False``, then no aggregation is performed.

        Default: `np.mean`
    channels : list or None
        Array of channel boundaries or slice objects.
        If `None`, then a single channel is generated to span all bands.
    **kwargs : additional keyword arguments
        Additional parameters to ``feature()``, if ``S`` is not provided.

    Returns
    -------
    onset_envelope : np.ndarray [shape=(..., n_channels, m)]
        array containing the onset strength envelope for each specified channel

    Raises
    ------
    ParameterError
        if ``lag`` or ``max_size`` is not a positive integer

        or if ``ref`` is provided with a shape different from ``S``

    See Also
    --------
    onset_strength

    Notes
    -----
    This function caches at level 30.

    No explicit check is made here for ``y`` and ``S`` both being absent:
    when ``S`` is not provided, ``y`` is handed to ``feature()`` as-is.

    Examples
    --------
    First, load some audio and plot the spectrogram

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
    >>> D = np.abs(librosa.stft(y))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> img1 = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                          y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()
    >>> fig.colorbar(img1, ax=[ax[0]], format="%+2.f dB")

    Construct a standard onset function over four sub-bands

    >>> onset_subbands = librosa.onset.onset_strength_multi(y=y, sr=sr,
    ...                                                     channels=[0, 32, 64, 96, 128])
    >>> img2 = librosa.display.specshow(onset_subbands, x_axis='time', ax=ax[1])
    >>> ax[1].set(ylabel='Sub-bands', title='Sub-band onset strength')
    >>> fig.colorbar(img2, ax=[ax[1]])
    """
    # Fill in defaults. The fmax default is only injected for the built-in
    # mel feature; a user-supplied feature receives kwargs untouched.
    if feature is None:
        feature = melspectrogram
        kwargs.setdefault("fmax", 0.5 * sr)

    if aggregate is None:
        aggregate = np.mean

    # Validate the integer controls before doing any heavy computation
    if not util.is_positive_int(lag):
        raise ParameterError(f"lag={lag} must be a positive integer")

    if not util.is_positive_int(max_size):
        raise ParameterError(f"max_size={max_size} must be a positive integer")

    # No spectrogram supplied: derive one from the feature function, in dB
    if S is None:
        magnitude = np.abs(
            feature(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, **kwargs)
        )
        S = core.power_to_db(magnitude)

    # Keeps the type checker aware that S is populated from here on
    assert S is not None

    # Guarantee a (..., frequency, time) layout
    S = np.atleast_2d(S)

    # Settle on the reference spectrogram
    if ref is not None:
        # A caller-supplied reference must line up with S exactly
        if ref.shape != S.shape:
            raise ParameterError(
                f"Reference spectrum shape {ref.shape} must match input spectrum {S.shape}"
            )
    elif max_size == 1:
        # A width-1 max filter is a no-op, so reuse S directly (no copy)
        ref = S
    else:
        ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=-2)

    # Lagged difference against the reference, keeping only increases
    flux = np.maximum(0.0, S[..., lag:] - ref[..., :-lag])

    # With no explicit channels, aggregate one band covering every bin;
    # boundary padding in the sync step is only enabled in that case
    pad = channels is None
    if pad:
        channels = [slice(None)]

    # aggregate=False (or any non-callable) leaves the bins un-aggregated
    if callable(aggregate):
        flux = util.sync(flux, channels, aggregate=aggregate, pad=pad, axis=-2)

    # Left-pad the time axis: `lag` frames were consumed by the difference,
    # plus n_fft // (2 * hop_length) frames to counter-act centered framing
    shift = lag + (n_fft // (2 * hop_length) if center else 0)
    pad_spec = [(0, 0)] * (flux.ndim - 1) + [(int(shift), 0)]
    flux = np.pad(flux, pad_spec, mode="constant")

    # remove the DC component
    if detrend:
        flux = scipy.signal.lfilter([1.0, -1.0], [1.0, -0.99], flux, axis=-1)

    # The centering shift lengthens the envelope; trim to S's frame count
    if center:
        flux = flux[..., : S.shape[-1]]

    return flux