#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rhythmic feature extraction"""
import numpy as np
import scipy
from .. import util
from .._cache import cache
from ..core.audio import autocorrelate
from ..core.spectrum import stft
from ..core.convert import tempo_frequencies, time_to_frames
from ..core.harmonic import f0_harmonics
from ..util.exceptions import ParameterError
from ..filters import get_window
from typing import Optional, Callable, Any
from .._typing import _WindowSpec
__all__ = ["tempogram", "fourier_tempogram", "tempo", "tempogram_ratio"]
# -- Rhythmic features -- #
[docs]def tempogram(
*,
y: Optional[np.ndarray] = None,
sr: float = 22050,
onset_envelope: Optional[np.ndarray] = None,
hop_length: int = 512,
win_length: int = 384,
center: bool = True,
window: _WindowSpec = "hann",
norm: Optional[float] = np.inf,
) -> np.ndarray:
"""Compute the tempogram: local autocorrelation of the onset strength envelope. [#]_
.. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
"Cyclic tempogram - A mid-level tempo representation for music signals."
ICASSP, 2010.
Parameters
----------
y : np.ndarray [shape=(..., n)] or None
Audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
onset_envelope : np.ndarray [shape=(..., n) or (..., m, n)] or None
Optional pre-computed onset strength envelope as provided by
`librosa.onset.onset_strength`.
If multi-dimensional, tempograms are computed independently for each
band (first dimension).
hop_length : int > 0
number of audio samples between successive onset measurements
win_length : int > 0
length of the onset autocorrelation window (in frames/onset measurements)
The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
center : bool
If `True`, onset autocorrelation windows are centered.
If `False`, windows are left-aligned.
window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
A window specification as in `stft`.
norm : {np.inf, -np.inf, 0, float > 0, None}
Normalization mode. Set to `None` to disable normalization.
Returns
-------
tempogram : np.ndarray [shape=(..., win_length, n)]
Localized autocorrelation of the onset strength envelope.
If given multi-band input (``onset_envelope.shape==(m,n)``) then
``tempogram[i]`` is the tempogram of ``onset_envelope[i]``.
Raises
------
ParameterError
if neither ``y`` nor ``onset_envelope`` are provided
if ``win_length < 1``
See Also
--------
fourier_tempogram
librosa.onset.onset_strength
librosa.util.normalize
librosa.stft
Examples
--------
>>> # Compute local onset autocorrelation
>>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
>>> hop_length = 512
>>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
>>> tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
... hop_length=hop_length)
>>> # Compute global onset autocorrelation
>>> ac_global = librosa.autocorrelate(oenv, max_size=tempogram.shape[0])
>>> ac_global = librosa.util.normalize(ac_global)
>>> # Estimate the global tempo for display purposes
>>> tempo = librosa.feature.tempo(onset_envelope=oenv, sr=sr,
... hop_length=hop_length)[0]
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=4, figsize=(10, 10))
>>> times = librosa.times_like(oenv, sr=sr, hop_length=hop_length)
>>> ax[0].plot(times, oenv, label='Onset strength')
>>> ax[0].label_outer()
>>> ax[0].legend(frameon=True)
>>> librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length,
>>> x_axis='time', y_axis='tempo', cmap='magma',
... ax=ax[1])
>>> ax[1].axhline(tempo, color='w', linestyle='--', alpha=1,
... label='Estimated tempo={:g}'.format(tempo))
>>> ax[1].legend(loc='upper right')
>>> ax[1].set(title='Tempogram')
>>> x = np.linspace(0, tempogram.shape[0] * float(hop_length) / sr,
... num=tempogram.shape[0])
>>> ax[2].plot(x, np.mean(tempogram, axis=1), label='Mean local autocorrelation')
>>> ax[2].plot(x, ac_global, '--', alpha=0.75, label='Global autocorrelation')
>>> ax[2].set(xlabel='Lag (seconds)')
>>> ax[2].legend(frameon=True)
>>> freqs = librosa.tempo_frequencies(tempogram.shape[0], hop_length=hop_length, sr=sr)
>>> ax[3].semilogx(freqs[1:], np.mean(tempogram[1:], axis=1),
... label='Mean local autocorrelation', base=2)
>>> ax[3].semilogx(freqs[1:], ac_global[1:], '--', alpha=0.75,
... label='Global autocorrelation', base=2)
>>> ax[3].axvline(tempo, color='black', linestyle='--', alpha=.8,
... label='Estimated tempo={:g}'.format(tempo))
>>> ax[3].legend(frameon=True)
>>> ax[3].set(xlabel='BPM')
>>> ax[3].grid(True)
"""
from ..onset import onset_strength
if win_length < 1:
raise ParameterError("win_length must be a positive integer")
ac_window = get_window(window, win_length, fftbins=True)
if onset_envelope is None:
if y is None:
raise ParameterError("Either y or onset_envelope must be provided")
onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)
# Center the autocorrelation windows
n = onset_envelope.shape[-1]
if center:
padding = [(0, 0) for _ in onset_envelope.shape]
padding[-1] = (int(win_length // 2),) * 2
onset_envelope = np.pad(
onset_envelope, padding, mode="linear_ramp", end_values=[0, 0]
)
# Carve onset envelope into frames
odf_frame = util.frame(onset_envelope, frame_length=win_length, hop_length=1)
# Truncate to the length of the original signal
if center:
odf_frame = odf_frame[..., :n]
# explicit broadcast of ac_window
ac_window = util.expand_to(ac_window, ndim=odf_frame.ndim, axes=-2)
# Window, autocorrelate, and normalize
return util.normalize(
autocorrelate(odf_frame * ac_window, axis=-2), norm=norm, axis=-2
)
[docs]def fourier_tempogram(
*,
y: Optional[np.ndarray] = None,
sr: float = 22050,
onset_envelope: Optional[np.ndarray] = None,
hop_length: int = 512,
win_length: int = 384,
center: bool = True,
window: _WindowSpec = "hann",
) -> np.ndarray:
"""Compute the Fourier tempogram: the short-time Fourier transform of the
onset strength envelope. [#]_
.. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
"Cyclic tempogram - A mid-level tempo representation for music signals."
ICASSP, 2010.
Parameters
----------
y : np.ndarray [shape=(..., n)] or None
Audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
onset_envelope : np.ndarray [shape=(..., n)] or None
Optional pre-computed onset strength envelope as provided by
``librosa.onset.onset_strength``.
Multi-channel is supported.
hop_length : int > 0
number of audio samples between successive onset measurements
win_length : int > 0
length of the onset window (in frames/onset measurements)
The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
center : bool
If `True`, onset windows are centered.
If `False`, windows are left-aligned.
window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
A window specification as in `stft`.
Returns
-------
tempogram : np.ndarray [shape=(..., win_length // 2 + 1, n)]
Complex short-time Fourier transform of the onset envelope.
Raises
------
ParameterError
if neither ``y`` nor ``onset_envelope`` are provided
if ``win_length < 1``
See Also
--------
tempogram
librosa.onset.onset_strength
librosa.util.normalize
librosa.stft
Examples
--------
>>> # Compute local onset autocorrelation
>>> y, sr = librosa.load(librosa.ex('nutcracker'))
>>> hop_length = 512
>>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
>>> tempogram = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr,
... hop_length=hop_length)
>>> # Compute the auto-correlation tempogram, unnormalized to make comparison easier
>>> ac_tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
... hop_length=hop_length, norm=None)
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=3, sharex=True)
>>> ax[0].plot(librosa.times_like(oenv), oenv, label='Onset strength')
>>> ax[0].legend(frameon=True)
>>> ax[0].label_outer()
>>> librosa.display.specshow(np.abs(tempogram), sr=sr, hop_length=hop_length,
>>> x_axis='time', y_axis='fourier_tempo', cmap='magma',
... ax=ax[1])
>>> ax[1].set(title='Fourier tempogram')
>>> ax[1].label_outer()
>>> librosa.display.specshow(ac_tempogram, sr=sr, hop_length=hop_length,
>>> x_axis='time', y_axis='tempo', cmap='magma',
... ax=ax[2])
>>> ax[2].set(title='Autocorrelation tempogram')
"""
from ..onset import onset_strength
if win_length < 1:
raise ParameterError("win_length must be a positive integer")
if onset_envelope is None:
if y is None:
raise ParameterError("Either y or onset_envelope must be provided")
onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)
# Generate the short-time Fourier transform
return stft(
onset_envelope, n_fft=win_length, hop_length=1, center=center, window=window
)
[docs]@cache(level=30)
def tempo(
*,
y: Optional[np.ndarray] = None,
sr: float = 22050,
onset_envelope: Optional[np.ndarray] = None,
tg: Optional[np.ndarray] = None,
hop_length: int = 512,
start_bpm: float = 120,
std_bpm: float = 1.0,
ac_size: float = 8.0,
max_tempo: Optional[float] = 320.0,
aggregate: Optional[Callable[..., Any]] = np.mean,
prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
"""Estimate the tempo (beats per minute)
Parameters
----------
y : np.ndarray [shape=(..., n)] or None
audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of the time series
onset_envelope : np.ndarray [shape=(..., n)]
pre-computed onset strength envelope
tg : np.ndarray
pre-computed tempogram. If provided, then `y` and
`onset_envelope` are ignored, and `win_length` is
inferred from the shape of the tempogram.
hop_length : int > 0 [scalar]
hop length of the time series
start_bpm : float [scalar]
initial guess of the BPM
std_bpm : float > 0 [scalar]
standard deviation of tempo distribution
ac_size : float > 0 [scalar]
length (in seconds) of the auto-correlation window
max_tempo : float > 0 [scalar, optional]
If provided, only estimate tempo below this threshold
aggregate : callable [optional]
Aggregation function for estimating global tempo.
If `None`, then tempo is estimated independently for each frame.
prior : scipy.stats.rv_continuous [optional]
A prior distribution over tempo (in beats per minute).
By default, a pseudo-log-normal prior is used.
If given, ``start_bpm`` and ``std_bpm`` will be ignored.
Returns
-------
tempo : np.ndarray
estimated tempo (beats per minute).
If input is multi-channel, one tempo estimate per channel is provided.
See Also
--------
librosa.onset.onset_strength
librosa.feature.tempogram
Notes
-----
This function caches at level 30.
Examples
--------
>>> # Estimate a static tempo
>>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
>>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
>>> tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr)
>>> tempo
array([143.555])
>>> # Or a static tempo with a uniform prior instead
>>> import scipy.stats
>>> prior = scipy.stats.uniform(30, 300) # uniform over 30-300 BPM
>>> utempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr, prior=prior)
>>> utempo
array([161.499])
>>> # Or a dynamic tempo
>>> dtempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
... aggregate=None)
>>> dtempo
array([ 89.103, 89.103, 89.103, ..., 123.047, 123.047, 123.047])
>>> # Dynamic tempo with a proper log-normal prior
>>> prior_lognorm = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
>>> dtempo_lognorm = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
... aggregate=None,
... prior=prior_lognorm)
>>> dtempo_lognorm
array([ 89.103, 89.103, 89.103, ..., 123.047, 123.047, 123.047])
Plot the estimated tempo against the onset autocorrelation
>>> import matplotlib.pyplot as plt
>>> # Convert to scalar
>>> tempo = tempo.item()
>>> utempo = utempo.item()
>>> # Compute 2-second windowed autocorrelation
>>> hop_length = 512
>>> ac = librosa.autocorrelate(onset_env, max_size=2 * sr // hop_length)
>>> freqs = librosa.tempo_frequencies(len(ac), sr=sr,
... hop_length=hop_length)
>>> # Plot on a BPM axis. We skip the first (0-lag) bin.
>>> fig, ax = plt.subplots()
>>> ax.semilogx(freqs[1:], librosa.util.normalize(ac)[1:],
... label='Onset autocorrelation', base=2)
>>> ax.axvline(tempo, 0, 1, alpha=0.75, linestyle='--', color='r',
... label='Tempo (default prior): {:.2f} BPM'.format(tempo))
>>> ax.axvline(utempo, 0, 1, alpha=0.75, linestyle=':', color='g',
... label='Tempo (uniform prior): {:.2f} BPM'.format(utempo))
>>> ax.set(xlabel='Tempo (BPM)', title='Static tempo estimation')
>>> ax.grid(True)
>>> ax.legend()
Plot dynamic tempo estimates over a tempogram
>>> fig, ax = plt.subplots()
>>> tg = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr,
... hop_length=hop_length)
>>> librosa.display.specshow(tg, x_axis='time', y_axis='tempo', cmap='magma', ax=ax)
>>> ax.plot(librosa.times_like(dtempo), dtempo,
... color='c', linewidth=1.5, label='Tempo estimate (default prior)')
>>> ax.plot(librosa.times_like(dtempo_lognorm), dtempo_lognorm,
... color='c', linewidth=1.5, linestyle='--',
... label='Tempo estimate (lognorm prior)')
>>> ax.set(title='Dynamic tempo estimation')
>>> ax.legend()
"""
if start_bpm <= 0:
raise ParameterError("start_bpm must be strictly positive")
if tg is None:
win_length = time_to_frames(ac_size, sr=sr, hop_length=hop_length).item()
tg = tempogram(
y=y,
sr=sr,
onset_envelope=onset_envelope,
hop_length=hop_length,
win_length=win_length,
)
else:
# Override window length by what's actually given
win_length = tg.shape[-2]
# Eventually, we want this to work for time-varying tempo
if aggregate is not None:
tg = aggregate(tg, axis=-1, keepdims=True)
assert tg is not None
# Get the BPM values for each bin, skipping the 0-lag bin
bpms = tempo_frequencies(win_length, hop_length=hop_length, sr=sr)
# Weight the autocorrelation by a log-normal distribution
if prior is None:
logprior = -0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm) ** 2
else:
logprior = prior.logpdf(bpms)
# Kill everything above the max tempo
if max_tempo is not None:
max_idx = int(np.argmax(bpms < max_tempo))
logprior[:max_idx] = -np.inf
# explicit axis expansion
logprior = util.expand_to(logprior, ndim=tg.ndim, axes=-2)
# Get the maximum, weighted by the prior
# Using log1p here for numerical stability
best_period = np.argmax(np.log1p(1e6 * tg) + logprior, axis=-2)
tempo_est: np.ndarray = np.take(bpms, best_period)
return tempo_est
[docs]@cache(level=40)
def tempogram_ratio(
*,
y: Optional[np.ndarray] = None,
sr: float = 22050,
onset_envelope: Optional[np.ndarray] = None,
tg: Optional[np.ndarray] = None,
bpm: Optional[np.ndarray] = None,
hop_length: int = 512,
win_length: int = 384,
start_bpm: float = 120,
std_bpm: float = 1.0,
max_tempo: Optional[float] = 320.0,
freqs: Optional[np.ndarray] = None,
factors: Optional[np.ndarray] = None,
aggregate: Optional[Callable[..., Any]] = None,
prior: Optional[scipy.stats.rv_continuous] = None,
center: bool = True,
window: _WindowSpec = "hann",
kind: str = "linear",
fill_value: float = 0,
norm: Optional[float] = np.inf,
) -> np.ndarray:
"""Tempogram ratio features, also known as spectral rhythm patterns. [1]_
This function summarizes the energy at metrically important multiples
of the tempo. For example, if the tempo corresponds to the quarter-note
period, the tempogram ratio will measure the energy at the eighth note,
sixteenth note, half note, whole note, etc. periods, as well as dotted
and triplet ratios.
By default, the multiplicative factors used here are as specified by
[2]_. If the estimated tempo corresponds to a quarter note, these factors
will measure relative energy at the following metrical subdivisions:
+-------+--------+------------------+
| Index | Factor | Description |
+=======+========+==================+
| 0 | 4 | Sixteenth note |
+-------+--------+------------------+
| 1 | 8/3 | Dotted sixteenth |
+-------+--------+------------------+
| 2 | 3 | Eighth triplet |
+-------+--------+------------------+
| 3 | 2 | Eighth note |
+-------+--------+------------------+
| 4 | 4/3 | Dotted eighth |
+-------+--------+------------------+
| 5 | 3/2 | Quarter triplet |
+-------+--------+------------------+
| 6 | 1 | Quarter note |
+-------+--------+------------------+
| 7 | 2/3 | Dotted quarter |
+-------+--------+------------------+
| 8 | 3/4 | Half triplet |
+-------+--------+------------------+
| 9 | 1/2 | Half note |
+-------+--------+------------------+
| 10 | 1/3 | Dotted half note |
+-------+--------+------------------+
| 11 | 3/8 | Whole triplet |
+-------+--------+------------------+
| 12 | 1/4 | Whole note |
+-------+--------+------------------+
.. [1] Peeters, Geoffroy.
"Rhythm Classification Using Spectral Rhythm Patterns."
In ISMIR, pp. 644-647. 2005.
.. [2] Prockup, Matthew, Andreas F. Ehmann, Fabien Gouyon, Erik M. Schmidt, and Youngmoo E. Kim.
"Modeling musical rhythm at scale with the music genome project."
In 2015 IEEE workshop on applications of signal processing to audio and acoustics (WASPAA), pp. 1-5. IEEE, 2015.
Parameters
----------
y : np.ndarray [shape=(..., n)] or None
audio time series
sr : number > 0 [scalar]
sampling rate of the time series
onset_envelope : np.ndarray [shape=(..., n)]
pre-computed onset strength envelope
tg : np.ndarray
pre-computed tempogram. If provided, then `y` and
`onset_envelope` are ignored, and `win_length` is
inferred from the shape of the tempogram.
bpm : np.ndarray
pre-computed tempo estimate. This must be a per-frame
estimate, and have dimension compatible with `tg`.
hop_length : int > 0 [scalar]
hop length of the time series
win_length : int > 0 [scalar]
window length of the autocorrelation window for tempogram
calculation
start_bpm : float [scalar]
initial guess of the BPM if `bpm` is not provided
std_bpm : float > 0 [scalar]
standard deviation of tempo distribution
max_tempo : float > 0 [scalar, optional]
If provided, only estimate tempo below this threshold
freqs : np.ndarray
Frequencies (in BPM) of the tempogram axis.
factors : np.ndarray
Multiples of the fundamental tempo (bpm) to estimate.
If not provided, the factors are as specified above.
prior : scipy.stats.rv_continuous [optional]
A prior distribution over tempo (in beats per minute).
By default, a pseudo-log-normal prior is used.
If given, ``start_bpm`` and ``std_bpm`` will be ignored.
center : bool
If `True`, onset windows are centered.
If `False`, windows are left-aligned.
aggregate : callable [optional]
Aggregation function for estimating global tempogram ratio.
If `None`, then ratios are estimated independently for each frame.
window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
A window specification as in `stft`.
kind : str
Interpolation mode for measuring tempogram ratios
fill_value : float
The value to fill when extrapolating beyond the observed
frequency range.
norm : {np.inf, -np.inf, 0, float > 0, None}
Normalization mode. Set to `None` to disable normalization.
Returns
-------
tgr : np.ndarray
The tempogram ratio for the specified factors.
If `aggregate` is provided, the trailing time axis
will be removed.
If `aggregate` is not provided (default), ratios
will be estimated for each frame.
See Also
--------
tempogram
tempo
librosa.f0_harmonics
librosa.tempo_frequencies
Examples
--------
Compute tempogram ratio features using the default factors
for a waltz (3/4 time)
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.ex('sweetwaltz'))
>>> tempogram = librosa.feature.tempogram(y=y, sr=sr)
>>> tgr = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
>>> fig, ax = plt.subplots(nrows=2, sharex=True)
>>> librosa.display.specshow(tempogram, x_axis='time', y_axis='tempo',
... ax=ax[0])
>>> librosa.display.specshow(tgr, x_axis='time', ax=ax[1])
>>> ax[0].label_outer()
>>> ax[0].set(title="Tempogram")
>>> ax[1].set(title="Tempogram ratio")
"""
# Get a tempogram and time-varying tempo estimate
if tg is None:
tg = tempogram(
y=y,
sr=sr,
onset_envelope=onset_envelope,
hop_length=hop_length,
win_length=win_length,
center=center,
window=window,
norm=norm,
)
if freqs is None:
freqs = tempo_frequencies(sr=sr, n_bins=len(tg), hop_length=hop_length)
# Estimate tempo per-frame, no aggregation yet
if bpm is None:
bpm = tempo(
sr=sr,
tg=tg,
hop_length=hop_length,
start_bpm=start_bpm,
std_bpm=std_bpm,
max_tempo=max_tempo,
aggregate=None,
prior=prior,
)
if factors is None:
# metric multiples from Prockup'15
factors = np.array(
[4, 8 / 3, 3, 2, 4 / 3, 3 / 2, 1, 2 / 3, 3 / 4, 1 / 2, 1 / 3, 3 / 8, 1 / 4]
)
tgr = f0_harmonics(
tg, freqs=freqs, f0=bpm, harmonics=factors, kind=kind, fill_value=fill_value
)
if aggregate is not None:
return aggregate(tgr, axis=-1) # type: ignore
return tgr