Caution
You're reading an old version of this documentation. If you want up-to-date information, please have a look at 0.9.1.
Source code for librosa.core.pitch
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Pitch-tracking and tuning estimation'''
import warnings
import numpy as np
import six
from .spectrum import _spectrogram
from . import time_frequency
from .._cache import cache
from .. import util
# Public names exported by this module (what `import *` exposes)
__all__ = ['estimate_tuning', 'pitch_tuning', 'piptrack']
def estimate_tuning(y=None, sr=22050, S=None, n_fft=2048,
                    resolution=0.01, bins_per_octave=12, **kwargs):
    '''Estimate the tuning of an audio time series or spectrogram input.

    Pitches are tracked with `piptrack`.  Of the bins where a pitch was
    detected (frequency > 0), only those whose magnitude is at least the
    median magnitude of the detected pitches are passed on to
    `pitch_tuning`.

    Parameters
    ----------
    y: np.ndarray [shape=(n,)] or None
        audio signal

    sr : number > 0 [scalar]
        audio sampling rate of `y`

    S: np.ndarray [shape=(d, t)] or None
        magnitude or power spectrogram

    n_fft : int > 0 [scalar] or None
        number of FFT bins to use, if `y` is provided.

    resolution : float in `(0, 1)`
        Resolution of the tuning as a fraction of a bin.
        0.01 corresponds to measurements in cents.

    bins_per_octave : int > 0 [scalar]
        How many frequency bins per octave

    kwargs : additional keyword arguments
        Additional arguments passed to `piptrack`

    Returns
    -------
    tuning: float in `[-0.5, 0.5)`
        estimated tuning deviation (fractions of a bin)

    See Also
    --------
    piptrack
        Pitch tracking by parabolic interpolation

    Examples
    --------
    >>> # With time-series input
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> librosa.estimate_tuning(y=y, sr=sr)
    0.089999999999999969

    >>> # In tenths of a cent
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> librosa.estimate_tuning(y=y, sr=sr, resolution=1e-3)
    0.093999999999999972

    >>> # Using spectrogram input
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> S = np.abs(librosa.stft(y))
    >>> librosa.estimate_tuning(S=S, sr=sr)
    0.089999999999999969

    >>> # Using pass-through arguments to `librosa.piptrack`
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> librosa.estimate_tuning(y=y, sr=sr, n_fft=8192,
    ...                         fmax=librosa.note_to_hz('G#9'))
    0.070000000000000062
    '''
    pitches, magnitudes = piptrack(y=y, sr=sr, S=S, n_fft=n_fft, **kwargs)

    # Bins where no pitch was detected carry frequency 0; ignore them
    detected = pitches > 0

    # Magnitude cutoff: median over detected pitches, or 0 if none exist
    cutoff = np.median(magnitudes[detected]) if detected.any() else 0.0

    strong_pitches = pitches[(magnitudes >= cutoff) & detected]

    return pitch_tuning(strong_pitches,
                        resolution=resolution,
                        bins_per_octave=bins_per_octave)
def pitch_tuning(frequencies, resolution=0.01, bins_per_octave=12):
    '''Given a collection of pitches, estimate its tuning offset
    (in fractions of a bin) relative to A440=440.0Hz.

    Non-positive and non-finite entries (NaN, infinity) are discarded
    before the estimate is made.  If nothing usable remains, a warning is
    issued and `0.0` is returned.

    Parameters
    ----------
    frequencies : array-like, float
        A collection of frequencies detected in the signal.
        See `piptrack`

    resolution : float in `(0, 1)`
        Resolution of the tuning as a fraction of a bin.
        0.01 corresponds to cents.

    bins_per_octave : int > 0 [scalar]
        How many frequency bins per octave

    Returns
    -------
    tuning: float in `[-0.5, 0.5)`
        estimated tuning deviation (fractions of a bin)

    See Also
    --------
    estimate_tuning
        Estimating tuning from time-series or spectrogram input

    Examples
    --------
    >>> # Generate notes at +25 cents
    >>> freqs = librosa.cqt_frequencies(24, 55, tuning=0.25)
    >>> librosa.pitch_tuning(freqs)
    0.25

    >>> # Track frequencies from a real spectrogram
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> pitches, magnitudes, stft = librosa.ifptrack(y, sr)
    >>> # Select out pitches with high energy
    >>> pitches = pitches[magnitudes > np.median(magnitudes)]
    >>> librosa.pitch_tuning(pitches)
    0.089999999999999969
    '''
    frequencies = np.atleast_1d(frequencies)

    # Trim out any DC (or negative) components, and anything non-finite.
    # NaN already fails the `> 0` comparison, but +inf passes it and would
    # turn into a NaN residual that lands in no histogram bin, making the
    # function silently return the bogus edge value -0.5.
    frequencies = frequencies[(frequencies > 0) & np.isfinite(frequencies)]

    if frequencies.size == 0:
        warnings.warn('Trying to estimate tuning from empty frequency set.')
        return 0.0

    # Compute the residual relative to the number of bins
    residual = np.mod(bins_per_octave *
                      time_frequency.hz_to_octs(frequencies), 1.0)

    # Are we on the wrong side of the semitone?
    # A residual of 0.95 is more likely to be a deviation of -0.05
    # from the next tone up.
    residual[residual >= 0.5] -= 1.0

    # Histogram edges spanning [-0.5, 0.5] in steps of about `resolution`
    bins = np.linspace(-0.5, 0.5, int(np.ceil(1. / resolution)) + 1)

    counts, tuning = np.histogram(residual, bins)

    # return the histogram peak (left edge of the most populated bin)
    return tuning[np.argmax(counts)]
@cache(level=30)
def piptrack(y=None, sr=22050, S=None, n_fft=2048, hop_length=None,
             fmin=150.0, fmax=4000.0, threshold=0.1,
             win_length=None, window='hann', center=True, pad_mode='reflect',
             ref=None):
    '''Pitch tracking on thresholded parabolically-interpolated STFT.

    This implementation uses the parabolic interpolation method described
    by [1]_.

    .. [1] https://ccrma.stanford.edu/~jos/sasp/Sinusoidal_Peak_Interpolation.html

    Parameters
    ----------
    y: np.ndarray [shape=(n,)] or None
        audio signal

    sr : number > 0 [scalar]
        audio sampling rate of `y`

    S: np.ndarray [shape=(d, t)] or None
        magnitude or power spectrogram

    n_fft : int > 0 [scalar] or None
        number of FFT bins to use, if `y` is provided.

    hop_length : int > 0 [scalar] or None
        number of samples to hop

    threshold : float in `(0, 1)`
        A bin in spectrum `S` is considered a pitch when it is greater than
        `threshold*ref(S)`.

        By default, `ref(S)` is taken to be `max(S, axis=0)` (the maximum
        value in each column).

    fmin : float > 0 [scalar]
        lower frequency cutoff.

    fmax : float > 0 [scalar]
        upper frequency cutoff.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.hanning`
        - a vector or array of length `n_fft`

        .. see also:: `filters.get_window`

    center : boolean
        - If `True`, the signal `y` is padded so that frame
          `t` is centered at `y[t * hop_length]`.
        - If `False`, then frame `t` begins at `y[t * hop_length]`

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.

    ref : scalar or callable [default=np.max]
        If scalar, `abs(ref)` is used directly as the value against which
        `S` is compared for determining pitches (`threshold` is not applied
        in this case).
        If callable, the reference value is computed as
        `threshold * ref(S, axis=0)`.

    .. note::
        One of `S` or `y` must be provided.

        If `S` is not given, it is computed from `y` using
        the default parameters of `librosa.core.stft`.

    Returns
    -------
    pitches : np.ndarray [shape=(d, t)]
    magnitudes : np.ndarray [shape=(d,t)]
        Both arrays have the same shape as the magnitude spectrogram `S`.

        `pitches[f, t]` contains instantaneous frequency at bin
        `f`, time `t`

        `magnitudes[f, t]` contains the corresponding magnitudes.

        Both `pitches` and `magnitudes` take value 0 at bins
        of non-maximal magnitude, and at bins whose FFT frequency lies
        outside `[fmin, fmax)`.

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Computing pitches from a waveform input

    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> pitches, magnitudes = librosa.piptrack(y=y, sr=sr)

    Or from a spectrogram input

    >>> S = np.abs(librosa.stft(y))
    >>> pitches, magnitudes = librosa.piptrack(S=S, sr=sr)

    Or with an alternate reference value for pitch detection, where
    values above the mean spectral energy in each frame are counted as pitches

    >>> pitches, magnitudes = librosa.piptrack(S=S, sr=sr, threshold=1,
    ...                                        ref=np.mean)
    '''
    # Resolve the input into a spectrogram and its matching FFT size
    S, n_fft = _spectrogram(y=y, S=S, n_fft=n_fft, hop_length=hop_length,
                            win_length=win_length, window=window,
                            center=center, pad_mode=pad_mode)

    # Work on magnitudes only
    S = np.abs(S)

    # Clamp the search band to [0, Nyquist] and mark the FFT rows inside it
    fmin = np.maximum(fmin, 0)
    fmax = np.minimum(fmax, float(sr) / 2)
    fft_freqs = time_frequency.fft_frequencies(sr=sr, n_fft=n_fft)
    in_band = ((fmin <= fft_freqs) & (fft_freqs < fmax)).reshape((-1, 1))

    # Parabolic interpolation through each bin and its two neighbours,
    # computed for every interior row; peaks are picked afterwards
    lower, middle, upper = S[:-2], S[1:-1], S[2:]
    avg = 0.5 * (upper - lower)
    curvature = 2 * middle - upper - lower

    # Nudge (near-)zero denominators to 1 so the division is safe.
    # Such points are never chosen as local maxima, so the value is unused.
    shift = avg / (curvature + (np.abs(curvature) < util.tiny(curvature)))

    # Restore the first and last rows (zero-filled) so shapes match S
    row_padding = ([1, 1], [0, 0])
    avg = np.pad(avg, row_padding, mode='constant')
    shift = np.pad(shift, row_padding, mode='constant')

    dskew = 0.5 * avg * shift

    # Reference level a bin must exceed to count as a pitch candidate
    if ref is None:
        ref = np.max

    if six.callable(ref):
        ref_value = threshold * ref(S, axis=0)
    else:
        ref_value = np.abs(ref)

    # In-band local maxima (along frequency) of the thresholded spectrum
    is_peak = in_band & util.localmax(S * (S > ref_value))
    bin_idx, frame_idx = np.nonzero(is_peak)

    # Outputs stay zero everywhere except at the selected peaks
    pitches = np.zeros_like(S)
    mags = np.zeros_like(S)

    # Fractional bin position converted to Hz
    pitches[bin_idx, frame_idx] = ((bin_idx + shift[bin_idx, frame_idx])
                                   * float(sr) / n_fft)

    # Peak magnitude with the interpolation correction added
    mags[bin_idx, frame_idx] = (S[bin_idx, frame_idx]
                                + dskew[bin_idx, frame_idx])

    return pitches, mags