This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Feature extraction
==================
Spectral features
-----------------
.. autosummary::
:toctree: generated/
chroma_stft
chroma_cqt
chroma_cens
chroma_vqt
melspectrogram
mfcc
rms
spectral_centroid
spectral_bandwidth
spectral_contrast
spectral_flatness
spectral_rolloff
poly_features
tonnetz
zero_crossing_rate
Rhythm features
---------------
.. autosummary::
:toctree: generated/
tempo
tempogram
fourier_tempogram
tempogram_ratio
Feature manipulation
--------------------
.. autosummary::
:toctree: generated/
delta
stack_memory
Feature inversion
-----------------
.. autosummary::
:toctree: generated/
inverse.mel_to_stft
inverse.mel_to_audio
inverse.mfcc_to_mel
inverse.mfcc_to_audio
"""
import lazy_loader as lazy
# Lazy, stub-driven loading of the package's public names: instead of importing
# every submodule eagerly, the module-level ``__getattr__``, ``__dir__`` and
# ``__all__`` hooks are produced by ``lazy_loader.attach_stub`` from this
# module's name and file path.
# NOTE(review): attach_stub presumably reads the sibling ``__init__.pyi`` stub
# to learn which names live in which submodule -- confirm against lazy_loader docs.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)

View File

@@ -0,0 +1,31 @@
from .utils import (
delta as delta,
stack_memory as stack_memory,
)
from .spectral import (
spectral_centroid as spectral_centroid,
spectral_bandwidth as spectral_bandwidth,
spectral_contrast as spectral_contrast,
spectral_rolloff as spectral_rolloff,
spectral_flatness as spectral_flatness,
poly_features as poly_features,
rms as rms,
zero_crossing_rate as zero_crossing_rate,
chroma_stft as chroma_stft,
chroma_cqt as chroma_cqt,
chroma_cens as chroma_cens,
chroma_vqt as chroma_vqt,
melspectrogram as melspectrogram,
mfcc as mfcc,
tonnetz as tonnetz,
)
from .rhythm import (
tempogram as tempogram,
fourier_tempogram as fourier_tempogram,
tempo as tempo,
tempogram_ratio as tempogram_ratio,
)
from . import (
inverse as inverse,
)

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feature inversion"""
import warnings
import numpy as np
from ..core.fft import get_fftlib
from ..util.exceptions import ParameterError
from ..core.spectrum import griffinlim
from ..core.spectrum import db_to_power
from ..util.utils import tiny
from .. import filters
from ..util import nnls, expand_to
from numpy.typing import DTypeLike
from typing import Any, Optional
from .._typing import _WindowSpec, _PadModeSTFT
__all__ = ["mel_to_stft", "mel_to_audio", "mfcc_to_mel", "mfcc_to_audio"]
def mel_to_stft(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    power: float = 2.0,
    **kwargs: Any,
) -> np.ndarray:
    """Approximate STFT magnitude from a Mel power spectrogram.

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0
    dtype : np.dtype
        The data type of the mel basis.
        By default, the basis uses the same data type as ``M``.

    Returns
    -------
    S : np.ndarray [shape=(..., 1 + n_fft // 2, t), non-negative]
        An approximate linear magnitude spectrogram

    See Also
    --------
    librosa.feature.melspectrogram
    librosa.stft
    librosa.filters.mel
    librosa.util.nnls

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = librosa.util.abs2(librosa.stft(y))
    >>> mel_spec = librosa.feature.melspectrogram(S=S, sr=sr)
    >>> S_inv = librosa.feature.inverse.mel_to_stft(mel_spec, sr=sr)

    Compare the results visually

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max, top_db=None),
    ...                                y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Original STFT')
    >>> ax[0].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(S_inv, ref=np.max, top_db=None),
    ...                          y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Reconstructed STFT')
    >>> ax[1].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(S_inv - S),
    ...                                                  ref=S.max(), top_db=None),
    ...                          vmax=0, y_axis='log', x_axis='time', cmap='magma', ax=ax[2])
    >>> ax[2].set(title='Residual error (dB)')
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """
    # The mel basis defaults to the dtype of the input data.  ``dtype`` is
    # merged into the keyword dictionary (rather than passed as a separate
    # explicit keyword) so that a caller-supplied ``dtype`` overrides the
    # default instead of triggering a duplicate-keyword TypeError.
    mel_kwargs = {"dtype": M.dtype, **kwargs}
    mel_basis = filters.mel(sr=sr, n_fft=n_fft, n_mels=M.shape[-2], **mel_kwargs)

    # Find the non-negative least squares solution, then undo the exponent
    # that was applied to the magnitudes.  The exponentiation is done
    # in-place to avoid allocating a second full-size array.
    inverse = nnls(mel_basis, M)
    return np.power(inverse, 1.0 / power, out=inverse)
def mel_to_audio(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
    power: float = 2.0,
    n_iter: int = 32,
    length: Optional[int] = None,
    dtype: DTypeLike = np.float32,
    **kwargs: Any,
) -> np.ndarray:
    """Reconstruct audio from a mel power spectrogram via Griffin-Lim.

    The mel spectrogram is first mapped back to an approximate STFT
    magnitude, and phase is then estimated iteratively.  In short, this is
    a convenience wrapper for:

    >>> S = librosa.feature.inverse.mel_to_stft(M)
    >>> y = librosa.griffinlim(S)

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``M``

    See Also
    --------
    librosa.griffinlim
    librosa.feature.melspectrogram
    librosa.filters.mel
    librosa.feature.inverse.mel_to_stft
    """
    # Stage 1: mel bands -> linear-frequency magnitudes.  Only the filter-bank
    # options (**kwargs) are forwarded here; STFT options are used in stage 2.
    magnitude = mel_to_stft(M, sr=sr, n_fft=n_fft, power=power, **kwargs)

    # Stage 2: phase retrieval with the STFT framing described by the caller.
    griffinlim_options = dict(
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        pad_mode=pad_mode,
    )
    return griffinlim(magnitude, **griffinlim_options)
def mfcc_to_mel(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
) -> np.ndarray:
    """Recover an approximate Mel power spectrogram from MFCCs.

    The inversion runs through the following stages:

        1. If ``lifter > 0``, the cepstral liftering is undone
        2. The inverse DCT is applied to the MFCCs
        3. `librosa.db_to_power` is applied to map the dB-scaled result to a
           power spectrogram

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    Returns
    -------
    M : np.ndarray [shape=(..., n_mels, n)]
        An approximate Mel power spectrum recovered from ``mfcc``

    Raises
    ------
    ParameterError
        If ``lifter`` is not a non-negative number.

    Warns
    -----
    UserWarning
        due to critical values in lifter array that invokes underflow.

    See Also
    --------
    librosa.feature.mfcc
    librosa.feature.melspectrogram
    scipy.fft.dct
    """
    # Anything that is neither positive nor exactly zero is rejected up front.
    if not lifter > 0 and lifter != 0:
        raise ParameterError("MFCC to mel lifter must be a non-negative number.")

    if lifter > 0:
        # Coefficient numbers 1..n_mfcc, laid out along the coefficient axis
        # (second-to-last) so they broadcast against ``mfcc``.
        coef_number = np.arange(1, 1 + mfcc.shape[-2], dtype=mfcc.dtype)
        coef_number = expand_to(coef_number, ndim=mfcc.ndim, axes=-2)

        angle = np.pi * coef_number / lifter
        lifter_sine = 1 + lifter * 0.5 * np.sin(angle)

        # Weights that are numerically zero make the division below unstable.
        smallest = np.finfo(lifter_sine.dtype).eps
        if np.any(np.abs(lifter_sine) < smallest):
            warnings.warn(
                "lifter array includes critical values that may invoke underflow.",
                UserWarning,
                stacklevel=2,
            )

        # Undo the liftering; the tiny offset keeps the denominator non-zero.
        mfcc = mfcc / (lifter_sine + tiny(mfcc))

    # Inverse DCT along the coefficient axis gives the dB-scaled mel spectrum,
    # which is then mapped back to power.
    logmel = get_fftlib().idct(mfcc, axis=-2, type=dct_type, norm=norm, n=n_mels)
    return db_to_power(logmel, ref=ref)
def mfcc_to_audio(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
    **kwargs: Any,
) -> np.ndarray:
    """Convert Mel-frequency cepstral coefficients to a time-domain audio signal

    This function is primarily a convenience wrapper for the following steps:

        1. Convert mfcc to Mel power spectrum (`mfcc_to_mel`)
        2. Convert Mel power spectrum to time-domain audio (`mel_to_audio`)

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    **kwargs : additional keyword arguments to pass through to `mel_to_audio`
        The mel spectrogram itself is supplied internally and must not be
        passed here.
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        A time-domain signal reconstructed from `mfcc`

    See Also
    --------
    mfcc_to_mel
    mel_to_audio
    librosa.feature.mfcc
    librosa.griffinlim
    scipy.fft.dct
    """
    # Cepstral-domain options are consumed by the first stage ...
    cepstral_options = dict(
        n_mels=n_mels, dct_type=dct_type, norm=norm, ref=ref, lifter=lifter
    )
    recovered_mel = mfcc_to_mel(mfcc, **cepstral_options)

    # ... and everything else is forwarded untouched to the second stage.
    return mel_to_audio(recovered_mel, **kwargs)

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rhythmic feature extraction"""
import numpy as np
import scipy
from .. import util
from .._cache import cache
from ..core.audio import autocorrelate
from ..core.spectrum import stft
from ..core.convert import tempo_frequencies, time_to_frames
from ..core.harmonic import f0_harmonics
from ..util.exceptions import ParameterError
from ..filters import get_window
from typing import Optional, Callable, Any
from .._typing import _WindowSpec
__all__ = ["tempogram", "fourier_tempogram", "tempo", "tempogram_ratio"]
# -- Rhythmic features -- #
def tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Compute the tempogram: local autocorrelation of the onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., n) or (..., m, n)] or None
        Optional pre-computed onset strength envelope as provided by
        `librosa.onset.onset_strength`.
        If multi-dimensional, tempograms are computed independently for each
        band (first dimension).
    hop_length : int > 0
        number of audio samples between successive onset measurements
    win_length : int > 0
        length of the onset autocorrelation window (in frames/onset measurements)
        The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
    center : bool
        If `True`, onset autocorrelation windows are centered.
        If `False`, windows are left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.
    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode.  Set to `None` to disable normalization.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length, n)]
        Localized autocorrelation of the onset strength envelope.
        If given multi-band input (``onset_envelope.shape==(m,n)``) then
        ``tempogram[i]`` is the tempogram of ``onset_envelope[i]``.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided

        if ``win_length < 1``

    See Also
    --------
    fourier_tempogram
    librosa.onset.onset_strength
    librosa.util.normalize
    librosa.stft

    Examples
    --------
    >>> # Compute local onset autocorrelation
    >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
    >>> hop_length = 512
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    >>> tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
    ...                                       hop_length=hop_length)
    >>> # Compute global onset autocorrelation
    >>> ac_global = librosa.autocorrelate(oenv, max_size=tempogram.shape[0])
    >>> ac_global = librosa.util.normalize(ac_global)
    >>> # Estimate the global tempo for display purposes
    >>> tempo = librosa.feature.tempo(onset_envelope=oenv, sr=sr,
    ...                               hop_length=hop_length)[0]

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=4, figsize=(10, 10))
    >>> times = librosa.times_like(oenv, sr=sr, hop_length=hop_length)
    >>> ax[0].plot(times, oenv, label='Onset strength')
    >>> ax[0].label_outer()
    >>> ax[0].legend(frameon=True)
    >>> librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='tempo', cmap='magma',
    ...                          ax=ax[1])
    >>> ax[1].axhline(tempo, color='w', linestyle='--', alpha=1,
    ...               label='Estimated tempo={:g}'.format(tempo))
    >>> ax[1].legend(loc='upper right')
    >>> ax[1].set(title='Tempogram')
    >>> x = np.linspace(0, tempogram.shape[0] * float(hop_length) / sr,
    ...                 num=tempogram.shape[0])
    >>> ax[2].plot(x, np.mean(tempogram, axis=1), label='Mean local autocorrelation')
    >>> ax[2].plot(x, ac_global, '--', alpha=0.75, label='Global autocorrelation')
    >>> ax[2].set(xlabel='Lag (seconds)')
    >>> ax[2].legend(frameon=True)
    >>> freqs = librosa.tempo_frequencies(tempogram.shape[0], hop_length=hop_length, sr=sr)
    >>> ax[3].semilogx(freqs[1:], np.mean(tempogram[1:], axis=1),
    ...                label='Mean local autocorrelation', base=2)
    >>> ax[3].semilogx(freqs[1:], ac_global[1:], '--', alpha=0.75,
    ...                label='Global autocorrelation', base=2)
    >>> ax[3].axvline(tempo, color='black', linestyle='--', alpha=.8,
    ...               label='Estimated tempo={:g}'.format(tempo))
    >>> ax[3].legend(frameon=True)
    >>> ax[3].set(xlabel='BPM')
    >>> ax[3].grid(True)
    """
    # Imported at call time rather than at module import.
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    taper = get_window(window, win_length, fftbins=True)

    # Fall back to computing the onset envelope from audio when none is given.
    if onset_envelope is None and y is None:
        raise ParameterError("Either y or onset_envelope must be provided")
    if onset_envelope is None:
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Remember the un-padded length so the output can be trimmed back to it.
    n_frames = onset_envelope.shape[-1]

    if center:
        # Pad only the trailing (time) axis, half a window on each side,
        # ramping linearly down to zero at both ends.
        half = int(win_length // 2)
        pad_widths = [(0, 0)] * (onset_envelope.ndim - 1) + [(half, half)]
        onset_envelope = np.pad(
            onset_envelope, pad_widths, mode="linear_ramp", end_values=[0, 0]
        )

    # One frame of length win_length per onset measurement.
    frames = util.frame(onset_envelope, frame_length=win_length, hop_length=1)

    if center:
        # Discard the surplus frames introduced by padding.
        frames = frames[..., :n_frames]

    # Align the window with the within-frame (second-to-last) axis.
    taper = util.expand_to(taper, ndim=frames.ndim, axes=-2)

    # Window each frame, autocorrelate it, then normalize along the lag axis.
    local_autocorr = autocorrelate(frames * taper, axis=-2)
    return util.normalize(local_autocorr, norm=norm, axis=-2)
def fourier_tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
) -> np.ndarray:
    """Compute the Fourier tempogram: the short-time Fourier transform of the
    onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., n)] or None
        Optional pre-computed onset strength envelope as provided by
        ``librosa.onset.onset_strength``.
        Multi-channel is supported.
    hop_length : int > 0
        number of audio samples between successive onset measurements
    win_length : int > 0
        length of the onset window (in frames/onset measurements)
        The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
    center : bool
        If `True`, onset windows are centered.
        If `False`, windows are left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length // 2 + 1, n)]
        Complex short-time Fourier transform of the onset envelope.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided

        if ``win_length < 1``

    See Also
    --------
    tempogram
    librosa.onset.onset_strength
    librosa.util.normalize
    librosa.stft

    Examples
    --------
    >>> # Compute local onset autocorrelation
    >>> y, sr = librosa.load(librosa.ex('nutcracker'))
    >>> hop_length = 512
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    >>> tempogram = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr,
    ...                                               hop_length=hop_length)
    >>> # Compute the auto-correlation tempogram, unnormalized to make comparison easier
    >>> ac_tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
    ...                                          hop_length=hop_length, norm=None)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> ax[0].plot(librosa.times_like(oenv), oenv, label='Onset strength')
    >>> ax[0].legend(frameon=True)
    >>> ax[0].label_outer()
    >>> librosa.display.specshow(np.abs(tempogram), sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='fourier_tempo', cmap='magma',
    ...                          ax=ax[1])
    >>> ax[1].set(title='Fourier tempogram')
    >>> ax[1].label_outer()
    >>> librosa.display.specshow(ac_tempogram, sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='tempo', cmap='magma',
    ...                          ax=ax[2])
    >>> ax[2].set(title='Autocorrelation tempogram')
    """
    # Imported at call time rather than at module import.
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    # Fall back to computing the onset envelope from audio when none is given.
    if onset_envelope is None and y is None:
        raise ParameterError("Either y or onset_envelope must be provided")
    if onset_envelope is None:
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # The Fourier tempogram is an STFT of the envelope itself: one transform
    # per onset measurement (hop of 1), with win_length as the FFT size.
    stft_options = dict(n_fft=win_length, hop_length=1, center=center, window=window)
    return stft(onset_envelope, **stft_options)
@cache(level=30)
def tempo(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    hop_length: int = 512,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    ac_size: float = 8.0,
    max_tempo: Optional[float] = 320.0,
    aggregate: Optional[Callable[..., Any]] = np.mean,
    prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
    """Estimate the tempo (beats per minute)

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of the time series
    onset_envelope : np.ndarray [shape=(..., n)]
        pre-computed onset strength envelope
    tg : np.ndarray
        pre-computed tempogram.  If provided, then `y` and
        `onset_envelope` are ignored, and `win_length` is
        inferred from the shape of the tempogram.
    hop_length : int > 0 [scalar]
        hop length of the time series
    start_bpm : float [scalar]
        initial guess of the BPM
    std_bpm : float > 0 [scalar]
        standard deviation of tempo distribution
    ac_size : float > 0 [scalar]
        length (in seconds) of the auto-correlation window
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold
    aggregate : callable [optional]
        Aggregation function for estimating global tempo.
        If `None`, then tempo is estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a pseudo-log-normal prior is used.
        If given, ``start_bpm`` and ``std_bpm`` will be ignored.

    Returns
    -------
    tempo : np.ndarray
        estimated tempo (beats per minute).
        If input is multi-channel, one tempo estimate per channel is provided.

    Raises
    ------
    ParameterError
        if ``start_bpm`` is not strictly positive

    See Also
    --------
    librosa.onset.onset_strength
    librosa.feature.tempogram

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> # Estimate a static tempo
    >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr)
    >>> tempo
    array([143.555])

    >>> # Or a static tempo with a uniform prior instead
    >>> import scipy.stats
    >>> prior = scipy.stats.uniform(30, 300)  # uniform over 30-300 BPM
    >>> utempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr, prior=prior)
    >>> utempo
    array([161.499])

    >>> # Or a dynamic tempo
    >>> dtempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
    ...                                aggregate=None)
    >>> dtempo
    array([ 89.103,  89.103,  89.103, ..., 123.047, 123.047, 123.047])

    >>> # Dynamic tempo with a proper log-normal prior
    >>> prior_lognorm = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
    >>> dtempo_lognorm = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
    ...                                        aggregate=None,
    ...                                        prior=prior_lognorm)
    >>> dtempo_lognorm
    array([ 89.103,  89.103,  89.103, ..., 123.047, 123.047, 123.047])

    Plot the estimated tempo against the onset autocorrelation

    >>> import matplotlib.pyplot as plt
    >>> # Convert to scalar
    >>> tempo = tempo.item()
    >>> utempo = utempo.item()
    >>> # Compute 2-second windowed autocorrelation
    >>> hop_length = 512
    >>> ac = librosa.autocorrelate(onset_env, max_size=2 * sr // hop_length)
    >>> freqs = librosa.tempo_frequencies(len(ac), sr=sr,
    ...                                   hop_length=hop_length)
    >>> # Plot on a BPM axis.  We skip the first (0-lag) bin.
    >>> fig, ax = plt.subplots()
    >>> ax.semilogx(freqs[1:], librosa.util.normalize(ac)[1:],
    ...             label='Onset autocorrelation', base=2)
    >>> ax.axvline(tempo, 0, 1, alpha=0.75, linestyle='--', color='r',
    ...            label='Tempo (default prior): {:.2f} BPM'.format(tempo))
    >>> ax.axvline(utempo, 0, 1, alpha=0.75, linestyle=':', color='g',
    ...            label='Tempo (uniform prior): {:.2f} BPM'.format(utempo))
    >>> ax.set(xlabel='Tempo (BPM)', title='Static tempo estimation')
    >>> ax.grid(True)
    >>> ax.legend()

    Plot dynamic tempo estimates over a tempogram

    >>> fig, ax = plt.subplots()
    >>> tg = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr,
    ...                                hop_length=hop_length)
    >>> librosa.display.specshow(tg, x_axis='time', y_axis='tempo', cmap='magma', ax=ax)
    >>> ax.plot(librosa.times_like(dtempo), dtempo,
    ...         color='c', linewidth=1.5, label='Tempo estimate (default prior)')
    >>> ax.plot(librosa.times_like(dtempo_lognorm), dtempo_lognorm,
    ...         color='c', linewidth=1.5, linestyle='--',
    ...         label='Tempo estimate (lognorm prior)')
    >>> ax.set(title='Dynamic tempo estimation')
    >>> ax.legend()
    """
    if start_bpm <= 0:
        raise ParameterError("start_bpm must be strictly positive")

    if tg is not None:
        # A tempogram was supplied: its lag axis dictates the window length.
        win_length = tg.shape[-2]
    else:
        # Otherwise translate the window duration (seconds) into frames and
        # build the tempogram from the audio or onset envelope.
        win_length = time_to_frames(ac_size, sr=sr, hop_length=hop_length).item()
        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
        )

    # Collapse the time axis for a global estimate; the axis is retained
    # (keepdims) so the per-frame logic below applies unchanged.
    if aggregate is not None:
        tg = aggregate(tg, axis=-1, keepdims=True)

    assert tg is not None

    # BPM value associated with each lag bin of the tempogram.
    bpm_grid = tempo_frequencies(win_length, hop_length=hop_length, sr=sr)

    # Log-weight for each candidate tempo: either the built-in
    # pseudo-log-normal centered on start_bpm, or the caller's prior.
    if prior is not None:
        log_weight = prior.logpdf(bpm_grid)
    else:
        octaves_from_start = (np.log2(bpm_grid) - np.log2(start_bpm)) / std_bpm
        log_weight = -0.5 * octaves_from_start ** 2

    if max_tempo is not None:
        # argmax over the boolean mask is the first bin whose BPM is below
        # max_tempo; every bin before it is ruled out.
        first_allowed = int(np.argmax(bpm_grid < max_tempo))
        log_weight[:first_allowed] = -np.inf

    # Lay the weights out along the lag axis so they broadcast against tg.
    log_weight = util.expand_to(log_weight, ndim=tg.ndim, axes=-2)

    # Pick the lag with the largest prior-weighted score.
    # log1p is used here for numerical stability.
    score = np.log1p(1e6 * tg) + log_weight
    best_lag = np.argmax(score, axis=-2)
    return np.take(bpm_grid, best_lag)
@cache(level=40)
def tempogram_ratio(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    bpm: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    max_tempo: Optional[float] = 320.0,
    freqs: Optional[np.ndarray] = None,
    factors: Optional[np.ndarray] = None,
    aggregate: Optional[Callable[..., Any]] = None,
    prior: Optional[scipy.stats.rv_continuous] = None,
    center: bool = True,
    window: _WindowSpec = "hann",
    kind: str = "linear",
    fill_value: float = 0,
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Tempogram ratio features, also known as spectral rhythm patterns. [1]_

    This function summarizes the energy at metrically important multiples
    of the tempo.  For example, if the tempo corresponds to the quarter-note
    period, the tempogram ratio will measure the energy at the eighth note,
    sixteenth note, half note, whole note, etc. periods, as well as dotted
    and triplet ratios.

    By default, the multiplicative factors used here are as specified by
    [2]_.  If the estimated tempo corresponds to a quarter note, these factors
    will measure relative energy at the following metrical subdivisions:

    +-------+--------+------------------+
    | Index | Factor | Description      |
    +=======+========+==================+
    | 0     | 4      | Sixteenth note   |
    +-------+--------+------------------+
    | 1     | 8/3    | Dotted sixteenth |
    +-------+--------+------------------+
    | 2     | 3      | Eighth triplet   |
    +-------+--------+------------------+
    | 3     | 2      | Eighth note      |
    +-------+--------+------------------+
    | 4     | 4/3    | Dotted eighth    |
    +-------+--------+------------------+
    | 5     | 3/2    | Quarter triplet  |
    +-------+--------+------------------+
    | 6     | 1      | Quarter note     |
    +-------+--------+------------------+
    | 7     | 2/3    | Dotted quarter   |
    +-------+--------+------------------+
    | 8     | 3/4    | Half triplet     |
    +-------+--------+------------------+
    | 9     | 1/2    | Half note        |
    +-------+--------+------------------+
    | 10    | 1/3    | Dotted half note |
    +-------+--------+------------------+
    | 11    | 3/8    | Whole triplet    |
    +-------+--------+------------------+
    | 12    | 1/4    | Whole note       |
    +-------+--------+------------------+

    .. [1] Peeters, Geoffroy.
        "Rhythm Classification Using Spectral Rhythm Patterns."
        In ISMIR, pp. 644-647. 2005.

    .. [2] Prockup, Matthew, Andreas F. Ehmann, Fabien Gouyon, Erik M. Schmidt, and Youngmoo E. Kim.
        "Modeling musical rhythm at scale with the music genome project."
        In 2015 IEEE workshop on applications of signal processing to audio and acoustics (WASPAA), pp. 1-5. IEEE, 2015.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series
    sr : number > 0 [scalar]
        sampling rate of the time series
    onset_envelope : np.ndarray [shape=(..., n)]
        pre-computed onset strength envelope
    tg : np.ndarray
        pre-computed tempogram.  If provided, then `y` and
        `onset_envelope` are ignored, and `win_length` is
        inferred from the shape of the tempogram.
    bpm : np.ndarray
        pre-computed tempo estimate.  This must be a per-frame
        estimate, and have dimension compatible with `tg`.
    hop_length : int > 0 [scalar]
        hop length of the time series
    win_length : int > 0 [scalar]
        window length of the autocorrelation window for tempogram
        calculation
    start_bpm : float [scalar]
        initial guess of the BPM if `bpm` is not provided
    std_bpm : float > 0 [scalar]
        standard deviation of tempo distribution
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold
    freqs : np.ndarray
        Frequencies (in BPM) of the tempogram axis.
        If not provided, they are derived from ``sr``, ``hop_length`` and the
        size of the tempogram's lag axis (``tg.shape[-2]``).
    factors : np.ndarray
        Multiples of the fundamental tempo (bpm) to estimate.
        If not provided, the factors are as specified above.
    aggregate : callable [optional]
        Aggregation function for estimating global tempogram ratio.
        If `None`, then ratios are estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a pseudo-log-normal prior is used.
        If given, ``start_bpm`` and ``std_bpm`` will be ignored.
    center : bool
        If `True`, onset windows are centered.
        If `False`, windows are left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.
    kind : str
        Interpolation mode for measuring tempogram ratios
    fill_value : float
        The value to fill when extrapolating beyond the observed
        frequency range.
    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode.  Set to `None` to disable normalization.

    Returns
    -------
    tgr : np.ndarray
        The tempogram ratio for the specified factors.
        If `aggregate` is provided, the trailing time axis
        will be removed.
        If `aggregate` is not provided (default), ratios
        will be estimated for each frame.

    See Also
    --------
    tempogram
    tempo
    librosa.f0_harmonics
    librosa.tempo_frequencies

    Examples
    --------
    Compute tempogram ratio features using the default factors
    for a waltz (3/4 time)

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('sweetwaltz'))
    >>> tempogram = librosa.feature.tempogram(y=y, sr=sr)
    >>> tgr = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> librosa.display.specshow(tempogram, x_axis='time', y_axis='tempo',
    ...                          ax=ax[0])
    >>> librosa.display.specshow(tgr, x_axis='time', ax=ax[1])
    >>> ax[0].label_outer()
    >>> ax[0].set(title="Tempogram")
    >>> ax[1].set(title="Tempogram ratio")
    """
    # Get a tempogram if one was not supplied
    if tg is None:
        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
            center=center,
            window=window,
            norm=norm,
        )

    if freqs is None:
        # The lag axis of a tempogram is the second-to-last axis (shape
        # (..., win_length, n)), so the BPM grid must be sized from
        # shape[-2].  Using len(tg) would pick up the leading axis instead,
        # which is only the lag axis for single-channel (2-D) tempograms.
        freqs = tempo_frequencies(sr=sr, n_bins=tg.shape[-2], hop_length=hop_length)

    # Estimate tempo per-frame, no aggregation yet
    if bpm is None:
        bpm = tempo(
            sr=sr,
            tg=tg,
            hop_length=hop_length,
            start_bpm=start_bpm,
            std_bpm=std_bpm,
            max_tempo=max_tempo,
            aggregate=None,
            prior=prior,
        )

    if factors is None:
        # metric multiples from Prockup'15
        factors = np.array(
            [4, 8 / 3, 3, 2, 4 / 3, 3 / 2, 1, 2 / 3, 3 / 4, 1 / 2, 1 / 3, 3 / 8, 1 / 4]
        )

    # Measure tempogram energy at each requested multiple of the tempo
    tgr = f0_harmonics(
        tg, freqs=freqs, f0=bpm, harmonics=factors, kind=kind, fill_value=fill_value
    )

    # Optionally summarize over the trailing (time) axis
    if aggregate is not None:
        return aggregate(tgr, axis=-1)  # type: ignore

    return tgr

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,310 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feature manipulation utilities"""
import numpy as np
import scipy.signal
from numba import jit
from .._cache import cache
from ..util.exceptions import ParameterError
from typing import Any
__all__ = ["delta", "stack_memory"]
@cache(level=40)
def delta(
    data: np.ndarray,
    *,
    width: int = 9,
    order: int = 1,
    axis: int = -1,
    mode: str = "interp",
    **kwargs: Any,
) -> np.ndarray:
    r"""Compute delta features: local estimate of the derivative
    of the input data along the selected axis.

    Delta features are computed using Savitzky-Golay filtering.

    Parameters
    ----------
    data : np.ndarray
        the input data matrix (eg, spectrogram).
        Scalar inputs are promoted to 1-dimensional arrays.
    width : int, positive, odd [scalar]
        Number of frames over which to compute the delta features.
        Must be an odd integer >= 3.

        If ``mode='interp'``, then ``width`` cannot exceed
        ``data.shape[axis]``.
    order : int > 0 [scalar]
        the order of the difference operator.
        1 for first derivative, 2 for second, etc.
    axis : int [scalar]
        the axis along which to compute deltas.
        Default is -1 (columns).
    mode : str, {'interp', 'nearest', 'mirror', 'constant', 'wrap'}
        Padding mode for estimating differences at the boundaries.
    **kwargs : additional keyword arguments
        See `scipy.signal.savgol_filter`.

        Any ``deriv`` entry is discarded, because the derivative order
        is always taken from ``order``.
        If ``polyorder`` is not given, it defaults to ``order``.

    Returns
    -------
    delta_data : np.ndarray [shape=(..., t)]
        delta matrix of ``data`` at specified order

    Raises
    ------
    ParameterError
        If ``mode='interp'`` and ``width > data.shape[axis]``.

        If ``width`` is smaller than 3 or is not odd.

        If ``order`` is not a positive integer.

    Notes
    -----
    This function caches at level 40.

    See Also
    --------
    scipy.signal.savgol_filter

    Examples
    --------
    Compute MFCC deltas, delta-deltas

    >>> y, sr = librosa.load(librosa.ex('libri1'), duration=5)
    >>> mfcc = librosa.feature.mfcc(y=y, sr=sr)
    >>> mfcc_delta = librosa.feature.delta(mfcc)
    >>> mfcc_delta
    array([[-5.713e+02, -5.697e+02, ..., -1.522e+02, -1.224e+02],
           [ 1.104e+01,  1.330e+01, ...,  2.089e+02,  1.698e+02],
           ...,
           [ 2.829e+00,  1.933e+00, ..., -3.149e+00,  2.294e-01],
           [ 2.890e+00,  2.187e+00, ...,  6.959e+00, -1.039e+00]],
          dtype=float32)

    >>> mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    >>> mfcc_delta2
    array([[-1.195, -1.195, ..., -4.328, -4.328],
           [-1.566, -1.566, ..., -9.949, -9.949],
           ...,
           [ 0.707,  0.707, ...,  2.287,  2.287],
           [ 0.655,  0.655, ..., -1.719, -1.719]], dtype=float32)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> img1 = librosa.display.specshow(mfcc, ax=ax[0], x_axis='time')
    >>> ax[0].set(title='MFCC')
    >>> ax[0].label_outer()
    >>> img2 = librosa.display.specshow(mfcc_delta, ax=ax[1], x_axis='time')
    >>> ax[1].set(title=r'MFCC-$\Delta$')
    >>> ax[1].label_outer()
    >>> img3 = librosa.display.specshow(mfcc_delta2, ax=ax[2], x_axis='time')
    >>> ax[2].set(title=r'MFCC-$\Delta^2$')
    >>> fig.colorbar(img1, ax=[ax[0]])
    >>> fig.colorbar(img2, ax=[ax[1]])
    >>> fig.colorbar(img3, ax=[ax[2]])
    """
    data = np.atleast_1d(data)

    # Interpolated boundaries fit a polynomial to the first/last `width`
    # samples, so the window cannot be longer than the data itself.
    if mode == "interp" and width > data.shape[axis]:
        raise ParameterError(
            f"when mode='interp', width={width} "
            f"cannot exceed data.shape[axis]={data.shape[axis]}"
        )

    if width < 3 or np.mod(width, 2) != 1:
        raise ParameterError("width must be an odd integer >= 3")

    # Check the type before comparing: a non-numeric `order` would otherwise
    # fail on `order <= 0` with a TypeError instead of a ParameterError.
    if not isinstance(order, (int, np.integer)) or order <= 0:
        raise ParameterError("order must be a positive integer")

    # The derivative order is controlled exclusively by `order`; drop any
    # conflicting user-supplied value rather than passing it twice.
    kwargs.pop("deriv", None)
    kwargs.setdefault("polyorder", order)

    result: np.ndarray = scipy.signal.savgol_filter(
        data, width, deriv=order, axis=axis, mode=mode, **kwargs
    )
    return result
@cache(level=40)
def stack_memory(
    data: np.ndarray, *, n_steps: int = 2, delay: int = 1, **kwargs: Any
) -> np.ndarray:
    """Short-term history embedding: vertically concatenate a data
    vector or matrix with delayed copies of itself.

    Each column ``data[..., i]`` is mapped to::

        data[..., i] ->  [data[..., i],
                          data[..., i - delay],
                          ...
                          data[..., i - (n_steps-1)*delay]]

    For columns ``i < (n_steps - 1) * delay``, the data will be padded.
    By default, the data is padded with zeros, but this behavior can be
    overridden by supplying additional keyword arguments which are passed
    to `np.pad()`.

    Parameters
    ----------
    data : np.ndarray [shape=(..., d, t)]
        Input data matrix.  If ``data`` is a vector (``data.ndim == 1``),
        it will be interpreted as a row matrix and reshaped to ``(1, t)``.
    n_steps : int > 0 [scalar]
        embedding dimension, the number of steps back in time to stack
    delay : int != 0 [scalar]
        the number of columns to step.

        Positive values embed from the past (previous columns).

        Negative values embed from the future (subsequent columns).
    **kwargs : additional keyword arguments
        Additional arguments to pass to `numpy.pad`.
        ``mode`` defaults to ``'constant'``, and in that mode
        ``constant_values`` defaults to zero.

    Returns
    -------
    data_history : np.ndarray [shape=(..., n_steps * d, t)]
        data augmented with lagged copies of itself: ``n_steps`` blocks
        of ``d`` rows each, the first block being the un-delayed input.

    Raises
    ------
    ParameterError
        If ``n_steps < 1``, if ``delay == 0``, or if ``data`` has no columns.

    Notes
    -----
    This function caches at level 40.

    Examples
    --------
    Keep two steps (current and previous)

    >>> data = np.arange(-3, 3)
    >>> librosa.feature.stack_memory(data)
    array([[-3, -2, -1,  0,  1,  2],
           [ 0, -3, -2, -1,  0,  1]])

    Or three steps

    >>> librosa.feature.stack_memory(data, n_steps=3)
    array([[-3, -2, -1,  0,  1,  2],
           [ 0, -3, -2, -1,  0,  1],
           [ 0,  0, -3, -2, -1,  0]])

    Use reflection padding instead of zero-padding

    >>> librosa.feature.stack_memory(data, n_steps=3, mode='reflect')
    array([[-3, -2, -1,  0,  1,  2],
           [-2, -3, -2, -1,  0,  1],
           [-1, -2, -3, -2, -1,  0]])

    Or pad with edge-values, and delay by 2

    >>> librosa.feature.stack_memory(data, n_steps=3, delay=2, mode='edge')
    array([[-3, -2, -1,  0,  1,  2],
           [-3, -3, -3, -2, -1,  0],
           [-3, -3, -3, -3, -3, -2]])

    Stack time-lagged beat-synchronous chroma with edge padding

    >>> y, sr = librosa.load(librosa.ex('sweetwaltz'), duration=10)
    >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=512)
    >>> beats = librosa.util.fix_frames(beats, x_min=0)
    >>> chroma_sync = librosa.util.sync(chroma, beats)
    >>> chroma_lag = librosa.feature.stack_memory(chroma_sync, n_steps=3,
    ...                                           mode='edge')

    Plot the result

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=512)
    >>> librosa.display.specshow(chroma_lag, y_axis='chroma', x_axis='time',
    ...                          x_coords=beat_times, ax=ax)
    >>> ax.text(1.0, 1/6, "Lag=0", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.text(1.0, 3/6, "Lag=1", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.text(1.0, 5/6, "Lag=2", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.set(title='Time-lagged chroma', ylabel="")
    """
    if n_steps < 1:
        raise ParameterError("n_steps must be a positive integer")

    if delay == 0:
        raise ParameterError("delay must be a non-zero integer")

    data = np.atleast_2d(data)
    n_frames = data.shape[-1]

    if n_frames < 1:
        raise ParameterError(
            "Cannot stack memory when input data has "
            f"no columns. Given data.shape={data.shape}"
        )

    # Zero-padding unless the caller asks for something else
    kwargs.setdefault("mode", "constant")
    if kwargs["mode"] == "constant":
        kwargs.setdefault("constant_values", [0])

    # Only the time axis is padded: at the front when looking into the
    # past (delay > 0), at the back when looking into the future.
    looks_back = delay > 0
    extent = int((n_steps - 1) * (delay if looks_back else -delay))
    pad_widths = [(0, 0)] * (data.ndim - 1)
    pad_widths.append((extent, 0) if looks_back else (0, extent))

    data = np.pad(data, pad_widths, **kwargs)

    # The output keeps the leading axes, multiplies the feature axis by
    # n_steps, and restores the original (unpadded) number of frames.
    out_shape = tuple(data.shape[:-2]) + (data.shape[-2] * n_steps, n_frames)

    # Allocate with the same layout and dtype as the (padded) input
    history = np.empty_like(data, shape=out_shape)

    # Fill each block of rows with the appropriately shifted window
    __stack(history, data, n_steps, delay)

    return history
@jit(nopython=True, cache=True)
def __stack(history, data, n_steps, delay):
    """Memory-stacking helper function.

    Copies ``n_steps`` shifted windows of ``data`` into consecutive
    blocks of rows of ``history``.

    Parameters
    ----------
    history : output array, indexed as ``(..., n_steps * d, t)``
    data : pre-padded input array, indexed as ``(..., d, t + padding)``
    n_steps : int > 0, the number of steps to stack
    delay : int != 0, the amount of delay between steps

    Returns
    -------
    None
        Output is stored directly in the history array
    """
    # Number of rows occupied by each copy of the data
    rows = data.shape[-2]

    # Number of frames in every output block
    frames = history.shape[-1]

    if delay > 0:
        # Padding sits at the front: block 0 (no lag) starts furthest
        # into the padded array, and each later block starts `delay`
        # frames earlier.
        for step in range(n_steps):
            start = (n_steps - 1 - step) * delay
            top = step * rows
            history[..., top : top + rows, :] = data[..., start : start + frames]
    else:
        # Padding sits at the back, so windows are addressed from the end.
        # The final block would need the slice `-frames:0`, which is empty,
        # so it is written explicitly with an open-ended slice.
        history[..., -rows:, :] = data[..., -frames:]

        for step in range(n_steps - 1):
            # `stop` is negative here because delay < 0
            stop = (n_steps - 1 - step) * delay
            top = step * rows
            history[..., top : top + rows, :] = data[..., stop - frames : stop]