Videre
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Feature extraction
|
||||
==================
|
||||
|
||||
Spectral features
|
||||
-----------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chroma_stft
|
||||
chroma_cqt
|
||||
chroma_cens
|
||||
chroma_vqt
|
||||
melspectrogram
|
||||
mfcc
|
||||
rms
|
||||
spectral_centroid
|
||||
spectral_bandwidth
|
||||
spectral_contrast
|
||||
spectral_flatness
|
||||
spectral_rolloff
|
||||
poly_features
|
||||
tonnetz
|
||||
zero_crossing_rate
|
||||
|
||||
Rhythm features
|
||||
---------------
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
tempo
|
||||
tempogram
|
||||
fourier_tempogram
|
||||
tempogram_ratio
|
||||
|
||||
Feature manipulation
|
||||
--------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
delta
|
||||
stack_memory
|
||||
|
||||
|
||||
Feature inversion
|
||||
-----------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
inverse.mel_to_stft
|
||||
inverse.mel_to_audio
|
||||
inverse.mfcc_to_mel
|
||||
inverse.mfcc_to_audio
|
||||
"""
|
||||
|
||||
import lazy_loader as lazy
|
||||
|
||||
# Install the module-level ``__getattr__`` / ``__dir__`` / ``__all__`` hooks
# returned by ``lazy.attach_stub`` so that the names of this package are
# resolved lazily on first access rather than imported eagerly.
# NOTE(review): per the lazy_loader documentation, ``attach_stub`` reads the
# ``.pyi`` stub that sits next to ``__file__`` to decide which names to
# expose -- keep that stub in sync with the docstring listing above.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)
|
||||
@@ -0,0 +1,31 @@
|
||||
from .utils import (
|
||||
delta as delta,
|
||||
stack_memory as stack_memory,
|
||||
)
|
||||
from .spectral import (
|
||||
spectral_centroid as spectral_centroid,
|
||||
spectral_bandwidth as spectral_bandwidth,
|
||||
spectral_contrast as spectral_contrast,
|
||||
spectral_rolloff as spectral_rolloff,
|
||||
spectral_flatness as spectral_flatness,
|
||||
poly_features as poly_features,
|
||||
rms as rms,
|
||||
zero_crossing_rate as zero_crossing_rate,
|
||||
chroma_stft as chroma_stft,
|
||||
chroma_cqt as chroma_cqt,
|
||||
chroma_cens as chroma_cens,
|
||||
chroma_vqt as chroma_vqt,
|
||||
melspectrogram as melspectrogram,
|
||||
mfcc as mfcc,
|
||||
tonnetz as tonnetz,
|
||||
)
|
||||
from .rhythm import (
|
||||
tempogram as tempogram,
|
||||
fourier_tempogram as fourier_tempogram,
|
||||
tempo as tempo,
|
||||
tempogram_ratio as tempogram_ratio,
|
||||
)
|
||||
|
||||
from . import (
|
||||
inverse as inverse,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,373 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Feature inversion"""
|
||||
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from ..core.fft import get_fftlib
|
||||
from ..util.exceptions import ParameterError
|
||||
from ..core.spectrum import griffinlim
|
||||
from ..core.spectrum import db_to_power
|
||||
from ..util.utils import tiny
|
||||
from .. import filters
|
||||
from ..util import nnls, expand_to
|
||||
from numpy.typing import DTypeLike
|
||||
from typing import Any, Optional
|
||||
from .._typing import _WindowSpec, _PadModeSTFT
|
||||
|
||||
__all__ = ["mel_to_stft", "mel_to_audio", "mfcc_to_mel", "mfcc_to_audio"]
|
||||
|
||||
|
||||
def mel_to_stft(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    power: float = 2.0,
    **kwargs: Any,
) -> np.ndarray:
    """Approximate STFT magnitude from a Mel power spectrogram.

    A Mel filter bank is constructed for the given ``sr`` / ``n_fft``, the
    non-negative least squares problem ``mel_basis @ S**power ~= M`` is
    solved, and the inverse exponent ``1 / power`` is applied to the solution.

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0
    dtype : np.dtype
        The data type of the Mel basis.
        By default, the dtype of ``M`` is used so that the basis matches
        the input data.

    Returns
    -------
    S : np.ndarray [shape=(..., 1 + n_fft // 2, t), non-negative]
        An approximate linear magnitude spectrogram.
        The frequency axis has one bin per column of the Mel filter bank.

    See Also
    --------
    librosa.feature.melspectrogram
    librosa.stft
    librosa.filters.mel
    librosa.util.nnls

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = librosa.util.abs2(librosa.stft(y))
    >>> mel_spec = librosa.feature.melspectrogram(S=S, sr=sr)
    >>> S_inv = librosa.feature.inverse.mel_to_stft(mel_spec, sr=sr)

    Compare the results visually

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max, top_db=None),
    ...                                y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Original STFT')
    >>> ax[0].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(S_inv, ref=np.max, top_db=None),
    ...                          y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Reconstructed STFT')
    >>> ax[1].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(S_inv - S),
    ...                                                  ref=S.max(), top_db=None),
    ...                          vmax=0, y_axis='log', x_axis='time', cmap='magma', ax=ax[2])
    >>> ax[2].set(title='Residual error (dB)')
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """
    # Match the basis dtype to the input data unless the caller explicitly
    # asked for another one.  Passing ``dtype=M.dtype`` alongside ``**kwargs``
    # would raise a TypeError whenever ``dtype`` is supplied by the caller.
    kwargs.setdefault("dtype", M.dtype)

    mel_basis = filters.mel(sr=sr, n_fft=n_fft, n_mels=M.shape[-2], **kwargs)

    # Find the non-negative least squares solution, then undo the exponent.
    # The exponentiation is done in-place to avoid an extra allocation.
    inverse = nnls(mel_basis, M)
    return np.power(inverse, 1.0 / power, out=inverse)
|
||||
|
||||
|
||||
def mel_to_audio(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
    power: float = 2.0,
    n_iter: int = 32,
    length: Optional[int] = None,
    dtype: DTypeLike = np.float32,
    **kwargs: Any,
) -> np.ndarray:
    """Invert a mel power spectrogram to audio using Griffin-Lim.

    This is primarily a convenience wrapper for:

        >>> S = librosa.feature.inverse.mel_to_stft(M)
        >>> y = librosa.griffinlim(S)

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``M``

    See Also
    --------
    librosa.griffinlim
    librosa.feature.melspectrogram
    librosa.filters.mel
    librosa.feature.inverse.mel_to_stft
    """
    # Stage 1: Mel power spectrogram -> approximate linear magnitudes.
    # Only the filter-bank options in ``kwargs`` are forwarded here.
    magnitude = mel_to_stft(M, sr=sr, n_fft=n_fft, power=power, **kwargs)

    # Stage 2: phase retrieval.  The STFT geometry is passed through untouched.
    griffinlim_options = dict(
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        pad_mode=pad_mode,
    )
    return griffinlim(magnitude, **griffinlim_options)
|
||||
|
||||
|
||||
def mfcc_to_mel(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
) -> np.ndarray:
    """Invert Mel-frequency cepstral coefficients to approximate a Mel power
    spectrogram.

    This inversion proceeds in three steps:

        1. If ``lifter > 0``, the liftering weights are divided out of the MFCCs
        2. The inverse DCT is applied to the MFCCs
        3. `librosa.db_to_power` is applied to map the dB-scaled result to a
           power spectrogram

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    Returns
    -------
    M : np.ndarray [shape=(..., n_mels, n)]
        An approximate Mel power spectrum recovered from ``mfcc``

    Raises
    ------
    ParameterError
        If ``lifter`` is neither positive nor equal to zero

    Warns
    -----
    UserWarning
        If any liftering weight is smaller in magnitude than machine epsilon,
        in which case the division below may be numerically unreliable.

    See Also
    --------
    librosa.feature.mfcc
    librosa.feature.melspectrogram
    scipy.fft.dct
    """
    undo_lifter = lifter > 0

    # Anything that is neither positive nor exactly zero is rejected
    if not undo_lifter and lifter != 0:
        raise ParameterError("MFCC to mel lifter must be a non-negative number.")

    if undo_lifter:
        # 1-based coefficient index, broadcast along the coefficient axis
        coeff_index = expand_to(
            np.arange(1, 1 + mfcc.shape[-2], dtype=mfcc.dtype),
            ndim=mfcc.ndim,
            axes=-2,
        )
        weights = 1 + lifter * 0.5 * np.sin(np.pi * coeff_index / lifter)

        # Weights this close to zero make the inverse filter ill-conditioned
        eps = np.finfo(weights.dtype).eps
        if np.any(np.abs(weights) < eps):
            warnings.warn(
                message="lifter array includes critical values that may invoke underflow.",
                category=UserWarning,
                stacklevel=2,
            )

        # Divide the weights out; ``tiny`` offsets the denominator
        mfcc = mfcc / (weights + tiny(mfcc))

    # Cepstrum -> log-Mel spectrum (dB) -> Mel power spectrum
    log_mel = get_fftlib().idct(mfcc, axis=-2, type=dct_type, norm=norm, n=n_mels)
    mel_power: np.ndarray = db_to_power(log_mel, ref=ref)
    return mel_power
|
||||
|
||||
|
||||
def mfcc_to_audio(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
    **kwargs: Any,
) -> np.ndarray:
    """Convert Mel-frequency cepstral coefficients to a time-domain audio signal

    This function is primarily a convenience wrapper for the following steps:

        1. Convert mfcc to Mel power spectrum (`mfcc_to_mel`)
        2. Convert Mel power spectrum to time-domain audio (`mel_to_audio`)

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
        This argument always refers to the DCT; it is never forwarded to the
        Mel filter bank.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    **kwargs : additional keyword arguments to pass through to `mel_to_audio`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        A time-domain signal reconstructed from `mfcc`

    See Also
    --------
    mfcc_to_mel
    mel_to_audio
    librosa.feature.mfcc
    librosa.griffinlim
    scipy.fft.dct
    """
    # The cepstral options are consumed here; everything left in ``kwargs``
    # belongs to the Mel -> audio stage.
    return mel_to_audio(
        mfcc_to_mel(
            mfcc,
            n_mels=n_mels,
            dct_type=dct_type,
            norm=norm,
            ref=ref,
            lifter=lifter,
        ),
        **kwargs,
    )
|
||||
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Rhythmic feature extraction"""
|
||||
|
||||
import numpy as np
|
||||
import scipy
|
||||
|
||||
from .. import util
|
||||
|
||||
from .._cache import cache
|
||||
from ..core.audio import autocorrelate
|
||||
from ..core.spectrum import stft
|
||||
from ..core.convert import tempo_frequencies, time_to_frames
|
||||
from ..core.harmonic import f0_harmonics
|
||||
from ..util.exceptions import ParameterError
|
||||
from ..filters import get_window
|
||||
from typing import Optional, Callable, Any
|
||||
from .._typing import _WindowSpec
|
||||
|
||||
__all__ = ["tempogram", "fourier_tempogram", "tempo", "tempogram_ratio"]
|
||||
|
||||
|
||||
# -- Rhythmic features -- #
|
||||
def tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Compute the tempogram: local autocorrelation of the onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
        Only used when ``onset_envelope`` is not provided.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    onset_envelope : np.ndarray [shape=(..., n) or (..., m, n)] or None
        Optional pre-computed onset strength envelope as provided by
        `librosa.onset.onset_strength`.

        If multi-dimensional, tempograms are computed independently for each
        band (first dimension).

    hop_length : int > 0
        number of audio samples between successive onset measurements

    win_length : int > 0
        length of the onset autocorrelation window (in frames/onset measurements)
        The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.

    center : bool
        If `True`, onset autocorrelation windows are centered.
        If `False`, windows are left-aligned.

    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.

    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode.  Set to `None` to disable normalization.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length, n)]
        Localized autocorrelation of the onset strength envelope.

        If given multi-band input (``onset_envelope.shape==(m,n)``) then
        ``tempogram[i]`` is the tempogram of ``onset_envelope[i]``.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided

        if ``win_length < 1``

    See Also
    --------
    fourier_tempogram
    librosa.onset.onset_strength
    librosa.util.normalize
    librosa.stft

    Examples
    --------
    >>> # Compute local onset autocorrelation
    >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
    >>> hop_length = 512
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    >>> tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
    ...                                       hop_length=hop_length)
    >>> # Compute global onset autocorrelation
    >>> ac_global = librosa.autocorrelate(oenv, max_size=tempogram.shape[0])
    >>> ac_global = librosa.util.normalize(ac_global)
    >>> # Estimate the global tempo for display purposes
    >>> tempo = librosa.feature.tempo(onset_envelope=oenv, sr=sr,
    ...                               hop_length=hop_length)[0]

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=4, figsize=(10, 10))
    >>> times = librosa.times_like(oenv, sr=sr, hop_length=hop_length)
    >>> ax[0].plot(times, oenv, label='Onset strength')
    >>> ax[0].label_outer()
    >>> ax[0].legend(frameon=True)
    >>> librosa.display.specshow(tempogram, sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='tempo', cmap='magma',
    ...                          ax=ax[1])
    >>> ax[1].axhline(tempo, color='w', linestyle='--', alpha=1,
    ...             label='Estimated tempo={:g}'.format(tempo))
    >>> ax[1].legend(loc='upper right')
    >>> ax[1].set(title='Tempogram')
    >>> x = np.linspace(0, tempogram.shape[0] * float(hop_length) / sr,
    ...                 num=tempogram.shape[0])
    >>> ax[2].plot(x, np.mean(tempogram, axis=1), label='Mean local autocorrelation')
    >>> ax[2].plot(x, ac_global, '--', alpha=0.75, label='Global autocorrelation')
    >>> ax[2].set(xlabel='Lag (seconds)')
    >>> ax[2].legend(frameon=True)
    >>> freqs = librosa.tempo_frequencies(tempogram.shape[0], hop_length=hop_length, sr=sr)
    >>> ax[3].semilogx(freqs[1:], np.mean(tempogram[1:], axis=1),
    ...                label='Mean local autocorrelation', base=2)
    >>> ax[3].semilogx(freqs[1:], ac_global[1:], '--', alpha=0.75,
    ...                label='Global autocorrelation', base=2)
    >>> ax[3].axvline(tempo, color='black', linestyle='--', alpha=.8,
    ...               label='Estimated tempo={:g}'.format(tempo))
    >>> ax[3].legend(frameon=True)
    >>> ax[3].set(xlabel='BPM')
    >>> ax[3].grid(True)
    """
    # Imported at call time rather than at module import time
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    # Built before the input check so that a bad window specification is
    # reported ahead of a missing-input error.
    taper = get_window(window, win_length, fftbins=True)

    if onset_envelope is None:
        if y is None:
            raise ParameterError("Either y or onset_envelope must be provided")

        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Remember the un-padded length so the output can be trimmed back to it
    n_frames = onset_envelope.shape[-1]

    if center:
        # Pad only the trailing (time) axis, ramping linearly down to zero
        half = int(win_length // 2)
        pad_widths = [(0, 0)] * (onset_envelope.ndim - 1) + [(half, half)]
        onset_envelope = np.pad(
            onset_envelope, pad_widths, mode="linear_ramp", end_values=[0, 0]
        )

    # One analysis window per onset measurement (hop of a single frame)
    windows = util.frame(onset_envelope, frame_length=win_length, hop_length=1)

    if center:
        # Padding produced surplus windows; keep one per original frame
        windows = windows[..., :n_frames]

    # Make the taper broadcast explicitly along the lag axis
    taper = util.expand_to(taper, ndim=windows.ndim, axes=-2)

    # Window, autocorrelate, and normalize
    local_ac = autocorrelate(windows * taper, axis=-2)
    return util.normalize(local_ac, norm=norm, axis=-2)
|
||||
|
||||
|
||||
def fourier_tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
) -> np.ndarray:
    """Compute the Fourier tempogram: the short-time Fourier transform of the
    onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
        Only used when ``onset_envelope`` is not provided.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., n)] or None
        Optional pre-computed onset strength envelope as provided by
        ``librosa.onset.onset_strength``.
        Multi-channel is supported.
    hop_length : int > 0
        number of audio samples between successive onset measurements
    win_length : int > 0
        length of the onset window (in frames/onset measurements)
        The default settings (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
    center : bool
        If `True`, onset windows are centered.
        If `False`, windows are left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length // 2 + 1, n)]
        Complex short-time Fourier transform of the onset envelope.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided

        if ``win_length < 1``

    See Also
    --------
    tempogram
    librosa.onset.onset_strength
    librosa.util.normalize
    librosa.stft

    Examples
    --------
    >>> # Compute the Fourier tempogram of the onset envelope
    >>> y, sr = librosa.load(librosa.ex('nutcracker'))
    >>> hop_length = 512
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    >>> tempogram = librosa.feature.fourier_tempogram(onset_envelope=oenv, sr=sr,
    ...                                               hop_length=hop_length)
    >>> # Compute the auto-correlation tempogram, unnormalized to make comparison easier
    >>> ac_tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
    ...                                          hop_length=hop_length, norm=None)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> ax[0].plot(librosa.times_like(oenv), oenv, label='Onset strength')
    >>> ax[0].legend(frameon=True)
    >>> ax[0].label_outer()
    >>> librosa.display.specshow(np.abs(tempogram), sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='fourier_tempo', cmap='magma',
    ...                          ax=ax[1])
    >>> ax[1].set(title='Fourier tempogram')
    >>> ax[1].label_outer()
    >>> librosa.display.specshow(ac_tempogram, sr=sr, hop_length=hop_length,
    ...                          x_axis='time', y_axis='tempo', cmap='magma',
    ...                          ax=ax[2])
    >>> ax[2].set(title='Autocorrelation tempogram')
    """
    # Imported at call time rather than at module import time
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    have_envelope = onset_envelope is not None
    if not have_envelope and y is None:
        raise ParameterError("Either y or onset_envelope must be provided")

    if not have_envelope:
        # Fall back to deriving the envelope from the raw audio
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # The tempogram is the STFT of the envelope, advanced one frame at a time
    stft_options = dict(n_fft=win_length, hop_length=1, center=center, window=window)
    return stft(onset_envelope, **stft_options)
|
||||
|
||||
|
||||
@cache(level=30)
def tempo(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    hop_length: int = 512,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    ac_size: float = 8.0,
    max_tempo: Optional[float] = 320.0,
    aggregate: Optional[Callable[..., Any]] = np.mean,
    prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
    """Estimate the tempo (beats per minute)

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of the time series
    onset_envelope : np.ndarray [shape=(..., n)]
        pre-computed onset strength envelope
    tg : np.ndarray
        pre-computed tempogram.  If provided, then `y` and
        `onset_envelope` are ignored, and the autocorrelation window
        length is taken from ``tg.shape[-2]`` instead of being derived
        from ``ac_size``.
    hop_length : int > 0 [scalar]
        hop length of the time series
    start_bpm : float > 0 [scalar]
        initial guess of the BPM
    std_bpm : float > 0 [scalar]
        standard deviation of tempo distribution
    ac_size : float > 0 [scalar]
        length (in seconds) of the auto-correlation window
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold
    aggregate : callable [optional]
        Aggregation function for estimating global tempo.
        It is called as ``aggregate(tg, axis=-1, keepdims=True)``.
        If `None`, then tempo is estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a pseudo-log-normal prior is used.
        If given, ``start_bpm`` and ``std_bpm`` will be ignored.

    Returns
    -------
    tempo : np.ndarray
        estimated tempo (beats per minute).
        If input is multi-channel, one tempo estimate per channel is provided.

    Raises
    ------
    ParameterError
        if ``start_bpm <= 0``

    See Also
    --------
    librosa.onset.onset_strength
    librosa.feature.tempogram

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> # Estimate a static tempo
    >>> y, sr = librosa.load(librosa.ex('nutcracker'), duration=30)
    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr)
    >>> tempo
    array([143.555])

    >>> # Or a static tempo with a uniform prior instead
    >>> import scipy.stats
    >>> prior = scipy.stats.uniform(30, 300)  # uniform over 30-300 BPM
    >>> utempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr, prior=prior)
    >>> utempo
    array([161.499])

    >>> # Or a dynamic tempo
    >>> dtempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
    ...                                aggregate=None)
    >>> dtempo
    array([ 89.103,  89.103,  89.103, ..., 123.047, 123.047, 123.047])

    >>> # Dynamic tempo with a proper log-normal prior
    >>> prior_lognorm = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
    >>> dtempo_lognorm = librosa.feature.tempo(onset_envelope=onset_env, sr=sr,
    ...                                        aggregate=None,
    ...                                        prior=prior_lognorm)
    >>> dtempo_lognorm
    array([ 89.103,  89.103,  89.103, ..., 123.047, 123.047, 123.047])

    Plot the estimated tempo against the onset autocorrelation

    >>> import matplotlib.pyplot as plt
    >>> # Convert to scalar
    >>> tempo = tempo.item()
    >>> utempo = utempo.item()
    >>> # Compute 2-second windowed autocorrelation
    >>> hop_length = 512
    >>> ac = librosa.autocorrelate(onset_env, max_size=2 * sr // hop_length)
    >>> freqs = librosa.tempo_frequencies(len(ac), sr=sr,
    ...                                   hop_length=hop_length)
    >>> # Plot on a BPM axis.  We skip the first (0-lag) bin.
    >>> fig, ax = plt.subplots()
    >>> ax.semilogx(freqs[1:], librosa.util.normalize(ac)[1:],
    ...              label='Onset autocorrelation', base=2)
    >>> ax.axvline(tempo, 0, 1, alpha=0.75, linestyle='--', color='r',
    ...             label='Tempo (default prior): {:.2f} BPM'.format(tempo))
    >>> ax.axvline(utempo, 0, 1, alpha=0.75, linestyle=':', color='g',
    ...             label='Tempo (uniform prior): {:.2f} BPM'.format(utempo))
    >>> ax.set(xlabel='Tempo (BPM)', title='Static tempo estimation')
    >>> ax.grid(True)
    >>> ax.legend()

    Plot dynamic tempo estimates over a tempogram

    >>> fig, ax = plt.subplots()
    >>> tg = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr,
    ...                                hop_length=hop_length)
    >>> librosa.display.specshow(tg, x_axis='time', y_axis='tempo', cmap='magma', ax=ax)
    >>> ax.plot(librosa.times_like(dtempo), dtempo,
    ...          color='c', linewidth=1.5, label='Tempo estimate (default prior)')
    >>> ax.plot(librosa.times_like(dtempo_lognorm), dtempo_lognorm,
    ...          color='c', linewidth=1.5, linestyle='--',
    ...          label='Tempo estimate (lognorm prior)')
    >>> ax.set(title='Dynamic tempo estimation')
    >>> ax.legend()
    """
    if start_bpm <= 0:
        raise ParameterError("start_bpm must be strictly positive")

    if tg is not None:
        # A tempogram was supplied: its lag axis dictates the window length
        win_length = tg.shape[-2]
    else:
        # Convert the window duration (seconds) into a number of frames
        win_length = time_to_frames(ac_size, sr=sr, hop_length=hop_length).item()

        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
        )

    # Collapse the time axis for a global estimate; otherwise stay per-frame
    if aggregate is not None:
        tg = aggregate(tg, axis=-1, keepdims=True)

    assert tg is not None

    # BPM value associated with each lag bin of the tempogram
    bpms = tempo_frequencies(win_length, hop_length=hop_length, sr=sr)

    # Log-weight for every candidate tempo
    if prior is not None:
        logprior = prior.logpdf(bpms)
    else:
        # Default: Gaussian in log2-BPM centred on ``start_bpm``
        logprior = -0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm) ** 2

    # Rule out every bin that precedes the first one below ``max_tempo``
    if max_tempo is not None:
        max_idx = int(np.argmax(bpms < max_tempo))
        logprior[:max_idx] = -np.inf

    # Align the prior with the lag axis of the tempogram for broadcasting
    logprior = util.expand_to(logprior, ndim=tg.ndim, axes=-2)

    # Pick the lag with the largest prior-weighted autocorrelation.
    # log1p is used here for numerical stability.
    best_period = np.argmax(np.log1p(1e6 * tg) + logprior, axis=-2)

    tempo_est: np.ndarray = np.take(bpms, best_period)
    return tempo_est
|
||||
|
||||
|
||||
@cache(level=40)
def tempogram_ratio(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    bpm: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    max_tempo: Optional[float] = 320.0,
    freqs: Optional[np.ndarray] = None,
    factors: Optional[np.ndarray] = None,
    aggregate: Optional[Callable[..., Any]] = None,
    prior: Optional[scipy.stats.rv_continuous] = None,
    center: bool = True,
    window: _WindowSpec = "hann",
    kind: str = "linear",
    fill_value: float = 0,
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Tempogram ratio features, also known as spectral rhythm patterns. [1]_

    This function summarizes the energy at metrically important multiples
    of the tempo.  For example, if the tempo corresponds to the quarter-note
    period, the tempogram ratio will measure the energy at the eighth note,
    sixteenth note, half note, whole note, etc. periods, as well as dotted
    and triplet ratios.

    By default, the multiplicative factors used here are as specified by
    [2]_.  If the estimated tempo corresponds to a quarter note, these factors
    will measure relative energy at the following metrical subdivisions:

    +-------+--------+------------------+
    | Index | Factor | Description      |
    +=======+========+==================+
    | 0     | 4      | Sixteenth note   |
    +-------+--------+------------------+
    | 1     | 8/3    | Dotted sixteenth |
    +-------+--------+------------------+
    | 2     | 3      | Eighth triplet   |
    +-------+--------+------------------+
    | 3     | 2      | Eighth note      |
    +-------+--------+------------------+
    | 4     | 4/3    | Dotted eighth    |
    +-------+--------+------------------+
    | 5     | 3/2    | Quarter triplet  |
    +-------+--------+------------------+
    | 6     | 1      | Quarter note     |
    +-------+--------+------------------+
    | 7     | 2/3    | Dotted quarter   |
    +-------+--------+------------------+
    | 8     | 3/4    | Half triplet     |
    +-------+--------+------------------+
    | 9     | 1/2    | Half note        |
    +-------+--------+------------------+
    | 10    | 1/3    | Dotted half note |
    +-------+--------+------------------+
    | 11    | 3/8    | Whole triplet    |
    +-------+--------+------------------+
    | 12    | 1/4    | Whole note       |
    +-------+--------+------------------+

    .. [1] Peeters, Geoffroy.
        "Rhythm Classification Using Spectral Rhythm Patterns."
        In ISMIR, pp. 644-647. 2005.

    .. [2] Prockup, Matthew, Andreas F. Ehmann, Fabien Gouyon, Erik M. Schmidt, and Youngmoo E. Kim.
        "Modeling musical rhythm at scale with the music genome project."
        In 2015 IEEE workshop on applications of signal processing to audio and acoustics (WASPAA), pp. 1-5. IEEE, 2015.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series
    sr : number > 0 [scalar]
        sampling rate of the time series
    onset_envelope : np.ndarray [shape=(..., n)]
        pre-computed onset strength envelope
    tg : np.ndarray
        pre-computed tempogram.  If provided, then `y` and
        `onset_envelope` are ignored, and `win_length` is
        inferred from the shape of the tempogram.
        The lag (frequency) axis is taken to be the second-to-last
        axis, ``tg.shape[-2]``, so leading (channel) axes are allowed.
    bpm : np.ndarray
        pre-computed tempo estimate.  This must be a per-frame
        estimate, and have dimension compatible with `tg`.
        If not provided, it is estimated per-frame by `tempo`.
    hop_length : int > 0 [scalar]
        hop length of the time series
    win_length : int > 0 [scalar]
        window length of the autocorrelation window for tempogram
        calculation.  Only used when `tg` is not provided.
    start_bpm : float [scalar]
        initial guess of the BPM if `bpm` is not provided
    std_bpm : float > 0 [scalar]
        standard deviation of tempo distribution
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold
    freqs : np.ndarray
        Frequencies (in BPM) of the tempogram axis.
        If not provided, they are computed by `tempo_frequencies`
        using ``tg.shape[-2]`` bins.
    factors : np.ndarray
        Multiples of the fundamental tempo (bpm) to estimate.
        If not provided, the factors are as specified above.
    aggregate : callable [optional]
        Aggregation function for estimating global tempogram ratio.
        It is called as ``aggregate(tgr, axis=-1)``.
        If `None`, then ratios are estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a pseudo-log-normal prior is used.
        If given, ``start_bpm`` and ``std_bpm`` will be ignored.
    center : bool
        If `True`, onset windows are centered.
        If `False`, windows are left-aligned.
        Only used when `tg` is not provided.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification as in `stft`.
        Only used when `tg` is not provided.
    kind : str
        Interpolation mode for measuring tempogram ratios
    fill_value : float
        The value to fill when extrapolating beyond the observed
        frequency range.
    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode.  Set to `None` to disable normalization.
        Only used when `tg` is not provided.

    Returns
    -------
    tgr : np.ndarray
        The tempogram ratio for the specified factors.
        If `aggregate` is provided, the trailing time axis
        will be removed.
        If `aggregate` is not provided (default), ratios
        will be estimated for each frame.

    See Also
    --------
    tempogram
    tempo
    librosa.f0_harmonics
    librosa.tempo_frequencies

    Examples
    --------
    Compute tempogram ratio features using the default factors
    for a waltz (3/4 time)

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('sweetwaltz'))
    >>> tempogram = librosa.feature.tempogram(y=y, sr=sr)
    >>> tgr = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> librosa.display.specshow(tempogram, x_axis='time', y_axis='tempo',
    ...                          ax=ax[0])
    >>> librosa.display.specshow(tgr, x_axis='time', ax=ax[1])
    >>> ax[0].label_outer()
    >>> ax[0].set(title="Tempogram")
    >>> ax[1].set(title="Tempogram ratio")
    """
    # Get a tempogram and time-varying tempo estimate
    if tg is None:
        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
            center=center,
            window=window,
            norm=norm,
        )

    if freqs is None:
        # The lag axis is the second-to-last axis of the tempogram.
        # Using len(tg) here would pick up the leading (channel) axis
        # for multichannel input and produce the wrong number of bins.
        freqs = tempo_frequencies(sr=sr, n_bins=tg.shape[-2], hop_length=hop_length)

    # Estimate tempo per-frame, no aggregation yet
    if bpm is None:
        bpm = tempo(
            sr=sr,
            tg=tg,
            hop_length=hop_length,
            start_bpm=start_bpm,
            std_bpm=std_bpm,
            max_tempo=max_tempo,
            aggregate=None,
            prior=prior,
        )

    if factors is None:
        # metric multiples from Prockup'15
        factors = np.array(
            [4, 8 / 3, 3, 2, 4 / 3, 3 / 2, 1, 2 / 3, 3 / 4, 1 / 2, 1 / 3, 3 / 8, 1 / 4]
        )

    # Sample the tempogram at each multiple of the per-frame tempo
    tgr = f0_harmonics(
        tg, freqs=freqs, f0=bpm, harmonics=factors, kind=kind, fill_value=fill_value
    )

    if aggregate is not None:
        # Collapse the trailing time axis into a single global estimate
        return aggregate(tgr, axis=-1)  # type: ignore

    return tgr
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,310 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Feature manipulation utilities"""
|
||||
|
||||
import numpy as np
|
||||
import scipy.signal
|
||||
from numba import jit
|
||||
|
||||
from .._cache import cache
|
||||
from ..util.exceptions import ParameterError
|
||||
from typing import Any
|
||||
|
||||
# Names exported by ``from <this module> import *``
__all__ = ["delta", "stack_memory"]
|
||||
|
||||
|
||||
@cache(level=40)
def delta(
    data: np.ndarray,
    *,
    width: int = 9,
    order: int = 1,
    axis: int = -1,
    mode: str = "interp",
    **kwargs: Any,
) -> np.ndarray:
    r"""Compute delta features: local estimate of the derivative
    of the input data along the selected axis.

    Delta features are computed using Savitzky-Golay filtering.

    Parameters
    ----------
    data : np.ndarray
        the input data matrix (eg, spectrogram)

    width : int, positive, odd [scalar]
        Number of frames over which to compute the delta features.
        Must be an odd integer >= 3.

        If ``mode='interp'``, then ``width`` cannot exceed ``data.shape[axis]``.

    order : int > 0 [scalar]
        the order of the difference operator.
        1 for first derivative, 2 for second, etc.

    axis : int [scalar]
        the axis along which to compute deltas.
        Default is -1 (columns).

    mode : str, {'interp', 'nearest', 'mirror', 'constant', 'wrap'}
        Padding mode for estimating differences at the boundaries.

    **kwargs : additional keyword arguments
        See `scipy.signal.savgol_filter`.
        A user-supplied ``deriv`` is discarded (it is determined by ``order``),
        and ``polyorder`` defaults to ``order`` if not given.

    Returns
    -------
    delta_data : np.ndarray [shape=(..., t)]
        delta matrix of ``data`` at specified order

    Raises
    ------
    ParameterError
        If ``mode='interp'`` and ``width > data.shape[axis]``,
        if ``width`` is not an odd number >= 3,
        or if ``order`` is not a positive integer.

    Notes
    -----
    This function caches at level 40.

    See Also
    --------
    scipy.signal.savgol_filter

    Examples
    --------
    Compute MFCC deltas, delta-deltas

    >>> y, sr = librosa.load(librosa.ex('libri1'), duration=5)
    >>> mfcc = librosa.feature.mfcc(y=y, sr=sr)
    >>> mfcc_delta = librosa.feature.delta(mfcc)
    >>> mfcc_delta
    array([[-5.713e+02, -5.697e+02, ..., -1.522e+02, -1.224e+02],
           [ 1.104e+01,  1.330e+01, ...,  2.089e+02,  1.698e+02],
           ...,
           [ 2.829e+00,  1.933e+00, ..., -3.149e+00,  2.294e-01],
           [ 2.890e+00,  2.187e+00, ...,  6.959e+00, -1.039e+00]],
          dtype=float32)

    >>> mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    >>> mfcc_delta2
    array([[-1.195, -1.195, ..., -4.328, -4.328],
           [-1.566, -1.566, ..., -9.949, -9.949],
           ...,
           [ 0.707,  0.707, ...,  2.287,  2.287],
           [ 0.655,  0.655, ..., -1.719, -1.719]], dtype=float32)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> img1 = librosa.display.specshow(mfcc, ax=ax[0], x_axis='time')
    >>> ax[0].set(title='MFCC')
    >>> ax[0].label_outer()
    >>> img2 = librosa.display.specshow(mfcc_delta, ax=ax[1], x_axis='time')
    >>> ax[1].set(title=r'MFCC-$\Delta$')
    >>> ax[1].label_outer()
    >>> img3 = librosa.display.specshow(mfcc_delta2, ax=ax[2], x_axis='time')
    >>> ax[2].set(title=r'MFCC-$\Delta^2$')
    >>> fig.colorbar(img1, ax=[ax[0]])
    >>> fig.colorbar(img2, ax=[ax[1]])
    >>> fig.colorbar(img3, ax=[ax[2]])
    """
    data = np.atleast_1d(data)

    # Interpolating boundaries fits a polynomial to the first/last `width`
    # samples, so the window cannot be longer than the data itself.
    if mode == "interp" and width > data.shape[axis]:
        raise ParameterError(
            f"when mode='interp', width={width} "
            f"cannot exceed data.shape[axis]={data.shape[axis]}"
        )

    if width < 3 or np.mod(width, 2) != 1:
        raise ParameterError("width must be an odd integer >= 3")

    # Check the type before comparing, so that a non-numeric `order`
    # is reported as a ParameterError rather than a raw TypeError.
    if not isinstance(order, (int, np.integer)) or order <= 0:
        raise ParameterError("order must be a positive integer")

    # The derivative order is controlled exclusively by `order`
    kwargs.pop("deriv", None)
    # The fitted polynomial must have degree >= the derivative order
    kwargs.setdefault("polyorder", order)

    result: np.ndarray = scipy.signal.savgol_filter(
        data, width, deriv=order, axis=axis, mode=mode, **kwargs
    )
    return result
|
||||
|
||||
|
||||
@cache(level=40)
def stack_memory(
    data: np.ndarray, *, n_steps: int = 2, delay: int = 1, **kwargs: Any
) -> np.ndarray:
    """Short-term history embedding: vertically concatenate a data
    vector or matrix with delayed copies of itself.

    Each column ``data[:, i]`` is mapped to::

        data[..., i] ->  [data[..., i],
                          data[..., i - delay],
                          ...
                          data[..., i - (n_steps-1)*delay]]

    For columns ``i < (n_steps - 1) * delay``, the data will be padded.
    By default, the data is padded with zeros, but this behavior can be
    overridden by supplying additional keyword arguments which are passed
    to `np.pad()`.

    Parameters
    ----------
    data : np.ndarray [shape=(..., d, t)]
        Input data matrix.  If ``data`` is a vector (``data.ndim == 1``),
        it will be interpreted as a row matrix and reshaped to ``(1, t)``.

    n_steps : int > 0 [scalar]
        embedding dimension, the number of steps back in time to stack

    delay : int != 0 [scalar]
        the number of columns to step.

        Positive values embed from the past (previous columns).

        Negative values embed from the future (subsequent columns).

    **kwargs : additional keyword arguments
        Additional arguments to pass to `numpy.pad`.
        ``mode`` defaults to ``'constant'``, in which case
        ``constant_values`` defaults to ``[0]``.

    Returns
    -------
    data_history : np.ndarray [shape=(..., m * d, t)]
        data augmented with lagged copies of itself,
        where ``m == n_steps``.

    Raises
    ------
    ParameterError
        If ``n_steps < 1``, if ``delay == 0``, or if ``data`` has no columns.

    Notes
    -----
    This function caches at level 40.

    Examples
    --------
    Keep two steps (current and previous)

    >>> data = np.arange(-3, 3)
    >>> librosa.feature.stack_memory(data)
    array([[-3, -2, -1,  0,  1,  2],
           [ 0, -3, -2, -1,  0,  1]])

    Or three steps

    >>> librosa.feature.stack_memory(data, n_steps=3)
    array([[-3, -2, -1,  0,  1,  2],
           [ 0, -3, -2, -1,  0,  1],
           [ 0,  0, -3, -2, -1,  0]])

    Use reflection padding instead of zero-padding

    >>> librosa.feature.stack_memory(data, n_steps=3, mode='reflect')
    array([[-3, -2, -1,  0,  1,  2],
           [-2, -3, -2, -1,  0,  1],
           [-1, -2, -3, -2, -1,  0]])

    Or pad with edge-values, and delay by 2

    >>> librosa.feature.stack_memory(data, n_steps=3, delay=2, mode='edge')
    array([[-3, -2, -1,  0,  1,  2],
           [-3, -3, -3, -2, -1,  0],
           [-3, -3, -3, -3, -3, -2]])

    Stack time-lagged beat-synchronous chroma with edge padding

    >>> y, sr = librosa.load(librosa.ex('sweetwaltz'), duration=10)
    >>> chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=512)
    >>> beats = librosa.util.fix_frames(beats, x_min=0)
    >>> chroma_sync = librosa.util.sync(chroma, beats)
    >>> chroma_lag = librosa.feature.stack_memory(chroma_sync, n_steps=3,
    ...                                           mode='edge')

    Plot the result

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=512)
    >>> librosa.display.specshow(chroma_lag, y_axis='chroma', x_axis='time',
    ...                          x_coords=beat_times, ax=ax)
    >>> ax.text(1.0, 1/6, "Lag=0", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.text(1.0, 3/6, "Lag=1", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.text(1.0, 5/6, "Lag=2", transform=ax.transAxes, rotation=-90, ha="left", va="center")
    >>> ax.set(title='Time-lagged chroma', ylabel="")
    """
    if n_steps < 1:
        raise ParameterError("n_steps must be a positive integer")

    if delay == 0:
        raise ParameterError("delay must be a non-zero integer")

    data = np.atleast_2d(data)
    n_frames = data.shape[-1]

    if n_frames < 1:
        raise ParameterError(
            "Cannot stack memory when input data has "
            f"no columns. Given data.shape={data.shape}"
        )

    # Zero-padding unless the caller asked for something else
    kwargs.setdefault("mode", "constant")
    if kwargs["mode"] == "constant":
        kwargs.setdefault("constant_values", [0])

    # Total number of extra frames needed to reach the furthest lag
    extra = int((n_steps - 1) * abs(delay))

    # Only the time axis is padded: at the front when looking into the
    # past (delay > 0), at the end when looking into the future (delay < 0).
    time_pad = (extra, 0) if delay > 0 else (0, extra)
    pad_widths = [(0, 0)] * (data.ndim - 1) + [time_pad]

    padded = np.pad(data, pad_widths, **kwargs)

    # Output has n_steps stacked copies of the feature axis and the
    # original number of frames; layout and dtype follow the padded input.
    out_shape = padded.shape[:-2] + (padded.shape[-2] * n_steps, n_frames)
    history = np.empty_like(padded, shape=out_shape)

    # Fill the output in place
    __stack(history, padded, n_steps, delay)

    return history
|
||||
|
||||
|
||||
@jit(nopython=True, cache=True)
def __stack(history, data, n_steps, delay):
    """Memory-stacking helper function.

    Parameters
    ----------
    history : output array; trailing axes are (n_steps * d, t)
    data : pre-padded input array; trailing axes are (d, padded t)
    n_steps : int > 0, the number of steps to stack
    delay : int != 0, the amount of delay between steps

    Returns
    -------
    None
        Output is stored directly in the history array
    """
    # Number of rows in each stacked copy of the data
    rows = data.shape[-2]

    # Number of frames in the output
    n_frames = history.shape[-1]

    if delay > 0:
        for block in range(n_steps):
            # Block 0 starts after all of the front padding (the undelayed
            # data); each later block starts `delay` frames earlier.
            start = (n_steps - 1 - block) * delay
            history[..., block * rows : (block + 1) * rows, :] = data[
                ..., start : start + n_frames
            ]
        return

    # Negative delay: the final block is the last n_frames columns.
    # It is written separately because the general slice below would
    # end at index 0 and therefore be empty.
    history[..., -rows:, :] = data[..., -n_frames:]

    for block in range(n_steps - 1):
        # `stop` is a negative end index measured from the end of the
        # padded data; earlier blocks end further from the end.
        stop = (n_steps - 1 - block) * delay
        history[..., block * rows : (block + 1) * rows, :] = data[
            ..., stop - n_frames : stop
        ]
|
||||
Reference in New Issue
Block a user