This commit is contained in:
2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Core IO and DSP
===============

Audio loading
-------------
.. autosummary::
    :toctree: generated/

    load
    stream
    to_mono
    resample
    get_duration
    get_samplerate

Time-domain processing
----------------------
.. autosummary::
    :toctree: generated/

    autocorrelate
    lpc
    zero_crossings
    mu_compress
    mu_expand

Signal generation
-----------------
.. autosummary::
    :toctree: generated/

    clicks
    tone
    chirp

Spectral representations
------------------------
.. autosummary::
    :toctree: generated/

    stft
    istft
    reassigned_spectrogram
    cqt
    icqt
    hybrid_cqt
    pseudo_cqt
    vqt
    iirt
    fmt
    magphase

Phase recovery
--------------
.. autosummary::
    :toctree: generated/

    griffinlim
    griffinlim_cqt

Harmonics
---------
.. autosummary::
    :toctree: generated/

    interp_harmonics
    salience
    f0_harmonics
    phase_vocoder

Magnitude scaling
-----------------
.. autosummary::
    :toctree: generated/

    amplitude_to_db
    db_to_amplitude
    power_to_db
    db_to_power
    perceptual_weighting
    frequency_weighting
    multi_frequency_weighting
    A_weighting
    B_weighting
    C_weighting
    D_weighting
    pcen

Time unit conversion
--------------------
.. autosummary::
    :toctree: generated/

    frames_to_samples
    frames_to_time
    samples_to_frames
    samples_to_time
    time_to_frames
    time_to_samples
    blocks_to_frames
    blocks_to_samples
    blocks_to_time

Frequency unit conversion
-------------------------
.. autosummary::
    :toctree: generated/

    hz_to_note
    hz_to_midi
    hz_to_svara_h
    hz_to_svara_c
    hz_to_fjs
    midi_to_hz
    midi_to_note
    midi_to_svara_h
    midi_to_svara_c
    note_to_hz
    note_to_midi
    note_to_svara_h
    note_to_svara_c
    hz_to_mel
    hz_to_octs
    mel_to_hz
    octs_to_hz
    A4_to_tuning
    tuning_to_A4

Music notation
--------------
.. autosummary::
    :toctree: generated/

    key_to_notes
    key_to_degrees
    mela_to_svara
    mela_to_degrees
    thaat_to_degrees
    list_mela
    list_thaat
    fifths_to_note
    interval_to_fjs
    interval_frequencies
    pythagorean_intervals
    plimit_intervals

Frequency range generation
--------------------------
.. autosummary::
    :toctree: generated/

    fft_frequencies
    cqt_frequencies
    mel_frequencies
    tempo_frequencies
    fourier_tempo_frequencies

Pitch and tuning
----------------
.. autosummary::
    :toctree: generated/

    pyin
    yin
    estimate_tuning
    pitch_tuning
    piptrack

Miscellaneous
-------------
.. autosummary::
    :toctree: generated/

    samples_like
    times_like
    get_fftlib
    set_fftlib
"""
import lazy_loader as lazy
from .version import version as __version__

# Defer all submodule/attribute imports until first access, using the adjacent
# ``.pyi`` stub file as the authoritative list of public names.  This keeps
# ``import librosa`` fast while preserving the full public API surface.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)

View File

@@ -0,0 +1,123 @@
from . import core
from . import beat
from . import decompose
from . import display
from . import effects
from . import feature
from . import filters
from . import onset
from . import segment
from . import sequence
from . import util
from ._cache import cache as cache
from .util.exceptions import (
LibrosaError as LibrosaError,
ParameterError as ParameterError,
)
from .util.files import example as example, ex as ex
from .util.files import cite as cite
from .version import show_versions as show_versions
from .core import (
frames_to_samples as frames_to_samples,
frames_to_time as frames_to_time,
samples_to_frames as samples_to_frames,
samples_to_time as samples_to_time,
time_to_samples as time_to_samples,
time_to_frames as time_to_frames,
blocks_to_samples as blocks_to_samples,
blocks_to_frames as blocks_to_frames,
blocks_to_time as blocks_to_time,
note_to_hz as note_to_hz,
note_to_midi as note_to_midi,
midi_to_hz as midi_to_hz,
midi_to_note as midi_to_note,
hz_to_note as hz_to_note,
hz_to_midi as hz_to_midi,
hz_to_mel as hz_to_mel,
hz_to_octs as hz_to_octs,
hz_to_fjs as hz_to_fjs,
mel_to_hz as mel_to_hz,
octs_to_hz as octs_to_hz,
A4_to_tuning as A4_to_tuning,
tuning_to_A4 as tuning_to_A4,
fft_frequencies as fft_frequencies,
cqt_frequencies as cqt_frequencies,
mel_frequencies as mel_frequencies,
tempo_frequencies as tempo_frequencies,
fourier_tempo_frequencies as fourier_tempo_frequencies,
A_weighting as A_weighting,
B_weighting as B_weighting,
C_weighting as C_weighting,
D_weighting as D_weighting,
Z_weighting as Z_weighting,
frequency_weighting as frequency_weighting,
multi_frequency_weighting as multi_frequency_weighting,
samples_like as samples_like,
times_like as times_like,
midi_to_svara_h as midi_to_svara_h,
midi_to_svara_c as midi_to_svara_c,
note_to_svara_h as note_to_svara_h,
note_to_svara_c as note_to_svara_c,
hz_to_svara_h as hz_to_svara_h,
hz_to_svara_c as hz_to_svara_c,
load as load,
stream as stream,
to_mono as to_mono,
resample as resample,
get_duration as get_duration,
get_samplerate as get_samplerate,
autocorrelate as autocorrelate,
lpc as lpc,
zero_crossings as zero_crossings,
clicks as clicks,
tone as tone,
chirp as chirp,
mu_compress as mu_compress,
mu_expand as mu_expand,
stft as stft,
istft as istft,
magphase as magphase,
iirt as iirt,
reassigned_spectrogram as reassigned_spectrogram,
phase_vocoder as phase_vocoder,
perceptual_weighting as perceptual_weighting,
power_to_db as power_to_db,
db_to_power as db_to_power,
amplitude_to_db as amplitude_to_db,
db_to_amplitude as db_to_amplitude,
fmt as fmt,
pcen as pcen,
griffinlim as griffinlim,
estimate_tuning as estimate_tuning,
pitch_tuning as pitch_tuning,
piptrack as piptrack,
yin as yin,
pyin as pyin,
cqt as cqt,
hybrid_cqt as hybrid_cqt,
pseudo_cqt as pseudo_cqt,
icqt as icqt,
griffinlim_cqt as griffinlim_cqt,
vqt as vqt,
salience as salience,
interp_harmonics as interp_harmonics,
f0_harmonics as f0_harmonics,
get_fftlib as get_fftlib,
set_fftlib as set_fftlib,
key_to_degrees as key_to_degrees,
key_to_notes as key_to_notes,
mela_to_degrees as mela_to_degrees,
mela_to_svara as mela_to_svara,
thaat_to_degrees as thaat_to_degrees,
list_mela as list_mela,
list_thaat as list_thaat,
fifths_to_note as fifths_to_note,
interval_to_fjs as interval_to_fjs,
interval_frequencies as interval_frequencies,
pythagorean_intervals as pythagorean_intervals,
plimit_intervals as plimit_intervals,
)

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Function caching"""
import os
from typing import Any, Callable, TypeVar
from joblib import Memory
from decorator import FunctionMaker
def _decorator_apply(dec, func):
    """Apply decorator ``dec`` to ``func`` while preserving its signature.

    ``FunctionMaker.create`` rebuilds a wrapper with the exact signature of
    ``func`` (unlike ``functools.wraps``), so introspection and joblib's
    argument hashing keep working on the cached function.
    """
    decorated = dec(func)
    return FunctionMaker.create(
        func,
        "return decfunc(%(shortsignature)s)",
        {"decfunc": decorated},
        __wrapped__=func,
    )
# Generic callable type variable: lets the caching decorator advertise that it
# returns a function of the same type it was given.
_F = TypeVar("_F", bound=Callable[..., Any])
class CacheManager(object):
    """Wrap ``joblib.Memory`` so that it can act as a decorator factory.

    Calling the manager with a caching level yields a decorator; the
    decorated function is actually cached only when a cache location is
    configured and the manager's own level meets or exceeds the requested
    level.  Smaller level numbers mean less caching, letting users trade
    speed against storage usage.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        # ``level`` is our own knob; all remaining arguments go to joblib.Memory
        level = kwargs.pop("level", 10)
        self.memory: Memory = Memory(*args, **kwargs)
        self.level: int = level

    def __call__(self, level: int) -> Callable[[_F], _F]:
        """
        Cache with an explicitly defined level.
        Example usage:
        @cache(level=2)
        def semi_important_function(some_arguments):
        ...
        """

        def wrapper(function):
            """Attach an input/output cache to ``function`` when enabled."""
            if self.memory.location is None or self.level < level:
                # No storage configured, or this function's level is filtered out
                return function
            return _decorator_apply(self.memory.cache, function)

        return wrapper

    def clear(self, *args: Any, **kwargs: Any) -> None:
        """Clear the cache"""
        self.memory.clear(*args, **kwargs)

    def eval(self, *args: Any, **kwargs: Any) -> Any:
        """Evaluate a function"""
        return self.memory.eval(*args, **kwargs)

    def format(self, *args: Any, **kwargs: Any) -> Any:
        """Return the formatted representation of an object"""
        return self.memory.format(*args, **kwargs)

    def reduce_size(self, *args: Any, **kwargs: Any) -> None:
        """Reduce the size of the cache"""
        self.memory.reduce_size(*args, **kwargs)  # pragma: no cover

    def warn(self, *args: Any, **kwargs: Any) -> None:
        """Raise a warning"""
        self.memory.warn(*args, **kwargs)  # pragma: no cover
# Instantiate the cache from the environment.
# LIBROSA_CACHE_DIR enables caching when set; the remaining variables tune
# joblib.Memory behavior (memory-mapping, compression, verbosity) and the
# librosa-specific caching level filter.
cache: CacheManager = CacheManager(
    os.environ.get("LIBROSA_CACHE_DIR", None),
    mmap_mode=os.environ.get("LIBROSA_CACHE_MMAP", None),
    # NOTE(review): any non-empty string here is truthy, so e.g.
    # LIBROSA_CACHE_COMPRESS=0 still enables compression — confirm intent.
    compress=os.environ.get("LIBROSA_CACHE_COMPRESS", False),
    verbose=int(os.environ.get("LIBROSA_CACHE_VERBOSE", 0)),
    level=int(os.environ.get("LIBROSA_CACHE_LEVEL", 10)),
)

View File

@@ -0,0 +1,84 @@
from __future__ import annotations
from typing import Callable, Generator, List, TypeVar, Union, Tuple, Any, Sequence
from typing_extensions import Literal, Never
import numpy as np
from numpy.typing import ArrayLike
# Any object accepted as a window specification: a scipy window name, a
# (name, *params) tuple, a scalar, a callable producing a window of a given
# length, or a pre-computed array.
_WindowSpec = Union[str, Tuple[Any, ...], float, Callable[[int], np.ndarray], ArrayLike]

# Generic element type for the container aliases below
_T = TypeVar("_T")

_IterableLike = Union[List[_T], Tuple[_T, ...], Generator[_T, None, None]]
_SequenceLike = Union[Sequence[_T], np.ndarray]
_ScalarOrSequence = Union[_T, _SequenceLike[_T]]

# The following definitions are copied from numpy/_typing/_scalars.py
# (We don't import them directly from numpy because they're an implementation detail.)
###
### START COPIED CODE
###
_CharLike_co = Union[str, bytes]

# The 6 `<X>Like_co` type-aliases below represent all scalars that can be
# coerced into `<X>` (with the casting rule `same_kind`)
_BoolLike_co = Union[bool, np.bool_]
_UIntLike_co = Union[_BoolLike_co, "np.unsignedinteger[Any]"]
_IntLike_co = Union[_BoolLike_co, int, "np.integer[Any]"]
_FloatLike_co = Union[_IntLike_co, float, "np.floating[Any]"]
_ComplexLike_co = Union[_FloatLike_co, complex, "np.complexfloating[Any, Any]"]
_TD64Like_co = Union[_IntLike_co, np.timedelta64]

_NumberLike_co = Union[int, float, complex, "np.number[Any]", np.bool_]
_ScalarLike_co = Union[
    int,
    float,
    complex,
    str,
    bytes,
    np.generic,
]

# `_VoidLike_co` is technically not a scalar, but it's close enough
_VoidLike_co = Union[Tuple[Any, ...], np.void]

# Padding modes in general (mirrors the modes accepted by np.pad)
_ModeKind = Literal[
    "constant",
    "edge",
    "linear_ramp",
    "maximum",
    "mean",
    "median",
    "minimum",
    "reflect",
    "symmetric",
    "wrap",
    "empty",
]
###
### END COPIED CODE
###

# Padding modes for head/tail padding
# These rule out padding modes that depend on the entire array
_STFTPad = Literal[
    "constant",
    "edge",
    "linear_ramp",
    "reflect",
    "symmetric",
    "empty",
]

# Either a named mode or a user-supplied padding callable
_PadMode = Union[_ModeKind, Callable[..., Any]]
_PadModeSTFT = Union[_STFTPad, Callable[..., Any]]
def _ensure_not_reachable(__arg: Never):
"""
Ensure that a code path is not reachable, like typing_extension.assert_never.
This doesn't raise an exception so that we are forced to manually
raise a more user friendly exception afterwards.
"""
...

View File

@@ -0,0 +1,699 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Beat and tempo
==============
.. autosummary::
:toctree: generated/
beat_track
plp
"""
import numpy as np
import scipy
import scipy.stats
import numba
from . import core
from . import onset
from . import util
from .feature import fourier_tempogram
from .feature import tempo as _tempo
from .util.exceptions import ParameterError
from .util.decorators import moved
from typing import Optional, Tuple, Union
from ._typing import _FloatLike_co
__all__ = ["beat_track", "tempo", "plp"]

# Deprecated alias: ``librosa.beat.tempo`` moved to ``librosa.feature.tempo``
# in 0.10.0 and is scheduled for removal in 1.0.  The ``moved`` decorator
# emits a deprecation warning on use while delegating to the new location.
tempo = moved(moved_from="librosa.beat.tempo", version="0.10.0", version_removed="1.0")(
    _tempo
)
def beat_track(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    start_bpm: float = 120.0,
    tightness: float = 100,
    trim: bool = True,
    bpm: Optional[Union[_FloatLike_co, np.ndarray]] = None,
    prior: Optional[scipy.stats.rv_continuous] = None,
    units: str = "frames",
    sparse: bool = True
) -> Tuple[Union[_FloatLike_co, np.ndarray], np.ndarray]:
    r"""Dynamic programming beat tracker.

    Beats are detected in three stages, following the method of [#]_:

      1. Measure onset strength
      2. Estimate tempo from onset correlation
      3. Pick peaks in onset strength approximately consistent with estimated
         tempo

    .. [#] Ellis, Daniel PW. "Beat tracking by dynamic programming."
           Journal of New Music Research 36.1 (2007): 51-60.
           http://labrosa.ee.columbia.edu/projects/beattrack/

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., m)] or None
        (optional) pre-computed onset strength envelope.
    hop_length : int > 0 [scalar]
        number of audio samples between successive ``onset_envelope`` values
    start_bpm : float > 0 [scalar]
        initial guess for the tempo estimator (in beats per minute)
    tightness : float [scalar]
        tightness of beat distribution around tempo
    trim : bool [scalar]
        trim leading/trailing beats with weak onsets
    bpm : float [scalar] or np.ndarray [shape=(...)]
        (optional) If provided, use ``bpm`` as the tempo instead of
        estimating it from ``onsets``.

        If multichannel, tempo estimates can be provided for all channels.

        Tempo estimates may also be time-varying, in which case the shape
        of ``bpm`` should match that of ``onset_envelope``, i.e.,
        one estimate provided for each frame.
    prior : scipy.stats.rv_continuous [optional]
        An optional prior distribution over tempo.
        If provided, ``start_bpm`` will be ignored.
    units : {'frames', 'samples', 'time'}
        The units to encode detected beat events in.
        By default, 'frames' are used.
    sparse : bool
        If ``True`` (default), detections are returned as an array of frames,
        samples, or time indices (as specified by ``units=``).

        If ``False``, detections are encoded as a dense boolean array where
        ``beats[..., n]`` is true if there's a beat at frame index ``n``.

        .. note:: multi-channel input is only supported when ``sparse=False``.

    Returns
    -------
    tempo : float [scalar, non-negative] or np.ndarray
        estimated global tempo (in beats per minute)

        If multi-channel and ``bpm`` is not provided, a separate
        tempo will be returned for each channel.

        .. note::
            By default, the tempo is returned as an ndarray even for mono input.
            In this case, the array will have a single element and be one-dimensional.
            This is to ensure consistent return types for multi-channel input.

    beats : np.ndarray
        estimated beat event locations.

        If `sparse=True` (default), beat locations are given in the specified units
        (default is frame indices).

        If `sparse=False` (required for multichannel input), beat events are
        indicated by a boolean for each frame.

        .. note::
            If no onset strength could be detected, beat_tracker estimates 0 BPM
            and returns an empty list.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided,
        or if ``units`` is not one of 'frames', 'samples', or 'time'

    See Also
    --------
    librosa.onset.onset_strength

    Examples
    --------
    Track beats using time series input

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    >>> tempo
    135.99917763157896

    Print the frames corresponding to beats

    >>> beats
    array([  3,  21,  40,  59,  78,  96, 116, 135, 154, 173, 192, 211,
           230, 249, 268, 287, 306, 325, 344, 363])

    Or print them as timestamps

    >>> librosa.frames_to_time(beats, sr=sr)
    array([0.07 , 0.488, 0.929, 1.37 , 1.811, 2.229, 2.694, 3.135,
           3.576, 4.017, 4.458, 4.899, 5.341, 5.782, 6.223, 6.664,
           7.105, 7.546, 7.988, 8.429])

    Output beat detections as a boolean array instead of frame indices

    >>> tempo, beats_dense = librosa.beat.beat_track(y=y, sr=sr, sparse=False)
    >>> beats_dense
    array([False, False, False,  True, False, False, False, False,
           False, False, False, False, False, False, False, False,
           False, False, False, False, ..., False, False,  True,
           False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False,
           False])

    Track beats using a pre-computed onset envelope

    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr,
    ...                                          aggregate=np.median)
    >>> tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env,
    ...                                        sr=sr)
    >>> tempo
    135.99917763157896
    >>> beats
    array([  3,  21,  40,  59,  78,  96, 116, 135, 154, 173, 192, 211,
           230, 249, 268, 287, 306, 325, 344, 363])

    Plot the beat events against the onset strength envelope

    >>> import matplotlib.pyplot as plt
    >>> hop_length = 512
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> times = librosa.times_like(onset_env, sr=sr, hop_length=hop_length)
    >>> M = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
    >>> librosa.display.specshow(librosa.power_to_db(M, ref=np.max),
    ...                          y_axis='mel', x_axis='time', hop_length=hop_length,
    ...                          ax=ax[0])
    >>> ax[0].label_outer()
    >>> ax[0].set(title='Mel spectrogram')
    >>> ax[1].plot(times, librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[1].vlines(times[beats], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='Beats')
    >>> ax[1].legend()
    """
    # First, get the frame->beat strength profile if we don't already have one
    if onset_envelope is None:
        if y is None:
            raise ParameterError("y or onset_envelope must be provided")

        onset_envelope = onset.onset_strength(
            y=y, sr=sr, hop_length=hop_length, aggregate=np.median
        )

    # Sparse (index-based) output cannot represent beats for multiple channels
    if sparse and onset_envelope.ndim != 1:
        raise ParameterError(f"sparse=True (default) does not support "
                             f"{onset_envelope.ndim}-dimensional inputs. "
                             f"Either set sparse=False or convert the signal to mono.")

    # Do we have any onsets to grab?
    if not onset_envelope.any():
        # No onset energy at all: report 0 BPM and no beat events
        if sparse:
            return (0.0, np.array([], dtype=int))
        else:
            return (np.zeros(shape=onset_envelope.shape[:-1], dtype=float),
                    np.zeros_like(onset_envelope, dtype=bool))

    # Estimate BPM if one was not provided
    if bpm is None:
        bpm = _tempo(
            onset_envelope=onset_envelope,
            sr=sr,
            hop_length=hop_length,
            start_bpm=start_bpm,
            prior=prior,
        )

    # Ensure that tempo is in a shape that is compatible with vectorization:
    # broadcast the estimate(s) against the onset envelope dimensions
    _bpm = np.atleast_1d(bpm)
    bpm_expanded = util.expand_to(_bpm,
                                  ndim=onset_envelope.ndim,
                                  axes=range(_bpm.ndim))

    # Then, run the tracker; beats come back as a dense boolean array
    beats = __beat_tracker(onset_envelope, bpm_expanded, float(sr) / hop_length, tightness, trim)

    if sparse:
        beats = np.flatnonzero(beats)
        # Convert beat frame indices to the requested output units.
        # NOTE(review): unit conversion applies only to sparse output;
        # dense output is always frame-aligned.
        if units == "frames":
            pass
        elif units == "samples":
            return (bpm, core.frames_to_samples(beats, hop_length=hop_length))
        elif units == "time":
            return (bpm, core.frames_to_time(beats, hop_length=hop_length, sr=sr))
        else:
            raise ParameterError(f"Invalid unit type: {units}")

    return (bpm, beats)
def plp(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    tempo_min: Optional[float] = 30,
    tempo_max: Optional[float] = 300,
    prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
    """Predominant local pulse (PLP) estimation. [#]_

    The PLP method analyzes the onset strength envelope in the frequency domain
    to find a locally stable tempo for each frame. These local periodicities
    are used to synthesize local half-waves, which are combined such that peaks
    coincide with rhythmically salient frames (e.g. onset events on a musical time grid).
    The local maxima of the pulse curve can be taken as estimated beat positions.

    This method may be preferred over the dynamic programming method of `beat_track`
    when the tempo is expected to vary significantly over time. Additionally,
    since `plp` does not require the entire signal to make predictions, it may be
    preferable when beat-tracking long recordings in a streaming setting.

    .. [#] Grosche, P., & Muller, M. (2011).
        "Extracting predominant local pulse information from music recordings."
        IEEE Transactions on Audio, Speech, and Language Processing, 19(6), 1688-1701.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(..., n)] or None
        (optional) pre-computed onset strength envelope
    hop_length : int > 0 [scalar]
        number of audio samples between successive ``onset_envelope`` values
    win_length : int > 0 [scalar]
        number of frames to use for tempogram analysis.
        By default, 384 frames (at ``sr=22050`` and ``hop_length=512``) corresponds
        to about 8.9 seconds.
    tempo_min, tempo_max : numbers > 0 [scalar], optional
        Minimum and maximum permissible tempo values.  ``tempo_max`` must be at least
        ``tempo_min``.

        Set either (or both) to `None` to disable this constraint.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a uniform prior over ``[tempo_min, tempo_max]`` is used.

    Returns
    -------
    pulse : np.ndarray, shape=[(..., n)]
        The estimated pulse curve.  Maxima correspond to rhythmically salient
        points of time.

        If input is multi-channel, one pulse curve per channel is computed.

    See Also
    --------
    beat_track
    librosa.onset.onset_strength
    librosa.feature.fourier_tempogram

    Examples
    --------
    Visualize the PLP compared to an onset strength envelope.
    Both are normalized here to make comparison easier.

    >>> y, sr = librosa.load(librosa.ex('brahms'))
    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr)
    >>> # Or compute pulse with an alternate prior, like log-normal
    >>> import scipy.stats
    >>> prior = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
    >>> pulse_lognorm = librosa.beat.plp(onset_envelope=onset_env, sr=sr,
    ...                                  prior=prior)
    >>> melspec = librosa.feature.melspectrogram(y=y, sr=sr)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> librosa.display.specshow(librosa.power_to_db(melspec,
    ...                                              ref=np.max),
    ...                          x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='Mel spectrogram')
    >>> ax[0].label_outer()
    >>> ax[1].plot(librosa.times_like(onset_env),
    ...          librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[1].plot(librosa.times_like(pulse),
    ...          librosa.util.normalize(pulse),
    ...          label='Predominant local pulse (PLP)')
    >>> ax[1].set(title='Uniform tempo prior [30, 300]')
    >>> ax[1].label_outer()
    >>> ax[2].plot(librosa.times_like(onset_env),
    ...          librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[2].plot(librosa.times_like(pulse_lognorm),
    ...          librosa.util.normalize(pulse_lognorm),
    ...          label='Predominant local pulse (PLP)')
    >>> ax[2].set(title='Log-normal tempo prior, mean=120', xlim=[5, 20])
    >>> ax[2].legend()

    PLP local maxima can be used as estimates of beat positions.

    >>> tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env)
    >>> beats_plp = np.flatnonzero(librosa.util.localmax(pulse))
    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> times = librosa.times_like(onset_env, sr=sr)
    >>> ax[0].plot(times, librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[0].vlines(times[beats], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='Beats')
    >>> ax[0].legend()
    >>> ax[0].set(title='librosa.beat.beat_track')
    >>> ax[0].label_outer()
    >>> # Limit the plot to a 15-second window
    >>> times = librosa.times_like(pulse, sr=sr)
    >>> ax[1].plot(times, librosa.util.normalize(pulse),
    ...          label='PLP')
    >>> ax[1].vlines(times[beats_plp], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='PLP Beats')
    >>> ax[1].legend()
    >>> ax[1].set(title='librosa.beat.plp', xlim=[5, 20])
    >>> ax[1].xaxis.set_major_formatter(librosa.display.TimeFormatter())
    """
    # Step 1: get the onset envelope
    if onset_envelope is None:
        onset_envelope = onset.onset_strength(
            y=y, sr=sr, hop_length=hop_length, aggregate=np.median
        )

    if tempo_min is not None and tempo_max is not None and tempo_max <= tempo_min:
        raise ParameterError(
            f"tempo_max={tempo_max} must be larger than tempo_min={tempo_min}"
        )

    # Step 2: get the fourier tempogram
    ftgram = fourier_tempogram(
        onset_envelope=onset_envelope,
        sr=sr,
        hop_length=hop_length,
        win_length=win_length,
    )

    # Step 3: pin to the feasible tempo range by zeroing out
    # tempogram bins outside [tempo_min, tempo_max]
    tempo_frequencies = core.fourier_tempo_frequencies(
        sr=sr, hop_length=hop_length, win_length=win_length
    )

    if tempo_min is not None:
        ftgram[..., tempo_frequencies < tempo_min, :] = 0
    if tempo_max is not None:
        ftgram[..., tempo_frequencies > tempo_max, :] = 0

    # reshape lengths to match dimension properly
    tempo_frequencies = util.expand_to(tempo_frequencies, ndim=ftgram.ndim, axes=-2)

    # Step 4: Discard everything below the peak
    # (log1p compression keeps the magnitude comparison numerically tame)
    ftmag = np.log1p(1e6 * np.abs(ftgram))
    if prior is not None:
        ftmag += prior.logpdf(tempo_frequencies)

    peak_values = ftmag.max(axis=-2, keepdims=True)
    ftgram[ftmag < peak_values] = 0

    # Normalize to keep only phase information
    ftgram /= util.tiny(ftgram) ** 0.5 + np.abs(ftgram.max(axis=-2, keepdims=True))

    # Step 5: invert the Fourier tempogram to get the pulse
    pulse = core.istft(
        ftgram, hop_length=1, n_fft=win_length, length=onset_envelope.shape[-1]
    )

    # Step 6: retain only the positive part of the pulse cycle
    # (in-place clip via the third positional argument)
    pulse = np.clip(pulse, 0, None, pulse)

    # Return the normalized pulse
    return util.normalize(pulse, axis=-1)
def __beat_tracker(
    onset_envelope: np.ndarray, bpm: np.ndarray, frame_rate: float, tightness: float, trim: bool
) -> np.ndarray:
    """Track beats in an onset strength envelope via dynamic programming.

    Parameters
    ----------
    onset_envelope : np.ndarray [shape=(..., n,)]
        onset strength envelope
    bpm : float [scalar] or np.ndarray [shape=(...)]
        tempo estimate; the trailing dimension must be either 1 (static
        tempo) or match ``onset_envelope`` (time-varying tempo)
    frame_rate : float [scalar]
        frame rate of the spectrogram (sr / hop_length, frames per second)
    tightness : float [scalar, positive]
        how closely do we adhere to bpm?
    trim : bool [scalar]
        trim leading/trailing beats with weak onsets?

    Returns
    -------
    beats : np.ndarray [shape=(n,)]
        boolean array marking frames that contain beat events

    Raises
    ------
    ParameterError
        if ``bpm`` is non-positive, ``tightness`` is non-positive,
        or ``bpm``'s shape is incompatible with ``onset_envelope``
    """
    if np.any(bpm <= 0):
        raise ParameterError(f"bpm={bpm} must be strictly positive")

    if tightness <= 0:
        raise ParameterError("tightness must be strictly positive")

    # TODO: this might be better accomplished with a np.broadcast_shapes check
    if bpm.shape[-1] not in (1, onset_envelope.shape[-1]):
        raise ParameterError(f"Invalid bpm shape={bpm.shape} does not match onset envelope shape={onset_envelope.shape}")

    # convert bpm to frames per beat (rounded)
    # [frames / sec] * [60 sec / min] / [beat / min] = [frames / beat]
    frames_per_beat = np.round(frame_rate * 60.0 / bpm)

    # localscore is a smoothed version of AGC'd onset envelope
    localscore = __beat_local_score(__normalize_onsets(onset_envelope), frames_per_beat)

    # run the DP to get per-frame backlinks and cumulative scores
    backlink, cumscore = __beat_track_dp(localscore, frames_per_beat, tightness)

    # Reconstruct the beat path from backlinks, starting at the last beat
    tail = __last_beat(cumscore)
    beats = np.zeros_like(onset_envelope, dtype=bool)
    __dp_backtrack(backlink, tail, beats)

    # Discard spurious trailing beats
    beats: np.ndarray = __trim_beats(localscore, beats, trim)
    return beats
# -- Helper functions for beat tracking
def __normalize_onsets(onsets):
    """Rescale an onset envelope by its sample standard deviation.

    A tiny constant is added to the denominator to avoid division by zero
    on constant (zero-variance) envelopes.
    """
    scale = onsets.std(ddof=1, axis=-1, keepdims=True)
    return onsets / (scale + util.tiny(onsets))
@numba.guvectorize(
    [
        "void(float32[:], float32[:], float32[:])",
        "void(float64[:], float64[:], float64[:])",
    ],
    "(t),(n)->(t)",
    nopython=True, cache=False)
def __beat_local_score(onset_envelope, frames_per_beat, localscore):
    """Smooth the onset envelope with a tempo-dependent Gaussian window.

    This implements a same-mode convolution of ``onset_envelope`` with a
    Gaussian window whose width is derived from ``frames_per_beat``, and
    also supports a time-varying window (one tempo estimate per frame) to
    handle dynamic tempo.

    Parameters
    ----------
    onset_envelope : np.ndarray [shape=(t,)]
        normalized onset strength envelope
    frames_per_beat : np.ndarray [shape=(n,)]
        frames per beat; either a single value (static tempo) or one value
        per envelope frame (time-varying tempo)
    localscore : np.ndarray [shape=(t,)] (output)
        smoothed local beat strength
    """
    N = len(onset_envelope)
    if len(frames_per_beat) == 1:
        # Static tempo mode
        # NOTE: when we can bump the minimum numba to 0.58, we can eliminate this branch and just use
        # np.convolve(..., mode='same') directly
        window = np.exp(-0.5 * (np.arange(-frames_per_beat[0], frames_per_beat[0] + 1) * 32.0 / frames_per_beat[0]) ** 2)
        K = len(window)
        # This is a vanilla same-mode convolution
        for i in range(N):
            localscore[i] = 0.0
            # Valid window taps satisfy 0 <= i + K // 2 - k < N, i.e.
            #   k > i + K // 2 - N   and   k <= i + K // 2
            # so the exclusive upper bound for range() is i + K // 2 + 1.
            # (The previous bound of i + K // 2 dropped the tap touching
            # onset_envelope[0] near the start of the signal.)
            for k in range(max(0, i + K // 2 - N + 1), min(i + K // 2 + 1, K)):
                localscore[i] += window[k] * onset_envelope[i + K // 2 - k]
    elif len(frames_per_beat) == len(onset_envelope):
        # Time-varying tempo estimates
        # This isn't exactly a convolution anymore, since the filter is time-varying,
        # but the index arithmetic is identical with a per-frame window.
        for i in range(N):
            window = np.exp(-0.5 * (np.arange(-frames_per_beat[i], frames_per_beat[i] + 1) * 32.0 / frames_per_beat[i]) ** 2)
            K = 2 * int(frames_per_beat[i]) + 1
            localscore[i] = 0.0
            for k in range(max(0, i + K // 2 - N + 1), min(i + K // 2 + 1, K)):
                localscore[i] += window[k] * onset_envelope[i + K // 2 - k]
@numba.guvectorize(
    [
        "void(float32[:], float32[:], float32, int32[:], float32[:])",
        "void(float64[:], float64[:], float32, int32[:], float64[:])",
    ],
    "(t),(n),()->(t),(t)",
    nopython=True, cache=True)
def __beat_track_dp(localscore, frames_per_beat, tightness, backlink, cumscore):
    """Core dynamic program for beat tracking.

    For each frame, find the best preceding beat position (within roughly
    half to two beat periods back) by maximizing the cumulative score minus
    a log-squared penalty for deviating from the expected beat spacing.

    Outputs (guvectorize):
    backlink -- index of the best predecessor beat for each frame (-1 if none)
    cumscore -- best cumulative score ending with a beat at each frame
    """
    # Threshold for the first beat to exceed
    score_thresh = 0.01 * localscore.max()

    # Are we on the first beat?
    first_beat = True
    backlink[0] = -1
    cumscore[0] = localscore[0]

    # If tv == 0, then tv * i will always be 0, so we only ever use frames_per_beat[0]
    # If tv == 1, then tv * i = i, so we use the time-varying FPB
    tv = int(len(frames_per_beat) > 1)

    for i, score_i in enumerate(localscore):
        best_score = - np.inf
        beat_location = -1
        # Search over all possible predecessors to find the best preceding beat
        # NOTE: to provide time-varying tempo estimates, we replace
        # frames_per_beat[0] by frames_per_beat[i] in this loop body.
        for loc in range(i - np.round(frames_per_beat[tv * i] / 2), i - 2 * frames_per_beat[tv * i] - 1, - 1):
            # Once we're searching past the start, break out
            if loc < 0:
                break

            # Penalize deviation of (i - loc) from the expected beat period
            score = cumscore[loc] - tightness * (np.log(i - loc) - np.log(frames_per_beat[tv * i]))**2
            if score > best_score:
                best_score = score
                beat_location = loc

        # Add the local score
        if beat_location >= 0:
            cumscore[i] = score_i + best_score
        else:
            # No back-link found, so just use the current score
            cumscore[i] = score_i

        # Special case the first onset. Stop if the localscore is small
        if first_beat and score_i < score_thresh:
            backlink[i] = -1
        else:
            backlink[i] = beat_location
            first_beat = False
@numba.guvectorize(
    [
        "void(float32[:], bool_[:], bool_, bool_[:])",
        "void(float64[:], bool_[:], bool_, bool_[:])"
    ],
    "(t),(t),()->(t)",
    nopython=True, cache=True
)
def __trim_beats(localscore, beats, trim, beats_trimmed):
    """Remove spurious leading and trailing beats from the detection array"""
    # Populate the trimmed beats array with the existing values
    beats_trimmed[:] = beats

    # Compute the threshold: 1/2 RMS of the smoothed beat envelope
    w = np.hanning(5)

    # Slicing here to implement same-mode convolution in older numba where
    # mode='same' is not yet supported
    smooth_boe = np.convolve(localscore[beats], w)[len(w)//2:len(localscore)+len(w)//2]

    # This logic is to preserve old behavior and always discard beats detected with oenv==0
    if trim:
        threshold = 0.5 * ((smooth_boe**2).mean()**0.5)
    else:
        threshold = 0.0

    # Suppress bad beats: clear weak detections inward from both ends
    # NOTE(review): these loops assume at least one frame exceeds the
    # threshold; otherwise the index runs out of bounds — confirm callers
    # guarantee a non-trivial local score.
    n = 0
    while localscore[n] <= threshold:
        beats_trimmed[n] = False
        n += 1

    n = len(localscore) - 1
    while localscore[n] <= threshold:
        beats_trimmed[n] = False
        n -= 1
    pass
def __last_beat(cumscore):
    """Locate the final beat frame for each channel of ``cumscore``.

    A frame qualifies if it is a local maximum of the cumulative score and
    its value reaches at least half the median score over all local maxima.
    """
    # Mask out non-peaks; masked-array semantics want True where data is invalid
    non_peak = ~util.localmax(cumscore, axis=-1)
    peak_scores = np.ma.masked_array(data=cumscore, mask=non_peak)  # type: ignore

    # Threshold at half the median peak score (per channel)
    thresholds = 0.5 * np.ma.getdata(np.ma.median(peak_scores, axis=-1))

    # Vectorized scan from the end for the last qualifying frame
    tail = np.empty(shape=cumscore.shape[:-1], dtype=int)
    __last_beat_selector(cumscore, non_peak, thresholds, tail)
    return tail
@numba.guvectorize(
    [
        "void(float32[:], bool_[:], float32, int64[:])",
        "void(float64[:], bool_[:], float64, int64[:])",
    ],
    "(t),(t),()->()",
    nopython=True, cache=True
)
def __last_beat_selector(cumscore, mask, threshold, out):
    """Vectorized helper to identify the last valid beat position:
    cumscore[n] > threshold and not mask[n]
    """
    # Scan backward from the end; the first unmasked frame whose score
    # reaches the threshold wins.  If none qualifies, out[0] defaults to
    # the final frame index (its initialization value below).
    n = len(cumscore) - 1
    out[0] = n
    while n >= 0:
        if not mask[n] and cumscore[n] >= threshold:
            out[0] = n
            break
        else:
            n -= 1
@numba.guvectorize(
    [
        "void(int32[:], int32, bool_[:])",
        "void(int64[:], int64, bool_[:])"
    ],
    "(t),()->(t)",
    nopython=True, cache=True
)
def __dp_backtrack(backlinks, tail, beats):
    """Populate the beat indicator array from a sequence of backlinks"""
    # Walk backward from the last beat; each backlink points at the
    # preceding beat frame, and a value of -1 terminates the chain.
    # NOTE(review): only True entries are written here — the caller is
    # expected to pass in a pre-zeroed output array.
    n = tail
    while n >= 0:
        beats[n] = True
        n = backlinks[n]

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Core IO and DSP functions"""
import lazy_loader as lazy

# Defer all submodule imports until first attribute access; the public
# names themselves are declared in the adjacent ``.pyi`` stub file.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)

View File

@@ -0,0 +1,124 @@
from .convert import (
frames_to_samples as frames_to_samples,
frames_to_time as frames_to_time,
samples_to_frames as samples_to_frames,
samples_to_time as samples_to_time,
time_to_samples as time_to_samples,
time_to_frames as time_to_frames,
blocks_to_samples as blocks_to_samples,
blocks_to_frames as blocks_to_frames,
blocks_to_time as blocks_to_time,
note_to_hz as note_to_hz,
note_to_midi as note_to_midi,
midi_to_hz as midi_to_hz,
midi_to_note as midi_to_note,
hz_to_note as hz_to_note,
hz_to_midi as hz_to_midi,
hz_to_mel as hz_to_mel,
hz_to_octs as hz_to_octs,
hz_to_fjs as hz_to_fjs,
mel_to_hz as mel_to_hz,
octs_to_hz as octs_to_hz,
A4_to_tuning as A4_to_tuning,
tuning_to_A4 as tuning_to_A4,
fft_frequencies as fft_frequencies,
cqt_frequencies as cqt_frequencies,
mel_frequencies as mel_frequencies,
tempo_frequencies as tempo_frequencies,
fourier_tempo_frequencies as fourier_tempo_frequencies,
A_weighting as A_weighting,
B_weighting as B_weighting,
C_weighting as C_weighting,
D_weighting as D_weighting,
Z_weighting as Z_weighting,
frequency_weighting as frequency_weighting,
multi_frequency_weighting as multi_frequency_weighting,
samples_like as samples_like,
times_like as times_like,
midi_to_svara_h as midi_to_svara_h,
midi_to_svara_c as midi_to_svara_c,
note_to_svara_h as note_to_svara_h,
note_to_svara_c as note_to_svara_c,
hz_to_svara_h as hz_to_svara_h,
hz_to_svara_c as hz_to_svara_c,
)
from .audio import (
load as load,
stream as stream,
to_mono as to_mono,
resample as resample,
get_duration as get_duration,
get_samplerate as get_samplerate,
autocorrelate as autocorrelate,
lpc as lpc,
zero_crossings as zero_crossings,
clicks as clicks,
tone as tone,
chirp as chirp,
mu_compress as mu_compress,
mu_expand as mu_expand,
)
from .spectrum import (
stft as stft,
istft as istft,
magphase as magphase,
iirt as iirt,
reassigned_spectrogram as reassigned_spectrogram,
phase_vocoder as phase_vocoder,
perceptual_weighting as perceptual_weighting,
power_to_db as power_to_db,
db_to_power as db_to_power,
amplitude_to_db as amplitude_to_db,
db_to_amplitude as db_to_amplitude,
fmt as fmt,
pcen as pcen,
griffinlim as griffinlim,
)
from .pitch import (
estimate_tuning as estimate_tuning,
pitch_tuning as pitch_tuning,
piptrack as piptrack,
yin as yin,
pyin as pyin,
)
from .constantq import (
cqt as cqt,
hybrid_cqt as hybrid_cqt,
pseudo_cqt as pseudo_cqt,
icqt as icqt,
griffinlim_cqt as griffinlim_cqt,
vqt as vqt,
)
from .harmonic import (
salience as salience,
interp_harmonics as interp_harmonics,
f0_harmonics as f0_harmonics,
)
from .fft import (
get_fftlib as get_fftlib,
set_fftlib as set_fftlib,
)
from .notation import (
key_to_degrees as key_to_degrees,
key_to_notes as key_to_notes,
mela_to_degrees as mela_to_degrees,
mela_to_svara as mela_to_svara,
thaat_to_degrees as thaat_to_degrees,
list_mela as list_mela,
list_thaat as list_thaat,
fifths_to_note as fifths_to_note,
interval_to_fjs as interval_to_fjs,
)
from .intervals import (
interval_frequencies as interval_frequencies,
pythagorean_intervals as pythagorean_intervals,
plimit_intervals as plimit_intervals,
)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fast Fourier Transform (FFT) library container"""
import scipy.fft
from types import ModuleType
from typing import Optional
from ..util.decorators import deprecated
__all__ = ["get_fftlib", "set_fftlib"]

# Module-level holder for the active FFT backend.  Defaults to scipy.fft
# and may be swapped out via the (deprecated) set_fftlib().
__FFTLIB: Optional[ModuleType] = scipy.fft
@deprecated(version="0.11.0", version_removed="1.0")
def set_fftlib(lib: Optional[ModuleType] = None) -> None:
    """Set the FFT library used by librosa.

    .. warning:: This functionality is deprecated in librosa 0.11 and will be
        removed in 1.0.  To achieve the same effect, use either the
        `scipy.fft.set_backend` context manager or
        `scipy.fft.set_global_backend` function.

    Parameters
    ----------
    lib : None or module
        Must implement an interface compatible with `scipy.fft`.
        If ``None``, reverts to `scipy.fft`.

    Examples
    --------
    Use `pyfftw`:

    >>> import pyfftw
    >>> librosa.set_fftlib(pyfftw.interfaces.numpy_fft)

    Reset to default `scipy` implementation

    >>> librosa.set_fftlib()
    """
    global __FFTLIB
    # A None argument is a request to restore the default scipy backend
    __FFTLIB = scipy.fft if lib is None else lib
def get_fftlib() -> ModuleType:
    """Get the FFT library currently used by librosa.

    Returns
    -------
    fft : module
        The FFT library currently used by librosa.
        Must be API-compatible with `numpy.fft`.
    """
    # __FFTLIB is initialized at import time and set_fftlib() never stores
    # None, so this assertion can only trip on internal misuse.
    assert __FFTLIB is not None  # pragma: no cover
    return __FFTLIB

View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Harmonic calculations for frequency representations"""
import warnings
import numpy as np
import scipy.interpolate
import scipy.signal
from ..util.exceptions import ParameterError
from ..util import is_unique
from numpy.typing import ArrayLike
from typing import Callable, Optional, Sequence
__all__ = ["salience", "interp_harmonics", "f0_harmonics"]
def salience(
    S: np.ndarray,
    *,
    freqs: np.ndarray,
    harmonics: Sequence[float],
    weights: Optional[ArrayLike] = None,
    aggregate: Optional[Callable] = None,
    filter_peaks: bool = True,
    fill_value: float = np.nan,
    kind: str = "linear",
    axis: int = -2,
) -> np.ndarray:
    """Compute a harmonic salience function by aggregating energy over harmonics.

    Parameters
    ----------
    S : np.ndarray [shape=(..., d, n)]
        input time frequency magnitude representation (e.g. STFT or CQT magnitudes).
        Must be real-valued and non-negative.
    freqs : np.ndarray, shape=(S.shape[axis]) or shape=S.shape
        The frequency values corresponding to S's elements along the
        chosen axis.

        Frequencies can also be time-varying, e.g. as computed by
        `reassigned_spectrogram`, in which case the shape should
        match ``S``.
    harmonics : list-like, non-negative
        Harmonics to include in salience computation.  The first harmonic (1)
        corresponds to ``S`` itself.  Values less than one (e.g., 1/2) correspond
        to sub-harmonics.
    weights : list-like
        The weight to apply to each harmonic in the summation. (default:
        uniform weights). Must be the same length as ``harmonics``.
    aggregate : function
        aggregation function (default: `np.average`)

        If ``aggregate=np.average``, then a weighted average is
        computed per-harmonic according to the specified weights.
        For all other aggregation functions, all harmonics
        are treated equally.
    filter_peaks : bool
        If true, returns harmonic summation only on frequencies of peak
        magnitude.  Otherwise returns harmonic summation over the full spectrum.
        Defaults to True.
    fill_value : float
        The value to fill non-peaks in the output representation. (default:
        `np.nan`) Only used if ``filter_peaks == True``.
    kind : str
        Interpolation type for harmonic estimation.
        See `scipy.interpolate.interp1d`.
    axis : int
        The axis along which to compute harmonics

    Returns
    -------
    S_sal : np.ndarray
        ``S_sal`` will have the same shape as ``S``, and measure
        the overall harmonic energy at each frequency.

    See Also
    --------
    interp_harmonics

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> S = np.abs(librosa.stft(y))
    >>> freqs = librosa.fft_frequencies(sr=sr)
    >>> harms = [1, 2, 3, 4]
    >>> weights = [1.0, 0.5, 0.33, 0.25]
    >>> S_sal = librosa.salience(S, freqs=freqs, harmonics=harms, weights=weights, fill_value=0)
    >>> print(S_sal.shape)
    (1025, 115)
    """
    # Default aggregation is a (weighted) average over harmonics
    agg = np.average if aggregate is None else aggregate

    if weights is None:
        harmonic_weights = np.ones(len(harmonics))
    else:
        harmonic_weights = np.array(weights, dtype=float)

    # Sample the spectrum at every requested harmonic.  This inserts a
    # new harmonic axis immediately before ``axis``.
    S_harm = interp_harmonics(S, freqs=freqs, harmonics=harmonics, kind=kind, axis=axis)

    S_sal: np.ndarray
    if agg is np.average:
        # np.average is the only aggregator that honors the weights
        S_sal = agg(S_harm, axis=axis - 1, weights=harmonic_weights)
    else:
        S_sal = agg(S_harm, axis=axis - 1)

    if filter_peaks:
        # Keep salience only at local maxima of the input spectrum;
        # everything else is replaced by fill_value.
        # dtype=float matches the previous np.empty-based allocation.
        peak_idx = scipy.signal.argrelmax(S, axis=axis)
        S_out = np.full(S.shape, fill_value, dtype=float)
        S_out[peak_idx] = S_sal[peak_idx]
        S_sal = S_out

    return S_sal
def interp_harmonics(
    x: np.ndarray,
    *,
    freqs: np.ndarray,
    harmonics: ArrayLike,
    kind: str = "linear",
    fill_value: float = 0,
    axis: int = -2,
) -> np.ndarray:
    """Compute the energy at harmonics of time-frequency representation.

    Given a frequency-based energy representation such as a spectrogram
    or tempogram, this function computes the energy at the chosen harmonics
    of the frequency axis.  (See examples below.)
    The resulting harmonic array can then be used as input to a salience
    computation.

    Parameters
    ----------
    x : np.ndarray
        The input energy
    freqs : np.ndarray, shape=(x.shape[axis]) or shape=x.shape
        The frequency values corresponding to x's elements along the
        chosen axis.

        Frequencies can also be time-varying, e.g. as computed by
        `reassigned_spectrogram`, in which case the shape should
        match ``x``.
    harmonics : list-like, non-negative
        Harmonics to compute as ``harmonics[i] * freqs``.
        The first harmonic (1) corresponds to ``freqs``.
        Values less than one (e.g., 1/2) correspond to sub-harmonics.
    kind : str
        Interpolation type.  See `scipy.interpolate.interp1d`.
    fill_value : float
        The value to fill when extrapolating beyond the observed
        frequency range.
    axis : int
        The axis along which to compute harmonics

    Returns
    -------
    x_harm : np.ndarray
        ``x_harm[i]`` will have the same shape as ``x``, and measure
        the energy at the ``harmonics[i]`` harmonic of each frequency.
        A new dimension indexing harmonics will be inserted immediately
        before ``axis``.

    See Also
    --------
    scipy.interpolate.interp1d

    Examples
    --------
    Estimate the harmonics of a time-averaged tempogram

    >>> y, sr = librosa.load(librosa.ex('sweetwaltz'))
    >>> # Compute the time-varying tempogram and average over time
    >>> tempi = np.mean(librosa.feature.tempogram(y=y, sr=sr), axis=1)
    >>> # We'll measure the first five harmonics
    >>> harmonics = [1, 2, 3, 4, 5]
    >>> f_tempo = librosa.tempo_frequencies(len(tempi), sr=sr)
    >>> # Build the harmonic tensor; we only have one axis here (tempo)
    >>> t_harmonics = librosa.interp_harmonics(tempi, freqs=f_tempo, harmonics=harmonics, axis=0)
    >>> print(t_harmonics.shape)
    (5, 384)

    We can also compute frequency harmonics for spectrograms.
    To calculate sub-harmonic energy, use values < 1.

    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> harmonics = [1./3, 1./2, 1, 2, 3, 4]
    >>> S = np.abs(librosa.stft(y))
    >>> fft_freqs = librosa.fft_frequencies(sr=sr)
    >>> S_harm = librosa.interp_harmonics(S, freqs=fft_freqs, harmonics=harmonics, axis=0)
    >>> print(S_harm.shape)
    (6, 1025, 646)
    """
    if freqs.ndim == 1 and len(freqs) == x.shape[axis]:
        # Build the 1-D interpolator.
        # All frames have a common domain, so we only need one interpolator here.
        # First, verify that the input frequencies are unique
        if not is_unique(freqs, axis=0):
            warnings.warn(
                "Frequencies are not unique. This may produce incorrect harmonic interpolations.",
                stacklevel=2,
            )
        f_interp = scipy.interpolate.interp1d(
            freqs,
            x,
            axis=axis,
            bounds_error=False,
            copy=False,
            kind=kind,
            fill_value=fill_value,
        )
        # Set the interpolation points: one grid per harmonic,
        # shape (len(harmonics), len(freqs))
        f_out = np.multiply.outer(harmonics, freqs)
        # Interpolate; suppress type checks
        return f_interp(f_out)  # type: ignore
    elif freqs.shape == x.shape:
        if not np.all(is_unique(freqs, axis=axis)):
            warnings.warn(
                "Frequencies are not unique. This may produce incorrect harmonic interpolations.",
                stacklevel=2,
            )
        # If we have time-varying frequencies, then it must match exactly the shape of the input
        # We'll define a frame-wise interpolator helper function that we will vectorize over
        # the entire input array
        def _f_interp(_a, _b):
            # _a: per-frame frequency grid, _b: per-frame energies
            interp = scipy.interpolate.interp1d(
                _a, _b, bounds_error=False, copy=False, kind=kind, fill_value=fill_value
            )
            # Evaluate at every harmonic of the frame's own frequencies;
            # outer product has shape (f, h)
            return interp(np.multiply.outer(_a, harmonics))

        # Signature is expanding frequency into a new dimension
        xfunc = np.vectorize(_f_interp, signature="(f),(f)->(f,h)")
        # Rotate the vectorizing axis to the tail so that we get parallelism over frames
        # Afterward, we're swapping (-1, axis-1) instead of (-1,axis)
        # because a new dimension has been inserted
        return (  # type: ignore
            xfunc(freqs.swapaxes(axis, -1), x.swapaxes(axis, -1))
            .swapaxes(
                # Return the original target axis to its place
                -2,
                axis,
            )
            .swapaxes(
                # Put the new harmonic axis directly in front of the target axis
                -1,
                axis - 1,
            )
        )
    else:
        # freqs is neither a 1-D grid matching the target axis nor a
        # full time-varying grid matching x
        raise ParameterError(
            f"freqs.shape={freqs.shape} is incompatible with input shape={x.shape}"
        )
def f0_harmonics(
    x: np.ndarray,
    *,
    f0: np.ndarray,
    freqs: np.ndarray,
    harmonics: ArrayLike,
    kind: str = "linear",
    fill_value: float = 0,
    axis: int = -2,
) -> np.ndarray:
    """Compute the energy at selected harmonics of a time-varying
    fundamental frequency.

    This function can be used to reduce a `frequency * time` representation
    to a `harmonic * time` representation, effectively normalizing out for
    the fundamental frequency.  The result can be used as a representation
    of timbre when f0 corresponds to pitch, or as a representation of
    rhythm when f0 corresponds to tempo.

    This function differs from `interp_harmonics`, which computes the
    harmonics of *all* frequencies.

    Parameters
    ----------
    x : np.ndarray [shape=(..., frequencies, n)]
        The input array (e.g., STFT magnitudes)
    f0 : np.ndarray [shape=(..., n)]
        The fundamental frequency (f0) of each frame in the input
        Shape should match ``x.shape[-1]``
    freqs : np.ndarray, shape=(x.shape[axis]) or shape=x.shape
        The frequency values corresponding to X's elements along the
        chosen axis.

        Frequencies can also be time-varying, e.g. as computed by
        `reassigned_spectrogram`, in which case the shape should
        match ``x``.
    harmonics : list-like, non-negative
        Harmonics to compute as ``harmonics[i] * f0``
        Values less than one (e.g., 1/2) correspond to sub-harmonics.
    kind : str
        Interpolation type.  See `scipy.interpolate.interp1d`.
    fill_value : float
        The value to fill when extrapolating beyond the observed
        frequency range.
    axis : int
        The axis corresponding to frequency in ``x``

    Returns
    -------
    f0_harm : np.ndarray [shape=(..., len(harmonics), n)]
        Interpolated energy at each specified harmonic of the fundamental
        frequency for each time step.

    See Also
    --------
    interp_harmonics
    librosa.feature.tempogram_ratio

    Examples
    --------
    This example estimates the fundamental (f0), and then extracts the first
    12 harmonics

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> f0, voicing, voicing_p = librosa.pyin(y=y, sr=sr, fmin=200, fmax=700)
    >>> S = np.abs(librosa.stft(y))
    >>> freqs = librosa.fft_frequencies(sr=sr)
    >>> harmonics = np.arange(1, 13)
    >>> f0_harm = librosa.f0_harmonics(S, freqs=freqs, f0=f0, harmonics=harmonics)
    """
    result: np.ndarray
    if freqs.ndim == 1 and len(freqs) == x.shape[axis]:
        if not is_unique(freqs, axis=0):
            warnings.warn(
                "Frequencies are not unique. This may produce incorrect harmonic interpolations.",
                stacklevel=2,
            )
        # We have a fixed frequency grid
        # Restrict interpolation support to finite frequencies only
        idx = np.isfinite(freqs)

        def _f_interps(data, f):
            # Interpolate one frame's energies (data) at target frequencies f
            interp = scipy.interpolate.interp1d(
                freqs[idx],
                data[idx],
                axis=0,
                bounds_error=False,
                copy=False,
                assume_sorted=False,
                kind=kind,
                fill_value=fill_value,
            )
            return interp(f)

        # Vectorize over frames: frequency axis is rotated to the tail,
        # and each frame is evaluated at harmonics[i] * f0[frame]
        xfunc = np.vectorize(_f_interps, signature="(f),(h)->(h)")
        result = xfunc(x.swapaxes(axis, -1), np.multiply.outer(f0, harmonics)).swapaxes(
            axis, -1
        )
    elif freqs.shape == x.shape:
        if not np.all(is_unique(freqs, axis=axis)):
            warnings.warn(
                "Frequencies are not unique. This may produce incorrect harmonic interpolations.",
                stacklevel=2,
            )
        # We have a dynamic frequency grid, not so bad
        def _f_interpd(data, frequencies, f):
            # Per-frame frequency support may differ, so the finite-mask
            # is recomputed inside the vectorized helper
            idx = np.isfinite(frequencies)
            interp = scipy.interpolate.interp1d(
                frequencies[idx],
                data[idx],
                axis=0,
                bounds_error=False,
                copy=False,
                assume_sorted=False,
                kind=kind,
                fill_value=fill_value,
            )
            return interp(f)

        xfunc = np.vectorize(_f_interpd, signature="(f),(f),(h)->(h)")
        result = xfunc(
            x.swapaxes(axis, -1),
            freqs.swapaxes(axis, -1),
            np.multiply.outer(f0, harmonics),
        ).swapaxes(axis, -1)
    else:
        raise ParameterError(
            f"freqs.shape={freqs.shape} is incompatible with input shape={x.shape}"
        )
    # Frames with non-finite f0 (e.g. unvoiced pyin frames) produce NaNs;
    # replace them with fill_value so downstream code sees finite output
    return np.nan_to_num(result, copy=False, nan=fill_value)

View File

@@ -0,0 +1,510 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Functions for interval construction"""
from typing import Collection, Dict, List, Union, overload, Iterable
from typing_extensions import Literal
import msgpack
import numpy as np
from numpy.typing import ArrayLike
from .._cache import cache
from .._typing import _FloatLike_co
from ..util.files import _resource_file
# Load the precomputed interval lookup table shipped with the package.
with _resource_file("librosa.core", "intervals.msgpack") as imsgpack:
    with imsgpack.open("rb") as _fdesc:
        # We use floats for dictionary keys, so strict mapping is disabled
        INTERVALS = msgpack.load(_fdesc, strict_map_key=False)
@cache(level=10)
def interval_frequencies(
    n_bins: int,
    *,
    fmin: _FloatLike_co,
    intervals: Union[str, Collection[float]],
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    sort: bool = True
) -> np.ndarray:
    """Construct a set of frequencies from an interval set

    Parameters
    ----------
    n_bins : int
        The number of frequencies to generate
    fmin : float > 0
        The minimum frequency
    intervals : str or array of floats in [1, 2)
        If `str`, must be one of the following:
        - `'equal'` - equal temperament
        - `'pythagorean'` - Pythagorean intervals
        - `'ji3'` - 3-limit just intonation
        - `'ji5'` - 5-limit just intonation
        - `'ji7'` - 7-limit just intonation

        Otherwise, an array of intervals in the range [1, 2) can be provided.
    bins_per_octave : int > 0
        If `intervals` is a string specification, how many bins to
        generate per octave.
        If `intervals` is an array, then this parameter is ignored.
    tuning : float
        Deviation from A440 tuning in fractional bins.
        This is only used when `intervals == 'equal'`
    sort : bool
        Sort the intervals in ascending order.

    Returns
    -------
    frequencies : array of float
        The frequencies

    Raises
    ------
    ValueError
        If ``intervals`` is a string other than
        'equal', 'pythagorean', 'ji3', 'ji5', or 'ji7'.

    Examples
    --------
    Generate two octaves of Pythagorean intervals starting at 55Hz

    >>> librosa.interval_frequencies(24, fmin=55, intervals="pythagorean", bins_per_octave=12)
    array([ 55.   ,  58.733,  61.875,  66.075,  69.609,  74.334,  78.311,
            82.5  ,  88.099,  92.812,  99.112, 104.414, 110.   , 117.466,
           123.75 , 132.149, 139.219, 148.668, 156.621, 165.   , 176.199,
           185.625, 198.224, 208.828])

    Generate three octaves using only three intervals

    >>> intervals = [1, 4/3, 3/2]
    >>> librosa.interval_frequencies(9, fmin=55, intervals=intervals)
    array([ 55.   ,  73.333,  82.5  , 110.   , 146.667, 165.   , 220.   ,
           293.333, 330.   ])
    """
    if isinstance(intervals, str):
        if intervals == "equal":
            # Equal temperament, with optional tuning offset (fractional bins)
            ratios = 2.0 ** (
                (tuning + np.arange(0, bins_per_octave, dtype=float)) / bins_per_octave
            )
        elif intervals == "pythagorean":
            ratios = pythagorean_intervals(bins_per_octave=bins_per_octave, sort=sort)
        elif intervals == "ji3":
            ratios = plimit_intervals(
                primes=[3], bins_per_octave=bins_per_octave, sort=sort
            )
        elif intervals == "ji5":
            ratios = plimit_intervals(
                primes=[3, 5], bins_per_octave=bins_per_octave, sort=sort
            )
        elif intervals == "ji7":
            ratios = plimit_intervals(
                primes=[3, 5, 7], bins_per_octave=bins_per_octave, sort=sort
            )
        else:
            # Previously an unrecognized name fell through the chain and
            # crashed below with an UnboundLocalError on ``ratios``;
            # fail loudly and informatively instead.
            raise ValueError(
                f"Unknown intervals={intervals!r}. Must be one of 'equal', "
                "'pythagorean', 'ji3', 'ji5', 'ji7', or an array of "
                "interval ratios."
            )
    else:
        # An explicit array of interval ratios defines one octave directly
        ratios = np.array(intervals)
        bins_per_octave = len(ratios)

    # We have one octave of ratios, tile it up to however many we need
    # and trim back to the right number of bins
    n_octaves = np.ceil(n_bins / bins_per_octave)
    all_ratios = np.multiply.outer(2.0 ** np.arange(n_octaves), ratios).flatten()[
        :n_bins
    ]
    if sort:
        all_ratios = np.sort(all_ratios)
    return all_ratios * fmin
# Typing overloads: the concrete return type of pythagorean_intervals
# depends on the value of ``return_factors``.
@overload
def pythagorean_intervals(
    *,
    bins_per_octave: int = ...,
    sort: bool = ...,
    return_factors: Literal[False] = ...
) -> np.ndarray:
    ...
@overload
def pythagorean_intervals(
    *, bins_per_octave: int = ..., sort: bool = ..., return_factors: Literal[True]
) -> List[Dict[int, int]]:
    ...
@overload
def pythagorean_intervals(
    *, bins_per_octave: int = ..., sort: bool = ..., return_factors: bool = ...
) -> Union[np.ndarray, List[Dict[int, int]]]:
    ...
@cache(level=10)
def pythagorean_intervals(
    *, bins_per_octave: int = 12, sort: bool = True, return_factors: bool = False
) -> Union[np.ndarray, List[Dict[int, int]]]:
    """Pythagorean intervals

    Intervals are constructed by stacking ratios of 3/2 (i.e.,
    just perfect fifths) and folding down to a single octave::

        1, 3/2, 9/8, 27/16, 81/64, ...

    Note that this differs from 3-limit just intonation intervals
    in that Pythagorean intervals only use positive powers of 3
    (ascending fifths) while 3-limit intervals use both positive
    and negative powers (descending fifths).

    Parameters
    ----------
    bins_per_octave : int
        The number of intervals to generate
    sort : bool
        If `True` then intervals are returned in ascending order.
        If `False`, then intervals are returned in circle-of-fifths order.
    return_factors : bool
        If `True` then return a list of dictionaries encoding the prime factorization
        of each interval as `{2: p2, 3: p3}` (meaning `3**p3 * 2**p2`).
        If `False` (default), return intervals as an array of floating point numbers.

    Returns
    -------
    intervals : np.ndarray or list of dictionaries
        The constructed interval set.  All intervals are mapped
        to the range [1, 2).

    See Also
    --------
    plimit_intervals

    Examples
    --------
    Generate the first 12 intervals

    >>> librosa.pythagorean_intervals(bins_per_octave=12)
    array([1.      , 1.067871, 1.125   , 1.201355, 1.265625, 1.351524,
           1.423828, 1.5     , 1.601807, 1.6875  , 1.802032, 1.898437])

    Or the first 7, in circle-of-fifths order

    >>> librosa.pythagorean_intervals(bins_per_octave=7, sort=False)
    array([1.      , 1.5     , 1.125   , 1.6875  , 1.265625, 1.898437,
           1.423828])

    Generate the first 7, in circle-of-fifths order and factored form

    >>> librosa.pythagorean_intervals(bins_per_octave=7, sort=False, return_factors=True)
    [
        {2: 0, 3: 0},
        {2: -1, 3: 1},
        {2: -3, 3: 2},
        {2: -4, 3: 3},
        {2: -6, 3: 4},
        {2: -7, 3: 5},
        {2: -9, 3: 6}
    ]
    """
    # Generate all powers of 3 in log space
    pow3 = np.arange(bins_per_octave)
    # Using modf here to quickly get the fractional part of the log,
    # accounting for whatever power of 2 is necessary to get 3**k
    # within the octave.
    log_ratios: np.ndarray
    pow2: np.ndarray
    log_ratios, pow2 = np.modf(pow3 * np.log2(3))
    # If the fractional part is negative, add one more power of two
    # to get it into the range [0, 1).  To preserve the ratio, the
    # integer part must then *decrease* by one:
    #   2**(frac + 1) * 2**(int - 1) == 2**(frac + int)
    # (This branch is unreachable for non-negative powers of 3, but the
    # previous `pow2 += 1` was off by a factor of 4 had it ever fired;
    # this now matches the identical fold-down in plimit_intervals.)
    too_small = log_ratios < 0
    log_ratios[too_small] += 1
    pow2[too_small] -= 1
    # Convert powers of 2 to integer
    pow2 = pow2.astype(int)
    idx: Iterable[int]
    if sort:
        # Order the intervals
        idx = np.argsort(log_ratios)
        log_ratios = log_ratios[idx]
    else:
        # If not sorting, we'll take powers in order
        idx = range(bins_per_octave)
    if return_factors:
        # Cast to plain python ints so the factorization is hashable-clean
        # and reprs consistently (numpy >= 2 reprs scalars as np.int64(...)),
        # matching the int casts in plimit_intervals
        return list({2: int(-pow2[i]), 3: int(pow3[i])} for i in idx)
    return np.power(2, log_ratios)
def __harmonic_distance(logs, a, b):
    """Compute the harmonic distance between ratios a and b.

    Harmonic distance is defined as `log2(a * b) - 2*log2(gcd(a, b))` [#]_.
    Here a and b are expressed as prime factorization exponents,
    and the prime basis is provided in its log2 form.

    .. [#] Tenney, James.
        "On Crystal Growth in harmonic space (1993-1998)."
        Contemporary Music Review 27.1 (2008): 47-56.
    """
    exp_a = np.array(a)
    exp_b = np.array(b)
    # Split each rational into numerator (positive exponents) and
    # denominator (negated negative exponents)
    num_a = np.maximum(exp_a, 0)
    den_a = num_a - exp_a
    num_b = np.maximum(exp_b, 0)
    den_b = num_b - exp_b
    # gcd of two rationals is gcd(numerators) / lcm(denominators);
    # in exponent space: elementwise min of numerators minus max of denominators
    gcd_exp = np.minimum(num_a, num_b) - np.maximum(den_a, den_b)
    # log2(a*b) - 2*log2(gcd), rounded to 6 decimals to avoid
    # floating point weirdness
    return np.around(logs.dot(exp_a + exp_b - 2 * gcd_exp), 6)
def _crystal_tie_break(a, b, logs):
    """Break ties between candidate intervals ``a`` and ``b``.

    Returns True when ``a`` has strictly smaller total log-magnitude
    (sum over primes of |exponent| * log2(prime)) than ``b``.
    """
    weight_a = logs.dot(np.abs(a))
    weight_b = logs.dot(np.abs(b))
    return weight_a < weight_b
# Typing overloads: the concrete return type of plimit_intervals
# depends on the value of ``return_factors``.
@overload
def plimit_intervals(
    *,
    primes: ArrayLike,
    bins_per_octave: int = ...,
    sort: bool = ...,
    return_factors: Literal[False] = ...
) -> np.ndarray:
    ...
@overload
def plimit_intervals(
    *,
    primes: ArrayLike,
    bins_per_octave: int = ...,
    sort: bool = ...,
    return_factors: Literal[True]
) -> List[Dict[int, int]]:
    ...
@overload
def plimit_intervals(
    *,
    primes: ArrayLike,
    bins_per_octave: int = ...,
    sort: bool = ...,
    return_factors: bool = ...
) -> Union[np.ndarray, List[Dict[int, int]]]:
    ...
@cache(level=10)
def plimit_intervals(
    *,
    primes: ArrayLike,
    bins_per_octave: int = 12,
    sort: bool = True,
    return_factors: bool = False
) -> Union[np.ndarray, List[Dict[int, int]]]:
    """Construct p-limit intervals for a given set of prime factors.

    This function is based on the "harmonic crystal growth" algorithm
    of [#1]_ [#2]_.

    .. [#1] Tenney, James.
        "On Crystal Growth in harmonic space (1993-1998)."
        Contemporary Music Review 27.1 (2008): 47-56.

    .. [#2] Sabat, Marc, and James Tenney.
        "Three crystal growth algorithms in 23-limit constrained harmonic space."
        Contemporary Music Review 27, no. 1 (2008): 57-78.

    Parameters
    ----------
    primes : array of odd primes
        Which prime factors are to be used
    bins_per_octave : int
        The number of intervals to construct
    sort : bool
        If `True` then intervals are returned in ascending order.
        If `False`, then intervals are returned in crystal growth order.
    return_factors : bool
        If `True` then return a list of dictionaries encoding the prime factorization
        of each interval as `{2: p2, 3: p3, ...}` (meaning `3**p3 * 2**p2`).
        If `False` (default), return intervals as an array of floating point numbers.

    Returns
    -------
    intervals : np.ndarray or list of dictionaries
        The constructed interval set.  All intervals are mapped
        to the range [1, 2).

    See Also
    --------
    pythagorean_intervals

    Examples
    --------
    Compare 3-limit tuning to Pythagorean tuning and 12-TET

    >>> librosa.plimit_intervals(primes=[3], bins_per_octave=12)
    array([1.        , 1.05349794, 1.125     , 1.18518519, 1.265625  ,
           1.33333333, 1.40466392, 1.5       , 1.58024691, 1.6875    ,
           1.77777778, 1.8984375 ])

    Create a 7-bin, 5-limit interval set

    >>> librosa.plimit_intervals(primes=[3, 5], bins_per_octave=7)
    array([1.        , 1.125     , 1.25      , 1.33333333, 1.5       ,
           1.66666667, 1.875     ])

    The same example, but now in factored form

    >>> librosa.plimit_intervals(primes=[3, 5], bins_per_octave=7,
    ...                          return_factors=True)
    [
        {},
        {2: -3, 3: 2},
        {2: -2, 5: 1},
        {2: 2, 3: -1},
        {2: -1, 3: 1},
        {3: -1, 5: 1},
        {2: -3, 3: 1, 5: 1}
    ]
    """
    primes = np.atleast_1d(primes)
    logs = np.log2(primes, dtype=np.float64)
    # The seed set are primes and their reciprocals
    # These are the values that we can use to expand our
    # interval set.  These are expressed in terms of the
    # prime factorization exponents
    seeds = []
    for i in range(len(primes)):
        # Add the prime
        seed = [0] * len(primes)
        seed[i] = 1
        seeds.append(tuple(seed))
        # Add the inverse
        seed[i] = -1
        seeds.append(tuple(seed))
    # The frontier is the set of candidate intervals for inclusion
    frontier = seeds.copy()
    # The distances table will let us keep track of the harmonic
    # distances between all selected intervals
    distances = dict()
    # Initialize the interval set with the root (1)
    intervals = list()
    root = tuple([0] * len(primes))
    intervals.append(root)
    while len(intervals) < bins_per_octave:
        # Find the element on the frontier that minimizes the total
        # harmonic distance to the existing set
        score = np.inf
        best_f = 0
        for f, point in enumerate(frontier):
            # Compute harmonic distance (HD) to each selected interval
            HD = 0.0
            for s in intervals:
                # Distances are symmetric, so cache both orientations
                if (s, point) not in distances:
                    distances[s, point] = __harmonic_distance(logs, point, s)
                    distances[point, s] = distances[s, point]
                HD += distances[s, point]
            # Near-equal scores are resolved by the explicit tie-break rule
            # (smaller total exponent weight wins)
            if HD < score or (
                np.isclose(HD, score)
                and _crystal_tie_break(point, frontier[best_f], logs)
            ):
                score = HD
                best_f = f
        # Promote the winner from the frontier into the interval set
        new_point = frontier.pop(best_f)
        intervals.append(new_point)
        # Expand the frontier by all seed steps from the new point
        for _ in seeds:
            new_seed = tuple(np.array(new_point) + np.array(_))
            if new_seed not in intervals and new_seed not in frontier:
                frontier.append(new_seed)
    pows = np.array(list(intervals), dtype=float)
    # Fold each interval down into a single octave: the fractional part of
    # the base-2 log is the interval, the integer part is the power of 2
    log_ratios: np.ndarray
    pow2: np.ndarray
    log_ratios, pow2 = np.modf(pows.dot(logs))
    # If the fractional part is negative, add
    # one more power of two to get it into the range [0, 1).
    too_small = log_ratios < 0
    log_ratios[too_small] += 1
    pow2[too_small] -= 1
    # Convert powers of 2 to integer
    pow2 = pow2.astype(int)
    idx: Iterable[int]
    if sort:
        # Order the intervals
        idx = np.argsort(log_ratios)
        log_ratios = log_ratios[idx]
    else:
        # If not sorting, we'll take powers in order
        idx = range(bins_per_octave)
    if return_factors:
        # Collect the factorized intervals into a list
        factors = []
        for i in idx:
            v = dict()
            if pow2[i] != 0:
                v[2] = -pow2[i]
            v.update({p: int(power) for p, power in zip(primes, pows[i]) if power != 0})
            factors.append(v)
        return factors
    # Otherwise, just return intervals as floats
    return np.power(2, log_ratios)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,597 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Spectrogram decomposition
=========================
.. autosummary::
:toctree: generated/
decompose
hpss
nn_filter
"""
import numpy as np
import scipy.sparse
from scipy.ndimage import median_filter
import sklearn.decomposition
from . import core
from ._cache import cache
from . import segment
from . import util
from .util.exceptions import ParameterError
from typing import Any, Callable, List, Optional, Tuple, Union
from ._typing import _IntLike_co, _FloatLike_co
__all__ = ["decompose", "hpss", "nn_filter"]
def decompose(
    S: np.ndarray,
    *,
    n_components: Optional[int] = None,
    transformer: Optional[object] = None,
    sort: bool = False,
    fit: bool = True,
    **kwargs: Any,
) -> Tuple[np.ndarray, np.ndarray]:
    """Decompose a feature matrix.

    Given a spectrogram ``S``, find ``components`` and ``activations``
    such that ``S ~= components.dot(activations)``.

    By default the decomposition is non-negative matrix factorization
    (NMF), but any object following the `sklearn.decomposition`
    interface may be substituted.

    Parameters
    ----------
    S : np.ndarray [shape=(..., n_features, n_samples), dtype=float]
        Input feature matrix (e.g., a magnitude spectrogram).
        Multi-channel input is supported: leading channel dimensions
        are flattened together with the feature axis before
        decomposition, e.g. stereo input of shape
        ``(2, n_features, n_samples)`` is treated as
        ``(2 * n_features, n_samples)``.
    n_components : int > 0 [scalar] or None
        Number of desired components.
        If None, the (flattened) feature dimension is used.
    transformer : None or object
        If None, use `sklearn.decomposition.NMF`.
        Otherwise, any object with a similar interface works.
        Following the scikit-learn ``(n_samples, n_features)``
        convention, ``transformer.fit_transform()`` is run on ``S.T``
        (not ``S``) and its result is stored (transposed) as
        ``activations``, so that::

            S ~= np.dot(transformer.components_.T, activations.T)

    sort : bool
        If True, components are sorted by ascending peak frequency.
        When used together with ``transformer``, sorting is applied to
        copies of the decomposition parameters, not to the
        ``transformer``'s internal state.
        Unsupported (raises) for inputs with more than two dimensions.
    fit : bool
        If True, components are estimated from the input ``S``.
        If False, components are assumed to be pre-computed and stored
        in ``transformer`` and are left unchanged.
    **kwargs
        Additional keyword arguments for the default transformer,
        `sklearn.decomposition.NMF`.

    Returns
    -------
    components : np.ndarray [shape=(..., n_features, n_components)]
        matrix of components (basis elements)
    activations : np.ndarray [shape=(n_components, n_samples)]
        transformed matrix/activation matrix

    Raises
    ------
    ParameterError
        if ``fit`` is False and no ``transformer`` object is provided.
        if the input array is multi-channel and ``sort=True`` is specified.

    See Also
    --------
    sklearn.decomposition : SciKit-Learn matrix decomposition modules
    """
    # Remember the input layout so components can be unrolled back later
    orig_shape = list(S.shape)

    if sort and S.ndim > 2:
        raise ParameterError(
            "Parameter sort=True is unsupported for input with more than two dimensions"
        )

    # Put time on the leading axis and collapse all channel/feature axes
    # into one.  order='F' preserves the temporal ordering.
    n_samples = S.shape[-1]
    S = S.T.reshape((n_samples, -1), order="F")

    if n_components is None:
        n_components = S.shape[-1]

    if transformer is None:
        if fit is False:
            raise ParameterError("fit must be True if transformer is None")

        transformer = sklearn.decomposition.NMF(n_components=n_components, **kwargs)

    # Type checking is suppressed here so as not to over-constrain the
    # transformer object's interface.
    estimate = transformer.fit_transform if fit else transformer.transform  # type: ignore
    activations: np.ndarray = estimate(S).T

    components: np.ndarray = transformer.components_  # type: ignore

    # Restore the original channel/feature layout of the components;
    # order='F' preserves component ordering.
    component_shape = orig_shape[:-1] + [-1]
    components = components.reshape(component_shape[::-1], order="F").T

    if sort:
        components, idx = util.axis_sort(components, index=True)
        activations = activations[idx]

    return components, activations
@cache(level=30)
def hpss(
    S: np.ndarray,
    *,
    kernel_size: Union[
        _IntLike_co, Tuple[_IntLike_co, _IntLike_co], List[_IntLike_co]
    ] = 31,
    power: float = 2.0,
    mask: bool = False,
    margin: Union[
        _FloatLike_co, Tuple[_FloatLike_co, _FloatLike_co], List[_FloatLike_co]
    ] = 1.0,
) -> Tuple[np.ndarray, np.ndarray]:
    """Median-filtering harmonic percussive source separation (HPSS).

    If ``margin = 1.0``, decomposes an input spectrogram ``S = H + P``
    where ``H`` contains the harmonic components and ``P`` contains the
    percussive components.

    If ``margin > 1.0``, decomposes an input spectrogram ``S = H + P + R``
    where ``R`` contains residual components not included in ``H`` or ``P``.

    This implementation is based upon the algorithms described by
    Fitzgerald (DAFx 2010) and Driedger, Mueller, Disch (ISMIR 2014).

    Parameters
    ----------
    S : np.ndarray [shape=(..., d, n)]
        input spectrogram. May be real (magnitude) or complex.
        Multi-channel is supported.
    kernel_size : int or tuple (kernel_harmonic, kernel_percussive)
        kernel size(s) for the median filters.
        A scalar applies to both filters; a tuple gives the harmonic
        filter width first and the percussive filter width second.
    power : float > 0 [scalar]
        Exponent for the Wiener filter when constructing soft mask matrices.
    mask : bool
        If True, return the masking matrices instead of components.
        Masks contain non-negative real values; components can be
        recovered by multiplying ``S * mask_H`` or ``S * mask_P``.
    margin : float or tuple (margin_harmonic, margin_percussive)
        margin size(s) for the masks.
        A scalar applies to both masks; a tuple gives the harmonic
        margin first and the percussive margin second.
        Must be >= 1.

    Returns
    -------
    harmonic : np.ndarray [shape=(..., d, n)]
        harmonic component (or mask)
    percussive : np.ndarray [shape=(..., d, n)]
        percussive component (or mask)

    Raises
    ------
    ParameterError
        if either margin is less than 1

    See Also
    --------
    librosa.util.softmask

    Notes
    -----
    This function caches at level 30.
    """
    # Split complex input into magnitude and phase; real input is
    # treated as magnitude with unit phase.
    phase: Union[float, np.ndarray]
    if np.iscomplexobj(S):
        S, phase = core.magphase(S)
    else:
        phase = 1

    # Expand scalar parameters into (harmonic, percussive) pairs
    if isinstance(kernel_size, (tuple, list)):
        win_harm, win_perc = kernel_size[0], kernel_size[1]
    else:
        win_harm = win_perc = kernel_size

    if isinstance(margin, (tuple, list)):
        margin_harm, margin_perc = margin[0], margin[1]
    else:
        margin_harm = margin_perc = margin

    # Margins below 1 are not meaningful for this separation
    if margin_harm < 1 or margin_perc < 1:
        raise ParameterError(
            "Margins must be >= 1.0. " "A typical range is between 1 and 10."
        )

    # Median filter along time (last axis) to enhance harmonics, and
    # along frequency (second-to-last axis) to enhance percussives.
    harm_shape: List[_IntLike_co] = [1] * S.ndim
    harm_shape[-1] = win_harm

    perc_shape: List[_IntLike_co] = [1] * S.ndim
    perc_shape[-2] = win_perc

    # Pre-allocating the outputs preserves memory layout
    harm = np.empty_like(S)
    harm[:] = median_filter(S, size=harm_shape, mode="reflect")

    perc = np.empty_like(S)
    perc[:] = median_filter(S, size=perc_shape, mode="reflect")

    # Only split energy at exact zeros when no extra margin is requested
    split_zeros = margin_harm == 1 and margin_perc == 1

    mask_harm = util.softmask(
        harm, perc * margin_harm, power=power, split_zeros=split_zeros
    )
    mask_perc = util.softmask(
        perc, harm * margin_perc, power=power, split_zeros=split_zeros
    )

    if mask:
        return mask_harm, mask_perc

    return ((S * mask_harm) * phase, (S * mask_perc) * phase)
@cache(level=30)
def nn_filter(
    S: np.ndarray,
    *,
    rec: Optional[Union[scipy.sparse.spmatrix, np.ndarray]] = None,
    aggregate: Optional[Callable] = None,
    axis: int = -1,
    **kwargs: Any,
) -> np.ndarray:
    """Filter by nearest-neighbor aggregation.

    Each data point (e.g, spectrogram column) is replaced by aggregating
    its nearest neighbors in feature space.  This can be useful for
    de-noising a spectrogram or feature matrix.

    The non-local means method (Buades, Coll & Morel, CVPR 2005) can be
    recovered by providing a weighted recurrence matrix as input and
    specifying ``aggregate=np.average``; setting ``aggregate=np.median``
    produces sparse de-noising as in REPET-SIM (Rafii & Pardo, ISMIR 2012).

    Parameters
    ----------
    S : np.ndarray
        The input data (spectrogram) to filter. Multi-channel is supported.
    rec : (optional) scipy.sparse.spmatrix or np.ndarray
        Optionally, a pre-computed nearest-neighbor matrix
        as provided by `librosa.segment.recurrence_matrix`
    aggregate : function
        aggregation function (default: `np.mean`)
        If ``aggregate=np.average``, a weighted average is computed
        according to the (per-row) weights in ``rec``.  All other
        aggregation functions treat neighbors equally.
    axis : int
        The axis along which to filter (by default, columns)
    **kwargs
        Additional keyword arguments provided to
        `librosa.segment.recurrence_matrix` if ``rec`` is not provided

    Returns
    -------
    S_filtered : np.ndarray
        The filtered data, with shape equivalent to the input ``S``.

    Raises
    ------
    ParameterError
        if ``rec`` is provided and its shape is incompatible with ``S``.

    See Also
    --------
    decompose
    hpss
    librosa.segment.recurrence_matrix

    Notes
    -----
    This function caches at level 30.
    """
    if aggregate is None:
        aggregate = np.mean

    # Normalize the recurrence input to a sparse matrix
    rec_s: scipy.sparse.spmatrix
    if rec is None:
        # Compute it on the fly; force sparse output for the helper
        kwargs = dict(kwargs)
        kwargs["sparse"] = True
        rec_s = segment.recurrence_matrix(S, axis=axis, **kwargs)
    elif scipy.sparse.issparse(rec):
        rec_s = rec
    else:
        rec_s = scipy.sparse.csc_matrix(rec)

    # The recurrence matrix must be square and match the filtered axis
    if not (rec_s.shape[0] == rec_s.shape[1] == S.shape[axis]):
        raise ParameterError(
            "Invalid self-similarity matrix shape "
            f"rec.shape={rec_s.shape} for S.shape={S.shape}"
        )

    # Put the filtered axis first for the helper, then restore it
    return __nn_filter_helper(
        rec_s.data, rec_s.indices, rec_s.indptr, S.swapaxes(0, axis), aggregate
    ).swapaxes(0, axis)
def __nn_filter_helper(
R_data, R_indices, R_ptr, S: np.ndarray, aggregate: Callable
) -> np.ndarray:
"""Nearest-neighbor filter helper function.
This is an internal function, not for use outside of the decompose module.
It applies the nearest-neighbor filter to S, assuming that the first index
corresponds to observations.
Parameters
----------
R_data, R_indices, R_ptr : np.ndarrays
The ``data``, ``indices``, and ``indptr`` of a scipy.sparse matrix
S : np.ndarray
The observation data to filter
aggregate : callable
The aggregation operator
Returns
-------
S_out : np.ndarray like S
The filtered data array
"""
s_out = np.empty_like(S)
for i in range(len(R_ptr) - 1):
# Get the non-zeros out of the recurrence matrix
targets = R_indices[R_ptr[i] : R_ptr[i + 1]]
if not len(targets):
s_out[i] = S[i]
continue
neighbors = np.take(S, targets, axis=0)
if aggregate is np.average:
weights = R_data[R_ptr[i] : R_ptr[i + 1]]
s_out[i] = aggregate(neighbors, axis=0, weights=weights)
else:
s_out[i] = aggregate(neighbors, axis=0)
return s_out

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Feature extraction
==================
Spectral features
-----------------
.. autosummary::
:toctree: generated/
chroma_stft
chroma_cqt
chroma_cens
chroma_vqt
melspectrogram
mfcc
rms
spectral_centroid
spectral_bandwidth
spectral_contrast
spectral_flatness
spectral_rolloff
poly_features
tonnetz
zero_crossing_rate
Rhythm features
---------------
.. autosummary::
:toctree: generated/
tempo
tempogram
fourier_tempogram
tempogram_ratio
Feature manipulation
--------------------
.. autosummary::
:toctree: generated/
delta
stack_memory
Feature inversion
-----------------
.. autosummary::
:toctree: generated
inverse.mel_to_stft
inverse.mel_to_audio
inverse.mfcc_to_mel
inverse.mfcc_to_audio
"""
import lazy_loader as lazy

# Defer submodule imports until first attribute access.  The public API
# surface is presumably declared in the adjacent type stub (``.pyi``)
# file, which ``attach_stub`` reads — confirm against the stub contents.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)

View File

@@ -0,0 +1,31 @@
from .utils import (
delta as delta,
stack_memory as stack_memory,
)
from .spectral import (
spectral_centroid as spectral_centroid,
spectral_bandwidth as spectral_bandwidth,
spectral_contrast as spectral_contrast,
spectral_rolloff as spectral_rolloff,
spectral_flatness as spectral_flatness,
poly_features as poly_features,
rms as rms,
zero_crossing_rate as zero_crossing_rate,
chroma_stft as chroma_stft,
chroma_cqt as chroma_cqt,
chroma_cens as chroma_cens,
chroma_vqt as chroma_vqt,
melspectrogram as melspectrogram,
mfcc as mfcc,
tonnetz as tonnetz,
)
from .rhythm import (
tempogram as tempogram,
fourier_tempogram as fourier_tempogram,
tempo as tempo,
tempogram_ratio as tempogram_ratio,
)
from . import (
inverse as inverse,
)

View File

@@ -0,0 +1,373 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feature inversion"""
import warnings
import numpy as np
from ..core.fft import get_fftlib
from ..util.exceptions import ParameterError
from ..core.spectrum import griffinlim
from ..core.spectrum import db_to_power
from ..util.utils import tiny
from .. import filters
from ..util import nnls, expand_to
from numpy.typing import DTypeLike
from typing import Any, Optional
from .._typing import _WindowSpec, _PadModeSTFT
__all__ = ["mel_to_stft", "mel_to_audio", "mfcc_to_mel", "mfcc_to_audio"]
def mel_to_stft(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    power: float = 2.0,
    **kwargs: Any,
) -> np.ndarray:
    """Approximate STFT magnitude from a Mel power spectrogram.

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    **kwargs : additional keyword arguments for Mel filter bank parameters
        fmin : float >= 0 [scalar]
            lowest frequency (in Hz)
        fmax : float >= 0 [scalar]
            highest frequency (in Hz).
            If `None`, use ``fmax = sr / 2.0``
        htk : bool [scalar]
            use HTK formula instead of Slaney
        norm : {None, 'slaney', or number} [scalar]
            If 'slaney', divide the triangular mel weights by the width of
            the mel band (area normalization).
            If numeric, use `librosa.util.normalize` to normalize each
            filter to unit l_p norm; see `librosa.util.normalize` for
            supported norm values (including `+-np.inf`).
            Otherwise, leave all triangles aiming for a peak value of 1.0
        dtype : np.dtype
            The data type of the output basis.
            By default, uses 32-bit (single-precision) floating point.

    Returns
    -------
    S : np.ndarray [shape=(..., n_fft, t), non-negative]
        An approximate linear magnitude spectrogram

    See Also
    --------
    librosa.feature.melspectrogram
    librosa.stft
    librosa.filters.mel
    librosa.util.nnls
    """
    # Build a Mel filter bank whose dtype matches the input data
    n_mels = M.shape[-2]
    mel_basis = filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, dtype=M.dtype, **kwargs)

    # Project back to the STFT domain via non-negative least squares
    S = nnls(mel_basis, M)

    # Undo the power scaling; exponentiation is done in-place
    np.power(S, 1.0 / power, out=S)
    return S
def mel_to_audio(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
    power: float = 2.0,
    n_iter: int = 32,
    length: Optional[int] = None,
    dtype: DTypeLike = np.float32,
    **kwargs: Any,
) -> np.ndarray:
    """Invert a mel power spectrogram to audio using Griffin-Lim.

    This is primarily a convenience wrapper for:

        >>> S = librosa.feature.inverse.mel_to_stft(M)
        >>> y = librosa.griffinlim(S)

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT. If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT. By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the
        signal.  By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to
        exactly ``length`` samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal. Default is 32-bit float.
    **kwargs : additional keyword arguments for Mel filter bank parameters
        fmin : float >= 0 [scalar]
            lowest frequency (in Hz)
        fmax : float >= 0 [scalar]
            highest frequency (in Hz).
            If `None`, use ``fmax = sr / 2.0``
        htk : bool [scalar]
            use HTK formula instead of Slaney
        norm : {None, 'slaney', or number} [scalar]
            If 'slaney', divide the triangular mel weights by the width of
            the mel band (area normalization).
            If numeric, use `librosa.util.normalize` to normalize each
            filter to unit l_p norm; see `librosa.util.normalize` for
            supported norm values (including `+-np.inf`).
            Otherwise, leave all triangles aiming for a peak value of 1.0

    Returns
    -------
    y : np.ndarray [shape(..., n,)]
        time-domain signal reconstructed from ``M``

    See Also
    --------
    librosa.griffinlim
    librosa.feature.melspectrogram
    librosa.filters.mel
    librosa.feature.inverse.mel_to_stft
    """
    # First recover an approximate linear-frequency magnitude spectrogram,
    # then run phase recovery on it.
    linear_spec = mel_to_stft(M, sr=sr, n_fft=n_fft, power=power, **kwargs)

    return griffinlim(
        linear_spec,
        n_iter=n_iter,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
        length=length,
        dtype=dtype,
    )
def mfcc_to_mel(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
) -> np.ndarray:
    """Invert Mel-frequency cepstral coefficients to approximate a Mel power
    spectrogram.

    This inversion proceeds in two steps:

        1. The inverse DCT is applied to the MFCCs
        2. `librosa.db_to_power` is applied to map the dB-scaled result
           to a power spectrogram

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type.
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an
        orthonormal DCT basis.
        Normalization is not supported for `dct_type=1`.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    Returns
    -------
    M : np.ndarray [shape=(..., n_mels, n)]
        An approximate Mel power spectrum recovered from ``mfcc``

    Warns
    -----
    UserWarning
        due to critical values in lifter array that invokes underflow.

    Raises
    ------
    ParameterError
        if ``lifter`` is negative

    See Also
    --------
    librosa.feature.mfcc
    librosa.feature.melspectrogram
    scipy.fft.dct
    """
    if lifter > 0:
        # Build the sinusoidal lifter curve, broadcast over the
        # coefficient axis
        n_mfcc = mfcc.shape[-2]
        coef = np.arange(1, 1 + n_mfcc, dtype=mfcc.dtype)
        coef = expand_to(coef, ndim=mfcc.ndim, axes=-2)
        lifter_sine = 1 + lifter * 0.5 * np.sin(np.pi * coef / lifter)

        # Near-zero lifter values would blow up the division below;
        # warn instead of failing silently.
        if np.any(np.abs(lifter_sine) < np.finfo(lifter_sine.dtype).eps):
            warnings.warn(
                message="lifter array includes critical values that may invoke underflow.",
                category=UserWarning,
                stacklevel=2,
            )

        # Undo the liftering (tiny() guards against exact division by zero)
        mfcc = mfcc / (lifter_sine + tiny(mfcc))

    elif lifter != 0:
        raise ParameterError("MFCC to mel lifter must be a non-negative number.")

    # Inverse DCT back to log-Mel, then undo the decibel scaling
    fft = get_fftlib()
    logmel = fft.idct(mfcc, axis=-2, type=dct_type, norm=norm, n=n_mels)
    melspec: np.ndarray = db_to_power(logmel, ref=ref)
    return melspec
def mfcc_to_audio(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
    **kwargs: Any,
) -> np.ndarray:
    """Convert Mel-frequency cepstral coefficients to a time-domain audio signal

    This function is primarily a convenience wrapper for the following steps:

        1. Convert mfcc to Mel power spectrum (`mfcc_to_mel`)
        2. Convert Mel power spectrum to time-domain audio (`mel_to_audio`)

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    **kwargs : additional keyword arguments to pass through to `mel_to_audio`
        M : np.ndarray [shape=(..., n_mels, n), non-negative]
            The spectrogram as produced by `feature.melspectrogram`
        sr : number > 0 [scalar]
            sampling rate of the underlying signal
        n_fft : int > 0 [scalar]
            number of FFT components in the resulting STFT
        hop_length : None or int > 0
            The hop length of the STFT. If not provided, it will default to ``n_fft // 4``
        win_length : None or int > 0
            The window length of the STFT. By default, it will equal ``n_fft``
        window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
            A window specification as supported by `stft` or `istft`
        center : boolean
            If `True`, the STFT is assumed to use centered frames.
            If `False`, the STFT is assumed to use left-aligned frames.
        pad_mode : string
            If ``center=True``, the padding mode to use at the edges of the signal.
            By default, STFT uses zero padding.
        power : float > 0 [scalar]
            Exponent for the magnitude melspectrogram
        n_iter : int > 0
            The number of iterations for Griffin-Lim
        length : None or int > 0
            If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
            samples.
        dtype : np.dtype
            Real numeric type for the time-domain signal. Default is 32-bit float.
        **kwargs : additional keyword arguments for Mel filter bank parameters
            fmin : float >= 0 [scalar]
                lowest frequency (in Hz)
            fmax : float >= 0 [scalar]
                highest frequency (in Hz).
                If `None`, use ``fmax = sr / 2.0``
            htk : bool [scalar]
                use HTK formula instead of Slaney

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        A time-domain signal reconstructed from `mfcc`

    See Also
    --------
    mfcc_to_mel
    mel_to_audio
    librosa.feature.mfcc
    librosa.griffinlim
    scipy.fft.dct
    """
    # NOTE: the lifter formula above previously read
    # ``(1 + sin(pi * (n + 1) / lifter)) * lifter / 2`` with a misplaced
    # parenthesis; it now matches the formula documented in (and
    # implemented by) `mfcc_to_mel`.

    # Step 1: invert the cepstral transform back to a Mel power spectrum
    mel_spec = mfcc_to_mel(
        mfcc, n_mels=n_mels, dct_type=dct_type, norm=norm, ref=ref, lifter=lifter
    )

    # Step 2: invert the Mel spectrogram to audio via Griffin-Lim
    return mel_to_audio(mel_spec, **kwargs)

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rhythmic feature extraction"""
import numpy as np
import scipy
from .. import util
from .._cache import cache
from ..core.audio import autocorrelate
from ..core.spectrum import stft
from ..core.convert import tempo_frequencies, time_to_frames
from ..core.harmonic import f0_harmonics
from ..util.exceptions import ParameterError
from ..filters import get_window
from typing import Optional, Callable, Any
from .._typing import _WindowSpec
__all__ = ["tempogram", "fourier_tempogram", "tempo", "tempogram_ratio"]
# -- Rhythmic features -- #
def tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Compute the tempogram: local autocorrelation of the onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        Sampling rate of ``y``.
    onset_envelope : np.ndarray [shape=(..., n) or (..., m, n)] or None
        Optional pre-computed onset strength envelope, as produced by
        `librosa.onset.onset_strength`.  If multi-dimensional, each band
        (second-to-last axis) is processed independently.
    hop_length : int > 0
        Number of audio samples between successive onset measurements.
    win_length : int > 0
        Length of the autocorrelation window, in onset frames.
        The default (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
    center : bool
        If `True`, autocorrelation windows are centered on each frame;
        if `False`, windows are left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification, as in `stft`.
    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode applied to each autocorrelation column.
        Set to `None` to disable normalization.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length, n)]
        Localized autocorrelation of the onset strength envelope.
        For multi-band input ``onset_envelope.shape == (m, n)``,
        ``tempogram[i]`` corresponds to ``onset_envelope[i]``.

    Raises
    ------
    ParameterError
        If neither ``y`` nor ``onset_envelope`` is provided,
        or if ``win_length < 1``.

    See Also
    --------
    fourier_tempogram
    librosa.onset.onset_strength
    librosa.util.normalize
    librosa.stft
    """
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    # Taper applied to each local autocorrelation window
    taper = get_window(window, win_length, fftbins=True)

    if onset_envelope is None:
        if y is None:
            raise ParameterError("Either y or onset_envelope must be provided")
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    n_frames = onset_envelope.shape[-1]

    if center:
        # Pad the time axis on both sides so that windows are centered.
        # linear_ramp tapers the padded region down to zero at the edges.
        pad_width = [(0, 0)] * onset_envelope.ndim
        pad_width[-1] = (win_length // 2, win_length // 2)
        onset_envelope = np.pad(
            onset_envelope, pad_width, mode="linear_ramp", end_values=[0, 0]
        )

    # Slice the envelope into overlapping analysis frames with unit hop
    frames = util.frame(onset_envelope, frame_length=win_length, hop_length=1)

    if center:
        # Drop the surplus frames introduced by padding
        frames = frames[..., :n_frames]

    # Broadcast the taper explicitly over the lag axis
    taper = util.expand_to(taper, ndim=frames.ndim, axes=-2)

    # Windowed autocorrelation along the lag axis, normalized per frame
    return util.normalize(
        autocorrelate(frames * taper, axis=-2), norm=norm, axis=-2
    )
def fourier_tempogram(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    center: bool = True,
    window: _WindowSpec = "hann",
) -> np.ndarray:
    """Compute the Fourier tempogram: the short-time Fourier transform of the
    onset strength envelope. [#]_

    .. [#] Grosche, Peter, Meinard Müller, and Frank Kurth.
        "Cyclic tempogram - A mid-level tempo representation for music signals."
        ICASSP, 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        Sampling rate of ``y``.
    onset_envelope : np.ndarray [shape=(..., n)] or None
        Optional pre-computed onset strength envelope, as produced by
        `librosa.onset.onset_strength`.  Multi-channel is supported.
    hop_length : int > 0
        Number of audio samples between successive onset measurements.
    win_length : int > 0
        Length of the onset analysis window, in onset frames.
        The default (384) corresponds to ``384 * hop_length / sr ~= 8.9s``.
    center : bool
        If `True`, onset windows are centered; if `False`, left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification, as in `stft`.

    Returns
    -------
    tempogram : np.ndarray [shape=(..., win_length // 2 + 1, n)]
        Complex short-time Fourier transform of the onset envelope.

    Raises
    ------
    ParameterError
        If neither ``y`` nor ``onset_envelope`` is provided,
        or if ``win_length < 1``.

    See Also
    --------
    tempogram
    librosa.onset.onset_strength
    librosa.stft
    """
    from ..onset import onset_strength

    if win_length < 1:
        raise ParameterError("win_length must be a positive integer")

    if onset_envelope is None and y is None:
        raise ParameterError("Either y or onset_envelope must be provided")

    if onset_envelope is None:
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # The Fourier tempogram is simply an STFT of the onset envelope
    # with unit hop length.
    return stft(
        onset_envelope, n_fft=win_length, hop_length=1, center=center, window=window
    )
@cache(level=30)
def tempo(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    hop_length: int = 512,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    ac_size: float = 8.0,
    max_tempo: Optional[float] = 320.0,
    aggregate: Optional[Callable[..., Any]] = np.mean,
    prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
    """Estimate the tempo (beats per minute).

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        Sampling rate of the time series.
    onset_envelope : np.ndarray [shape=(..., n)]
        Pre-computed onset strength envelope.
    tg : np.ndarray
        Pre-computed tempogram.  If provided, ``y`` and ``onset_envelope``
        are ignored, and the window length is inferred from the tempogram's
        lag axis (``tg.shape[-2]``).
    hop_length : int > 0 [scalar]
        Hop length of the time series.
    start_bpm : float [scalar]
        Initial guess of the BPM.
    std_bpm : float > 0 [scalar]
        Standard deviation of the tempo distribution.
    ac_size : float > 0 [scalar]
        Length (in seconds) of the autocorrelation window.
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold.
    aggregate : callable [optional]
        Aggregation function for estimating a global tempo.
        If `None`, tempo is estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a pseudo-log-normal prior is used.
        If given, ``start_bpm`` and ``std_bpm`` are ignored.

    Returns
    -------
    tempo : np.ndarray
        Estimated tempo (beats per minute).  Multi-channel input yields
        one estimate per channel.

    Raises
    ------
    ParameterError
        If ``start_bpm`` is not strictly positive.

    Notes
    -----
    This function caches at level 30.

    See Also
    --------
    librosa.onset.onset_strength
    librosa.feature.tempogram
    """
    if start_bpm <= 0:
        raise ParameterError("start_bpm must be strictly positive")

    if tg is not None:
        # A tempogram was supplied: infer the window length from its lag axis
        win_length = tg.shape[-2]
    else:
        win_length = time_to_frames(ac_size, sr=sr, hop_length=hop_length).item()
        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
        )

    # Collapse the time axis for a global estimate; otherwise keep per-frame
    if aggregate is not None:
        tg = aggregate(tg, axis=-1, keepdims=True)

    assert tg is not None

    # BPM value associated with each tempogram lag bin
    bpms = tempo_frequencies(win_length, hop_length=hop_length, sr=sr)

    if prior is not None:
        logprior = prior.logpdf(bpms)
    else:
        # Default prior: log-normal in BPM, centered at start_bpm
        logprior = -0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm) ** 2

    if max_tempo is not None:
        # Suppress tempi at or above the threshold (the leading bins)
        cutoff = int(np.argmax(bpms < max_tempo))
        logprior[:cutoff] = -np.inf

    # Align the prior with the tempogram's lag axis
    logprior = util.expand_to(logprior, ndim=tg.ndim, axes=-2)

    # Pick the prior-weighted maximum; log1p keeps this numerically stable
    best_bin = np.argmax(np.log1p(1e6 * tg) + logprior, axis=-2)
    estimate: np.ndarray = np.take(bpms, best_bin)
    return estimate
@cache(level=40)
def tempogram_ratio(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    tg: Optional[np.ndarray] = None,
    bpm: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    start_bpm: float = 120,
    std_bpm: float = 1.0,
    max_tempo: Optional[float] = 320.0,
    freqs: Optional[np.ndarray] = None,
    factors: Optional[np.ndarray] = None,
    aggregate: Optional[Callable[..., Any]] = None,
    prior: Optional[scipy.stats.rv_continuous] = None,
    center: bool = True,
    window: _WindowSpec = "hann",
    kind: str = "linear",
    fill_value: float = 0,
    norm: Optional[float] = np.inf,
) -> np.ndarray:
    """Tempogram ratio features, also known as spectral rhythm patterns. [1]_

    This function summarizes the energy at metrically important multiples
    of the tempo.  If the estimated tempo corresponds to a quarter note,
    the default factors (from [2]_) measure relative energy at, in order:

        4 (sixteenth), 8/3 (dotted sixteenth), 3 (eighth triplet),
        2 (eighth), 4/3 (dotted eighth), 3/2 (quarter triplet),
        1 (quarter), 2/3 (dotted quarter), 3/4 (half triplet),
        1/2 (half), 1/3 (dotted half), 3/8 (whole triplet), 1/4 (whole)

    .. [1] Peeters, Geoffroy.
        "Rhythm Classification Using Spectral Rhythm Patterns."
        In ISMIR, pp. 644-647. 2005.

    .. [2] Prockup, Matthew, Andreas F. Ehmann, Fabien Gouyon, Erik M. Schmidt,
        and Youngmoo E. Kim.
        "Modeling musical rhythm at scale with the music genome project."
        WASPAA, 2015.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        Audio time series.
    sr : number > 0 [scalar]
        Sampling rate of the time series.
    onset_envelope : np.ndarray [shape=(..., n)]
        Pre-computed onset strength envelope.
    tg : np.ndarray
        Pre-computed tempogram.  If provided, ``y`` and ``onset_envelope``
        are ignored, and the window length is inferred from the tempogram.
    bpm : np.ndarray
        Pre-computed per-frame tempo estimate, dimension-compatible with ``tg``.
    hop_length : int > 0 [scalar]
        Hop length of the time series.
    win_length : int > 0 [scalar]
        Window length of the autocorrelation window for tempogram calculation.
    start_bpm : float [scalar]
        Initial guess of the BPM if ``bpm`` is not provided.
    std_bpm : float > 0 [scalar]
        Standard deviation of the tempo distribution.
    max_tempo : float > 0 [scalar, optional]
        If provided, only estimate tempo below this threshold.
    freqs : np.ndarray
        Frequencies (in BPM) of the tempogram's lag axis.
    factors : np.ndarray
        Multiples of the fundamental tempo (bpm) to estimate.
        If not provided, the default factors listed above are used.
    aggregate : callable [optional]
        Aggregation function for estimating a global tempogram ratio.
        If `None`, ratios are estimated independently for each frame.
    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        If given, ``start_bpm`` and ``std_bpm`` are ignored.
    center : bool
        If `True`, onset windows are centered; if `False`, left-aligned.
    window : string, function, number, tuple, or np.ndarray [shape=(win_length,)]
        A window specification, as in `stft`.
    kind : str
        Interpolation mode for measuring tempogram ratios.
    fill_value : float
        Value used when extrapolating beyond the observed frequency range.
    norm : {np.inf, -np.inf, 0, float > 0, None}
        Normalization mode for the tempogram.  Set to `None` to disable.

    Returns
    -------
    tgr : np.ndarray
        The tempogram ratio for the specified factors.
        If ``aggregate`` is provided, the trailing time axis is removed;
        otherwise ratios are estimated for each frame.

    See Also
    --------
    tempogram
    tempo
    librosa.f0_harmonics
    librosa.tempo_frequencies
    """
    # Get a tempogram and time-varying tempo estimate
    if tg is None:
        tg = tempogram(
            y=y,
            sr=sr,
            onset_envelope=onset_envelope,
            hop_length=hop_length,
            win_length=win_length,
            center=center,
            window=window,
            norm=norm,
        )
    if freqs is None:
        # The tempogram's lag axis is axis -2 (shape (..., win_length, n)),
        # so read tg.shape[-2] here.  Using len(tg) would pick up the leading
        # (channel) axis for multi-channel tempograms and produce a frequency
        # grid of the wrong size.
        freqs = tempo_frequencies(sr=sr, n_bins=tg.shape[-2], hop_length=hop_length)
    # Estimate tempo per-frame, no aggregation yet
    if bpm is None:
        bpm = tempo(
            sr=sr,
            tg=tg,
            hop_length=hop_length,
            start_bpm=start_bpm,
            std_bpm=std_bpm,
            max_tempo=max_tempo,
            aggregate=None,
            prior=prior,
        )
    if factors is None:
        # metric multiples from Prockup'15
        factors = np.array(
            [4, 8 / 3, 3, 2, 4 / 3, 3 / 2, 1, 2 / 3, 3 / 4, 1 / 2, 1 / 3, 3 / 8, 1 / 4]
        )
    # Sample the tempogram at each factor multiple of the estimated tempo
    tgr = f0_harmonics(
        tg, freqs=freqs, f0=bpm, harmonics=factors, kind=kind, fill_value=fill_value
    )
    if aggregate is not None:
        return aggregate(tgr, axis=-1)  # type: ignore
    return tgr

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,310 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feature manipulation utilities"""
import numpy as np
import scipy.signal
from numba import jit
from .._cache import cache
from ..util.exceptions import ParameterError
from typing import Any
__all__ = ["delta", "stack_memory"]
@cache(level=40)
def delta(
    data: np.ndarray,
    *,
    width: int = 9,
    order: int = 1,
    axis: int = -1,
    mode: str = "interp",
    **kwargs: Any,
) -> np.ndarray:
    r"""Compute delta features: a local estimate of the derivative of the
    input data along the selected axis, via Savitsky-Golay filtering.

    Parameters
    ----------
    data : np.ndarray
        The input data matrix (e.g., a spectrogram).
    width : int, positive, odd [scalar]
        Number of frames over which to compute the delta features.
        If ``mode='interp'``, ``width`` must not exceed ``data.shape[axis]``.
    order : int > 0 [scalar]
        The order of the difference operator:
        1 for first derivative, 2 for second, etc.
    axis : int [scalar]
        The axis along which to compute deltas.  Default is -1 (columns).
    mode : str, {'interp', 'nearest', 'mirror', 'constant', 'wrap'}
        Padding mode for estimating differences at the boundaries.
    **kwargs : additional keyword arguments
        Passed through to `scipy.signal.savgol_filter`.

    Returns
    -------
    delta_data : np.ndarray [shape=(..., t)]
        Delta matrix of ``data`` at the specified order.

    Raises
    ------
    ParameterError
        If ``width`` is even, less than 3, or too large for ``mode='interp'``;
        or if ``order`` is not a positive integer.

    Notes
    -----
    This function caches at level 40.

    See Also
    --------
    scipy.signal.savgol_filter
    """
    data = np.atleast_1d(data)

    if mode == "interp" and width > data.shape[axis]:
        raise ParameterError(
            f"when mode='interp', width={width} "
            f"cannot exceed data.shape[axis]={data.shape[axis]}"
        )

    if width < 3 or width % 2 == 0:
        raise ParameterError("width must be an odd integer >= 3")

    if order <= 0 or not isinstance(order, (int, np.integer)):
        raise ParameterError("order must be a positive integer")

    # The derivative order is controlled by ``order``; discard any
    # user-supplied ``deriv`` so it cannot conflict.
    kwargs.pop("deriv", None)
    kwargs.setdefault("polyorder", order)

    return scipy.signal.savgol_filter(
        data, width, deriv=order, axis=axis, mode=mode, **kwargs
    )
@cache(level=40)
def stack_memory(
    data: np.ndarray, *, n_steps: int = 2, delay: int = 1, **kwargs: Any
) -> np.ndarray:
    """Short-term history embedding: vertically concatenate a data vector or
    matrix with delayed copies of itself.

    Each column ``data[..., i]`` is mapped to::

        data[..., i] -> [data[..., i],
                         data[..., i - delay],
                         ...
                         data[..., i - (n_steps-1)*delay]]

    Columns with ``i < (n_steps - 1) * delay`` are padded; by default with
    zeros, configurable through keyword arguments forwarded to `np.pad`.

    Parameters
    ----------
    data : np.ndarray [shape=(..., d, t)]
        Input data matrix.  A vector input (``data.ndim == 1``) is treated
        as a row matrix of shape ``(1, t)``.
    n_steps : int > 0 [scalar]
        Embedding dimension: the number of steps back in time to stack.
    delay : int != 0 [scalar]
        The number of columns to step.  Positive values embed from the past
        (previous columns); negative values embed from the future.
    **kwargs : additional keyword arguments
        Additional arguments to pass to `numpy.pad`.

    Returns
    -------
    data_history : np.ndarray [shape=(..., m * d, t)]
        Data augmented with lagged copies of itself, where
        ``m == n_steps - 1``.

    Raises
    ------
    ParameterError
        If ``n_steps < 1``, ``delay == 0``, or ``data`` has no columns.

    Notes
    -----
    This function caches at level 40.
    """
    if n_steps < 1:
        raise ParameterError("n_steps must be a positive integer")

    if delay == 0:
        raise ParameterError("delay must be a non-zero integer")

    data = np.atleast_2d(data)
    t = data.shape[-1]
    if t < 1:
        raise ParameterError(
            "Cannot stack memory when input data has "
            f"no columns. Given data.shape={data.shape}"
        )

    kwargs.setdefault("mode", "constant")
    if kwargs["mode"] == "constant":
        kwargs.setdefault("constant_values", [0])

    # Pad the time axis: before it for positive delay (history rolls in
    # from the front), after it for negative delay (future).
    pad_amount = int((n_steps - 1) * abs(delay))
    padding = [(0, 0)] * data.ndim
    if delay > 0:
        padding[-1] = (pad_amount, 0)
    else:
        padding[-1] = (0, pad_amount)
    data = np.pad(data, padding, **kwargs)

    # Output layout: n_steps stacked copies along the feature axis,
    # original time extent along the last axis.
    out_shape = list(data.shape)
    out_shape[-2] *= n_steps
    out_shape[-1] = t

    # Allocate to match the input's layout and dtype, then fill in place
    history = np.empty_like(data, shape=tuple(out_shape))
    __stack(history, data, n_steps, delay)
    return history
@jit(nopython=True, cache=True)
def __stack(history, data, n_steps, delay):
    """Memory-stacking helper function.

    Fills ``history`` with ``n_steps`` lag-shifted copies of ``data``,
    stacked along the feature (second-to-last) axis.  ``data`` is expected
    to be pre-padded along the time axis by the caller (`stack_memory`),
    by ``(n_steps - 1) * |delay|`` columns on the appropriate side.

    Parameters
    ----------
    history : output array (2-dimensional)
        Destination of shape ``(..., n_steps * d, t)``; written in place.
    data : pre-padded input array (2-dimensional)
        Source of shape ``(..., d, t + (n_steps - 1) * |delay|)``.
    n_steps : int > 0, the number of steps to stack
    delay : int != 0, the amount of delay between steps
        Positive: embed from the past; negative: embed from the future.

    Returns
    -------
    None
        Output is stored directly in the history array
    """
    # Dimension of each copy of the data
    d = data.shape[-2]
    # Total number of time-steps to output
    t = history.shape[-1]
    if delay > 0:
        # Block 0 carries the largest lag; block n_steps-1 is the unshifted
        # data (q == 0 selects the last t columns of the padded input).
        for step in range(n_steps):
            q = n_steps - 1 - step
            # nth block is original shifted left by n*delay steps
            history[..., step * d : (step + 1) * d, :] = data[
                ..., q * delay : q * delay + t
            ]
    else:
        # Handle the last block separately to avoid -t:0 empty slices
        history[..., -d:, :] = data[..., -t:]
        for step in range(n_steps - 1):
            # nth block is original shifted right by n*delay steps
            # (q * delay is negative here, so the slice end is valid)
            q = n_steps - 1 - step
            history[..., step * d : (step + 1) * d, :] = data[
                ..., -t + q * delay : q * delay
            ]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,641 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Onset detection
===============
.. autosummary::
:toctree: generated/
onset_detect
onset_backtrack
onset_strength
onset_strength_multi
"""
import numpy as np
import scipy
from ._cache import cache
from . import core
from . import util
from .util.exceptions import ParameterError
from .feature.spectral import melspectrogram
from typing import Any, Callable, Optional, Union, Sequence
__all__ = ["onset_detect", "onset_strength", "onset_strength_multi", "onset_backtrack"]
def onset_detect(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    backtrack: bool = False,
    energy: Optional[np.ndarray] = None,
    units: str = "frames",
    normalize: bool = True,
    sparse: bool = True,
    **kwargs: Any,
) -> np.ndarray:
    """Locate note onset events by picking peaks in an onset strength envelope.

    The `peak_pick` parameters were chosen by large-scale hyper-parameter
    optimization over the dataset provided by [#]_.

    .. [#] https://github.com/CPJKU/onset_db

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        Audio time series.  Multi-channel is supported.
    sr : number > 0 [scalar]
        Sampling rate of ``y``.
    onset_envelope : np.ndarray [shape=(..., m)]
        Optional pre-computed onset strength envelope.
    hop_length : int > 0 [scalar]
        Hop length (in samples).
    backtrack : bool
        If `True`, detected onset events are backtracked to the nearest
        preceding minimum of ``energy``.  Useful when using onsets as
        slice points for segmentation.  Requires ``sparse=True``.
    energy : np.ndarray [shape=(m,)] (optional)
        Energy function used for backtracking detected onsets.
        Defaults to ``onset_envelope``.
    units : {'frames', 'samples', 'time'}
        The units in which to encode detected onset events.
    normalize : bool
        If `True` (default), scale the onset envelope to [0, 1] before
        detection, standardizing the `librosa.util.peak_pick` parameters.
    sparse : bool
        If `True` (default), return an array of frame/sample/time indices.
        If `False`, return a dense boolean array where ``onsets[..., n]``
        indicates an onset at frame ``n``.  Multi-channel input requires
        ``sparse=False``.
    **kwargs : additional keyword arguments
        Additional parameters for `librosa.util.peak_pick`.

    Returns
    -------
    onsets : np.ndarray [shape=(n_onsets,) or onset_envelope.shape]
        Estimated positions of detected onsets, in the requested units.
        If no onsets can be detected, an empty array (``sparse=True``)
        or an all-False array (``sparse=False``) is returned.

    Raises
    ------
    ParameterError
        If neither ``y`` nor ``onset_envelope`` is provided;
        if ``units`` is invalid; or if backtracking is requested
        with ``sparse=False``.

    See Also
    --------
    onset_strength : compute onset strength per-frame
    onset_backtrack : backtracking onset events
    librosa.util.peak_pick : pick peaks from a time series
    """
    # Compute the onset strength envelope if one was not supplied
    if onset_envelope is None:
        if y is None:
            raise ParameterError("y or onset_envelope must be provided")
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    if normalize:
        # Shift to be non-negative, then max-scale (with safe division)
        # over the trailing (time) axis
        onset_envelope = onset_envelope - np.min(onset_envelope, keepdims=True, axis=-1)
        onset_envelope /= np.max(onset_envelope, keepdims=True, axis=-1) + util.tiny(onset_envelope)  # type: ignore

    # help out mypy
    assert onset_envelope is not None

    if not onset_envelope.any() or not np.all(np.isfinite(onset_envelope)):
        # Degenerate envelope: nothing to detect
        if sparse:
            onsets = np.array([], dtype=int)
        else:
            onsets = np.zeros_like(onset_envelope, dtype=bool)
    else:
        # Peak-picking defaults found by large-scale search
        kwargs.setdefault("pre_max", 0.03 * sr // hop_length)  # 30ms
        kwargs.setdefault("post_max", 0.00 * sr // hop_length + 1)  # 0ms
        kwargs.setdefault("pre_avg", 0.10 * sr // hop_length)  # 100ms
        kwargs.setdefault("post_avg", 0.10 * sr // hop_length + 1)  # 100ms
        kwargs.setdefault("wait", 0.03 * sr // hop_length)  # 30ms
        kwargs.setdefault("delta", 0.07)
        onsets = util.peak_pick(onset_envelope, sparse=sparse, axis=-1, **kwargs)

    # Optionally backtrack the events to preceding energy minima
    if backtrack:
        if not sparse:
            raise ParameterError("onset backtracking is only supported if sparse=True")
        if energy is None:
            energy = onset_envelope
        assert energy is not None
        onsets = onset_backtrack(onsets, energy)

    # Convert frame indices to the requested units
    if sparse:
        if units == "samples":
            onsets = core.frames_to_samples(onsets, hop_length=hop_length)
        elif units == "time":
            onsets = core.frames_to_time(onsets, hop_length=hop_length, sr=sr)
        elif units != "frames":
            raise ParameterError(f"Invalid unit type: {units}")
    return onsets
def onset_strength(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    lag: int = 1,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    detrend: bool = False,
    center: bool = True,
    feature: Optional[Callable] = None,
    aggregate: Optional[Union[Callable, bool]] = None,
    **kwargs: Any,
) -> np.ndarray:
    """Compute a spectral flux onset strength envelope.

    The onset strength at time ``t`` is::

        mean_f max(0, S[f, t] - ref[f, t - lag])

    where ``ref`` is ``S`` after local max filtering along the frequency
    axis [#]_.

    By default, if a time series ``y`` is provided, ``S`` will be the
    log-power Mel spectrogram.

    .. [#] Böck, Sebastian, and Gerhard Widmer.
        "Maximum filter vibrato suppression for onset detection."
        16th International Conference on Digital Audio Effects,
        Maynooth, Ireland. 2013.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time-series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    S : np.ndarray [shape=(..., d, m)]
        pre-computed (log-power) spectrogram
    lag : int > 0
        time lag for computing differences
    max_size : int > 0
        size (in frequency bins) of the local max filter.
        set to `1` to disable filtering.
    ref : None or np.ndarray [shape=(..., d, m)]
        An optional pre-computed reference spectrum, of the same shape as ``S``.
        If not provided, it will be computed from ``S``.
        If provided, it will override any local max filtering governed by ``max_size``.
    detrend : bool [scalar]
        Filter the onset strength to remove the DC component
    center : bool [scalar]
        Shift the onset function by ``n_fft // (2 * hop_length)`` frames.
        This corresponds to using a centered frame analysis in the short-time
        Fourier transform.
    feature : function
        Function for computing time-series features, eg, scaled spectrograms.
        By default, uses `librosa.feature.melspectrogram` with ``fmax=sr/2``
    aggregate : function
        Aggregation function to use when combining onsets
        at different frequency bins.
        Default: `np.mean`
    **kwargs : additional keyword arguments
        Additional parameters to ``feature()``, if ``S`` is not provided.

    Returns
    -------
    onset_envelope : np.ndarray [shape=(..., m,)]
        vector containing the onset strength envelope.
        If the input contains multiple channels, then onset envelope is
        computed for each channel.

    Raises
    ------
    ParameterError
        if neither ``(y, sr)`` nor ``S`` are provided,
        or if ``lag`` or ``max_size`` are not positive integers

    See Also
    --------
    onset_detect
    onset_strength_multi

    Examples
    --------
    Construct a standard onset function

    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)

    Median aggregation, and custom mel options

    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr,
    ...                                          aggregate=np.median,
    ...                                          fmax=8000, n_mels=256)

    Constant-Q spectrogram instead of Mel

    >>> C = np.abs(librosa.cqt(y=y, sr=sr))
    >>> onset_env = librosa.onset.onset_strength(sr=sr,
    ...                                          S=librosa.amplitude_to_db(C, ref=np.max))
    """
    # The full-spectrum envelope must be reduced over frequency, so a
    # non-aggregating configuration is rejected up front.
    if aggregate is False:
        raise ParameterError(
            "aggregate parameter cannot be False when computing full-spectrum onset strength."
        )

    # Delegate to the multi-channel implementation using a single channel
    # that spans the entire frequency range (channels=None), then strip
    # the singleton channel axis from the result.
    envelope_per_channel = onset_strength_multi(
        y=y,
        sr=sr,
        S=S,
        lag=lag,
        max_size=max_size,
        ref=ref,
        detrend=detrend,
        center=center,
        feature=feature,
        aggregate=aggregate,
        channels=None,
        **kwargs,
    )
    return envelope_per_channel[..., 0, :]
def onset_backtrack(events: np.ndarray, energy: np.ndarray) -> np.ndarray:
    """Roll detected onset events back to the nearest preceding local
    minimum of an energy function.

    Detected onsets typically sit on a peak of the detection function;
    for segmentation/slicing it is often preferable to cut at the quiet
    point just before the peak, as described by [#]_.

    .. [#] Jehan, Tristan.
        "Creating music by listening"
        Doctoral dissertation
        Massachusetts Institute of Technology, 2005.

    Parameters
    ----------
    events : np.ndarray, dtype=int
        List of onset event frame indices, as computed by `onset_detect`
    energy : np.ndarray, shape=(m,)
        An energy function

    Returns
    -------
    events_backtracked : np.ndarray, shape=events.shape
        The input events matched to nearest preceding minima of ``energy``.

    Examples
    --------
    Backtrack the events using the onset envelope

    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> oenv = librosa.onset.onset_strength(y=y, sr=sr)
    >>> # Detect events without backtracking
    >>> onset_raw = librosa.onset.onset_detect(onset_envelope=oenv,
    ...                                        backtrack=False)
    >>> onset_bt = librosa.onset.onset_backtrack(onset_raw, oenv)

    Backtrack the events using the RMS values

    >>> S = np.abs(librosa.stft(y=y))
    >>> rms = librosa.feature.rms(S=S)
    >>> onset_bt_rms = librosa.onset.onset_backtrack(onset_raw, rms[0])
    """
    # An interior frame i (1 <= i <= m-2) is a local minimum when energy
    # has not increased into it and strictly increases out of it:
    #   energy[i] <= energy[i-1]  and  energy[i] < energy[i+1]
    interior = energy[1:-1]
    local_min_mask = (interior <= energy[:-2]) & (interior < energy[2:])

    # The interior slice is offset by one frame, so shift indices forward
    # by one; also include frame 0 so that every event has at least one
    # candidate minimum at or before it.
    candidate_minima = util.fix_frames(1 + np.flatnonzero(local_min_mask), x_min=0)

    # Match each event to the nearest candidate minimum at or to its left.
    matched = util.match_events(events, candidate_minima, right=False)
    backtracked: np.ndarray = candidate_minima[matched]
    return backtracked
@cache(level=30)
def onset_strength_multi(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: int = 512,
    lag: int = 1,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    detrend: bool = False,
    center: bool = True,
    feature: Optional[Callable] = None,
    aggregate: Optional[Union[Callable, bool]] = None,
    channels: Optional[Union[Sequence[int], Sequence[slice]]] = None,
    **kwargs: Any,
) -> np.ndarray:
    """Compute a spectral flux onset strength envelope across multiple channels.

    Onset strength for channel ``i`` at time ``t`` is determined by::

        mean_{f in channels[i]} max(0, S[f, t] - ref[f, t - lag])

    where ``ref`` is ``S`` after local max filtering along the frequency axis.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)]
        audio time-series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    S : np.ndarray [shape=(..., d, m)]
        pre-computed (log-power) spectrogram
    n_fft : int > 0 [scalar]
        FFT window size for use in ``feature()`` if ``S`` is not provided.
    hop_length : int > 0 [scalar]
        hop length for use in ``feature()`` if ``S`` is not provided.
    lag : int > 0
        time lag for computing differences
    max_size : int > 0
        size (in frequency bins) of the local max filter.
        set to `1` to disable filtering.
    ref : None or np.ndarray [shape=(d, m)]
        An optional pre-computed reference spectrum, of the same shape as ``S``.
        If not provided, it will be computed from ``S``.
        If provided, it will override any local max filtering governed by ``max_size``.
    detrend : bool [scalar]
        Filter the onset strength to remove the DC component
    center : bool [scalar]
        Shift the onset function by ``n_fft // (2 * hop_length)`` frames.
        This corresponds to using a centered frame analysis in the short-time
        Fourier transform.
    feature : function
        Function for computing time-series features, eg, scaled spectrograms.
        By default, uses `librosa.feature.melspectrogram` with ``fmax=sr/2``
        Must support arguments: ``y, sr, n_fft, hop_length``
    aggregate : function or False
        Aggregation function to use when combining onsets
        at different frequency bins.
        If ``False``, then no aggregation is performed.
        Default: `np.mean`
    channels : list or None
        Array of channel boundaries or slice objects.
        If `None`, then a single channel is generated to span all bands.
    **kwargs : additional keyword arguments
        Additional parameters to ``feature()``, if ``S`` is not provided.

    Returns
    -------
    onset_envelope : np.ndarray [shape=(..., n_channels, m)]
        array containing the onset strength envelope for each specified channel

    Raises
    ------
    ParameterError
        if neither ``(y, sr)`` nor ``S`` are provided

    See Also
    --------
    onset_strength

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Construct a standard onset function over four sub-bands

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
    >>> onset_subbands = librosa.onset.onset_strength_multi(y=y, sr=sr,
    ...                                                     channels=[0, 32, 64, 96, 128])
    """
    # Default feature extractor: log-power Mel spectrogram over [0, sr/2]
    if feature is None:
        feature = melspectrogram
        kwargs.setdefault("fmax", 0.5 * sr)
    # Default reduction across frequency bins within a channel
    if aggregate is None:
        aggregate = np.mean

    if not util.is_positive_int(lag):
        raise ParameterError(f"lag={lag} must be a positive integer")
    if not util.is_positive_int(max_size):
        raise ParameterError(f"max_size={max_size} must be a positive integer")

    # Compute the (dB-scaled) spectrogram if one was not supplied
    if S is None:
        S = np.abs(feature(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, **kwargs))
        S = core.power_to_db(S)
    # Narrow the Optional type for static checkers
    assert S is not None
    # Guarantee a (frequency, time) layout even for 1-d input
    S = np.atleast_2d(S)

    # Build the reference spectrum.  With max_size == 1 the max filter is a
    # no-op, so we alias S directly instead of filtering.
    if ref is None:
        if max_size == 1:
            ref = S
        else:
            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=-2)
    elif ref.shape != S.shape:
        raise ParameterError(
            f"Reference spectrum shape {ref.shape} must match input spectrum {S.shape}"
        )

    # Lagged difference against the reference; negative flux (decreasing
    # amplitude) is clipped to zero.
    onset_env = np.maximum(0.0, S[..., lag:] - ref[..., :-lag])

    # With explicit channels, util.sync should not pad partial groups.
    pad = channels is None
    if channels is None:
        channels = [slice(None)]
    if callable(aggregate):
        onset_env = util.sync(
            onset_env, channels, aggregate=aggregate, pad=pad, axis=-2
        )

    # Left-pad the time axis: `lag` frames lost to differencing, plus the
    # framing offset of a centered STFT if requested.
    pad_width = lag
    if center:
        pad_width += n_fft // (2 * hop_length)
    padding = [(0, 0)] * onset_env.ndim
    padding[-1] = (int(pad_width), 0)
    onset_env = np.pad(onset_env, padding, mode="constant")

    # Optionally remove the DC component with a leaky differentiator
    if detrend:
        onset_env = scipy.signal.lfilter([1.0, -1.0], [1.0, -0.99], onset_env, axis=-1)

    # Trim back to the input duration after center-padding
    if center:
        onset_env = onset_env[..., : S.shape[-1]]
    return onset_env

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Skip pydocstyle checks that erroneously trigger on "example"
# noqa: D405,D214,D407
"""
Utilities
=========
Array operations
----------------
.. autosummary::
:toctree: generated/
frame
pad_center
expand_to
fix_length
fix_frames
index_to_slice
softmask
stack
sync
axis_sort
normalize
shear
sparsify_rows
buf_to_float
tiny
Matching
--------
.. autosummary::
:toctree: generated/
match_intervals
match_events
Miscellaneous
-------------
.. autosummary::
:toctree: generated/
localmax
localmin
peak_pick
nnls
cyclic_gradient
dtype_c2r
dtype_r2c
count_unique
is_unique
abs2
phasor
Input validation
----------------
.. autosummary::
:toctree: generated/
valid_audio
valid_int
valid_intervals
is_positive_int
File operations
---------------
.. autosummary::
:toctree: generated/
example
example_info
list_examples
find_files
cite
"""
import lazy_loader as lazy
# Populate this package's namespace lazily: `attach_stub` presumably derives
# the public names from the adjacent .pyi stub file and defers the actual
# submodule imports until first attribute access — confirm against the
# lazy_loader documentation.
__getattr__, __dir__, __all__ = lazy.attach_stub(__name__, __file__)

View File

@@ -0,0 +1,59 @@
from . import decorators
from . import exceptions
from .files import (
find_files as find_files,
example as example,
ex as ex,
list_examples as list_examples,
example_info as example_info,
cite as cite,
)
from .matching import (
match_intervals as match_intervals,
match_events as match_events,
)
from .deprecation import (
Deprecated as Deprecated,
rename_kw as rename_kw,
)
from ._nnls import (
nnls as nnls,
)
from .utils import (
MAX_MEM_BLOCK as MAX_MEM_BLOCK,
frame as frame,
pad_center as pad_center,
expand_to as expand_to,
fix_length as fix_length,
valid_audio as valid_audio,
valid_int as valid_int,
is_positive_int as is_positive_int,
valid_intervals as valid_intervals,
fix_frames as fix_frames,
axis_sort as axis_sort,
localmax as localmax,
localmin as localmin,
normalize as normalize,
peak_pick as peak_pick,
sparsify_rows as sparsify_rows,
shear as shear,
stack as stack,
fill_off_diagonal as fill_off_diagonal,
index_to_slice as index_to_slice,
sync as sync,
softmask as softmask,
buf_to_float as buf_to_float,
tiny as tiny,
cyclic_gradient as cyclic_gradient,
dtype_r2c as dtype_r2c,
dtype_c2r as dtype_c2r,
count_unique as count_unique,
is_unique as is_unique,
abs2 as abs2,
phasor as phasor,
)

Some files were not shown because too many files have changed in this diff Show More