Violin_midi_pro / musc /representations.py
Hygee's picture
Upload 9 files
e72f2a9 verified
from mir_eval import melody
import numpy as np
from scipy.stats import norm
import librosa
import pretty_midi
from scipy.ndimage import gaussian_filter1d
class PerformanceLabel:
    """
    The dataset labeling class for performance representations. Currently includes onset, note, and fine-grained f0
    representations. The note_min, note_max, and f0_bins_per_semitone values are to be arranged per instrument. The
    default values are for violin performance analysis. Fretted instruments might not require such f0 resolutions
    per semitone.

    Conventions shared by the fill_* methods:
      * `window` holds integer frame indices; a frame index divided by `feature_rate` is a time in seconds.
        The methods index rows by `frame - window[0]`, i.e. they treat the window as consecutive frames.
      * Time/value matrices are Nx2 arrays with time in seconds in column 0.
      * A time is mapped to its *nearest* frame. Truncation is avoided on purpose because products such as
        0.29 * 100 evaluate to 28.999999999999996 and would otherwise land one frame early.
    """

    def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None,
                 onset_smooth_std=0.7, f0_tolerance_c=200):
        """
        :param note_min: lowest note name of the semitone axis (inclusive)
        :param note_max: upper note name of the semitone axis (exclusive, since np.arange excludes its stop)
        :param f0_bins_per_semitone: number of f0 bins per semitone (100 cents)
        :param f0_smooth_std_c: std of the gaussian blur of f0 labels in cents. Falsy values (None or 0) select
            5/4 of the bin width
        :param onset_smooth_std: std of the gaussian blur of onset labels along time, in frames
        :param f0_tolerance_c: half width of the f0 transition matrix, in cents
        """
        midi_min = pretty_midi.note_name_to_number(note_min)
        midi_max = pretty_midi.note_name_to_number(note_max)
        self.midi_centers = np.arange(midi_min, midi_max)
        self.onset_smooth_std = onset_smooth_std  # onset smoothing along time axis (compensate for alignment)

        f0_hz_range = librosa.note_to_hz([note_min, note_max])
        f0_c_min, f0_c_max = melody.hz2cents(f0_hz_range)
        self.f0_granularity_c = 100 / f0_bins_per_semitone
        if not f0_smooth_std_c:
            # Keep the ratio from the CREPE paper (20 cents and 25 cents)
            f0_smooth_std_c = self.f0_granularity_c * 5 / 4
        self.f0_smooth_std_c = f0_smooth_std_c
        self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c)
        self.f0_centers_hz = self.f0_c2hz(self.f0_centers_c)
        self.f0_n_bins = len(self.f0_centers_c)
        self.pdf_normalizer = norm.pdf(0)  # peak of the standard normal pdf, used to scale label peaks to 1
        self.f0_hz2c = melody.hz2cents
        self.midi_centers_c = self.f0_hz2c(librosa.midi_to_hz(self.midi_centers))
        self.f0_tolerance_bins = int(f0_tolerance_c / self.f0_granularity_c)
        # Identity matrix blurred with a gaussian whose std is 25 cents expressed in bins.
        self.f0_transition_matrix = gaussian_filter1d(np.eye(2 * self.f0_tolerance_bins + 1),
                                                      25 / self.f0_granularity_c)

    @staticmethod
    def f0_c2hz(c):
        """
        Convert cents to Hz with a 10 Hz reference (presumably the inverse of self.f0_hz2c, assuming
        mir_eval's default base frequency). Defined as a method rather than a lambda attribute so that
        instances stay picklable.
        :param c: cents, scalar or np.ndarray
        :return: frequency in Hz, same shape as the input
        """
        return 10 * 2 ** (c / 1200)

    def f0_c2label(self, pitch_c):
        """
        Convert a single f0 value in cents to a one-hot label vector with smoothing, i.e., create a gaussian blur
        around the target f0 bin for regularization and training stability. The blur is controlled by
        self.f0_smooth_std_c and the peak value is 1 when pitch_c falls exactly on a bin center.
        :param pitch_c: a single pitch value in cents
        :return: float32 label vector over self.f0_centers_c with frequency blur
        """
        distance = (self.f0_centers_c - pitch_c) / self.f0_smooth_std_c
        result = norm.pdf(distance).astype(np.float32)
        result /= self.pdf_normalizer
        return result

    def f0_label2c(self, salience, center=None):
        """
        Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame!
        The estimate is the salience-weighted average of the bin centers in a 9-bin neighbourhood of the center.
        :param salience: f0 activations, either 1d (bins) or 2d (frames x bins)
        :param center: f0 center bin to calculate the weighted average. Use argmax if empty. Only used for 1d
            input; 2d input always uses the per-frame argmax
        :return: f0 in cents (a scalar for 1d input, an array with one value per frame for 2d input)
        :raises ValueError: if salience is neither 1d nor 2d
        """
        if salience.ndim == 1:
            if center is None:
                center = int(np.argmax(salience))
            start = max(0, center - 4)
            end = min(len(salience), center + 5)
            local_salience = salience[start:end]
            product_sum = np.sum(local_salience * self.f0_centers_c[start:end])
            weight_sum = np.sum(local_salience)
            # the clip avoids a division by zero on all-zero activations
            return product_sum / np.clip(weight_sum, 1e-8, None)
        if salience.ndim == 2:
            return np.array([self.f0_label2c(frame) for frame in salience])
        raise ValueError("label should be either 1d or 2d ndarray")

    def fill_onset_matrix(self, onsets, window, feature_rate):
        """
        Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time)
        so that we can tolerate better the alignment problems. This is similar to the frequency smoothing for the f0.
        The temporal smoothing is controlled by the parameter self.onset_smooth_std
        :param onsets: A 2d np.array of individual note onsets with their respective time values
            (Nx2: time in seconds - midi number)
        :param window: Timestamps for the frame centers of the sparse matrix
        :param feature_rate: Window timestamps are integer, this is to convert them to seconds
        :return: onset_roll: A sparse matrix filled with temporally blurred onsets.
                 onsets: the onsets (time, midi number) that fall inside the window
        """
        onsets = self.get_window_feats(onsets, window, feature_rate)
        n_frames = len(window)
        onset_roll = np.zeros((n_frames, len(self.midi_centers)))
        lowest, highest = self.midi_centers[0], self.midi_centers[-1]
        for onset_time, pitch in onsets:  # each row is a pair with time and midi note
            # Both ends of the range are valid columns (same bounds as fill_note_matrix).
            if not lowest <= pitch <= highest:
                continue
            column = int(pitch) - lowest  # the note index in our range
            position = onset_time * feature_rate - window[0]  # onset index (float, in frames, not in seconds!)
            nearest = int(np.round(position))
            start = max(0, nearest - 3)
            end = min(n_frames - 1, nearest + 3)
            try:
                # Sample the gaussian at the frame centers, measured relative to the true (float) position.
                # If you increase the std you smooth the peak; if you decrease it, e.g., 0.1, it becomes too
                # peaky! Around 0.5-0.7 seems ok.
                vals = norm.pdf(np.linspace(start - position, end - position, end - start + 1)
                                / self.onset_smooth_std)
            except ValueError:
                # linspace rejects a negative sample count (end < start); report and skip this onset
                print('start', start, 'onset', position, 'end', end)
                continue
            onset_roll[start:end + 1, column] += vals / self.pdf_normalizer
        return onset_roll, onsets

    def fill_note_matrix(self, notes, window, feature_rate):
        """
        Create the note matrix (piano roll) from window timestamps and note values per frame.
        :param notes: A 2d np.array of individual notes with their active time values Nx2
            (time in seconds - midi number)
        :param window: Timestamps for the frame centers of the output
        :param feature_rate: Window timestamps are integer, this is to convert them to seconds
        :return note_roll: The piano roll in the defined range of [note_min, note_max).
                notes: the note indices (columns of note_roll) of the entries that were written
        """
        notes = self.get_window_feats(notes, window, feature_rate)
        n_frames = len(window)
        lowest, highest = self.midi_centers[0], self.midi_centers[-1]
        # take the notes in the midi range defined
        in_range = np.logical_and(notes[:, 1] >= lowest, notes[:, 1] <= highest)
        notes = notes[in_range, :]
        # Frame index of every entry. Round instead of truncating: time * feature_rate often evaluates to
        # just below the intended integer. The clip only guards against a x.5 rounding artifact at the border.
        times = np.round(notes[:, 0] * feature_rate - window[0]).astype(int)
        times = np.clip(times, 0, max(n_frames - 1, 0))
        notes = (notes[:, 1] - lowest).astype(int)
        note_roll = np.zeros((n_frames, len(self.midi_centers)))
        note_roll[(times, notes)] = 1
        return note_roll, notes

    def fill_f0_matrix(self, f0s, window, feature_rate):
        """
        Unlike the labels for onsets and notes, f0 label is only relevant for strictly monophonic regions! Thus, this
        function returns a boolean which represents where to apply the given values.
        Never back-propagate without the boolean! Empty frames mean that the label is not that reliable.
        :param f0s: A 2d np.array of f0 values with the time they belong to (Nx2: time in seconds - f0 in Hz)
        :param window: Timestamps for the frame centers of the output
        :param feature_rate: Window timestamps are integer, this is to convert them to seconds
        :return f0_roll: f0 label matrix (frames x f0 bins)
                f0_cents: f0 values in cents per frame (0 where there is no annotation)
                annotation_bool: A boolean array representing which frames have reliable f0 annotations.
        """
        f0s = self.get_window_feats(f0s, window, feature_rate)
        n_frames = len(window)
        f0_roll = np.zeros((n_frames, len(self.f0_centers_c)))
        f0_cents = np.zeros_like(window, dtype=float)
        annotation_bool = np.zeros_like(window, dtype=bool)
        if len(f0s) == 0:
            return f0_roll, f0_cents, annotation_bool

        pitches_c = np.asarray(self.f0_hz2c(f0s[:, 1]), dtype=float)  # convert f0 in hz to cents
        positions = f0s[:, 0] * feature_rate - window[0]  # float frame positions
        nearest = np.round(positions)
        # Only consider it as annotation if the f0 value is really close to a frame center. The distance is
        # measured on both sides of the center, so values slightly *before* a frame are kept as well.
        is_close = np.abs(positions - nearest) < 0.25
        frames = np.clip(nearest.astype(int), 0, max(n_frames - 1, 0))
        for frame, pitch_c in zip(frames[is_close], pitches_c[is_close]):
            # later annotations overwrite earlier ones that map to the same frame
            f0_roll[frame] = self.f0_c2label(pitch_c)
            annotation_bool[frame] = True
            f0_cents[frame] = pitch_c
        return f0_roll, f0_cents, annotation_bool

    @staticmethod
    def get_window_feats(time_feature_matrix, window, feature_rate):
        """
        Restrict the feature matrix to the features that are inside the window, i.e. strictly within half a frame
        of the first and the last window frame.
        :param window: Timestamps for the frame centers of the output
        :param time_feature_matrix: A 2d array of Nx2 per the entire file.
        :param feature_rate: Window timestamps are integer, this is to convert them to seconds
        :return: window_features: the features inside the given window (a copy, safe to modify)
        """
        time_feature_matrix = np.asarray(time_feature_matrix)
        times = time_feature_matrix[:, 0]
        lower_bound = (window[0] - 0.5) / feature_rate
        upper_bound = (window[-1] + 0.5) / feature_rate
        inside = np.logical_and(times > lower_bound, times < upper_bound)
        return np.array(time_feature_matrix[inside, :])

    def represent_midi(self, midi, feature_rate):
        """
        Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included.
        The window spans from the first to the last annotated frame (both inclusive) over all instruments.
        :param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object)
        :param feature_rate: The feature rate in Hz
        :return: dict {midi, note, onset, offset, time, start_anchor, end_anchor}: the rolls have the same format
            with the model's learning and outputs, `time` holds the frame times in seconds, and the anchors are
            the (time, pitch) rows of the earliest onset(s) and the latest offset(s)
        :raises ValueError: if the midi content has no notes
        """
        import os  # local import: only needed to recognise path-like arguments

        if isinstance(midi, (str, os.PathLike)):
            midi = pretty_midi.PrettyMIDI(os.fspath(midi))

        onsets, offsets, frames = [], [], []
        for instrument in midi.instruments:
            for note in instrument.notes:
                start = int(np.round(note.start * feature_rate))
                end = int(np.round(note.end * feature_rate))
                # one row per active frame, the end frame included (hence the + 0.5)
                note_times = (np.arange(start, end + 0.5) / feature_rate)[:, np.newaxis]
                note_pitch = np.full_like(note_times, fill_value=note.pitch)
                onsets.append([note.start, note.pitch])
                offsets.append([note.end, note.pitch])
                frames.append(np.hstack([note_times, note_pitch]))
        if not onsets:
            raise ValueError("the midi content has no notes to represent")
        onset_array = np.vstack(onsets)
        offset_array = np.vstack(offsets)
        frame_array = np.vstack(frames)

        # Use the extreme frame times rather than the first/last appended note (notes of several instruments are
        # not globally ordered), and round to integer frames before building the window so that its length does
        # not depend on floating point error.
        first_frame = int(np.round(frame_array[:, 0].min() * feature_rate))
        last_frame = int(np.round(frame_array[:, 0].max() * feature_rate))
        window = np.arange(first_frame, last_frame + 1, dtype=int)

        onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate)
        offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate)
        note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate)

        start_anchor = onset_array[onset_array[:, 0] == np.min(onset_array[:, 0])]
        end_anchor = offset_array[offset_array[:, 0] == np.max(offset_array[:, 0])]
        return {
            'midi': midi,
            'note': note_roll,
            'onset': onset_roll,
            'offset': offset_roll,
            'time': window / feature_rate,
            'start_anchor': start_anchor,
            'end_anchor': end_anchor
        }