Spaces:
Running
Running
| from mir_eval import melody | |
| import numpy as np | |
| from scipy.stats import norm | |
| import librosa | |
| import pretty_midi | |
| from scipy.ndimage import gaussian_filter1d | |
class PerformanceLabel:
    """
    The dataset labeling class for performance representations. Currently, includes onset, note, and fine-grained f0
    representations. Note min, note max, and f0_bin_per_semitone values are to be arranged per instrument. The default
    values are for violin performance analysis. Fretted instruments might not require such f0 resolutions per semitone.
    """
| def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None, | |
| onset_smooth_std=0.7, f0_tolerance_c=200): | |
| midi_min = pretty_midi.note_name_to_number(note_min) | |
| midi_max = pretty_midi.note_name_to_number(note_max) | |
| self.midi_centers = np.arange(midi_min, midi_max) | |
| self.onset_smooth_std=onset_smooth_std # onset smoothing along time axis (compensate for alignment) | |
| f0_hz_range = librosa.note_to_hz([note_min, note_max]) | |
| f0_c_min, f0_c_max = melody.hz2cents(f0_hz_range) | |
| self.f0_granularity_c = 100/f0_bins_per_semitone | |
| if not f0_smooth_std_c: | |
| f0_smooth_std_c = self.f0_granularity_c * 5/4 # Keep the ratio from the CREPE paper (20 cents and 25 cents) | |
| self.f0_smooth_std_c = f0_smooth_std_c | |
| self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c) | |
| self.f0_centers_hz = 10 * 2 ** (self.f0_centers_c / 1200) | |
| self.f0_n_bins = len(self.f0_centers_c) | |
| self.pdf_normalizer = norm.pdf(0) | |
| self.f0_c2hz = lambda c: 10*2**(c/1200) | |
| self.f0_hz2c = melody.hz2cents | |
| self.midi_centers_c = self.f0_hz2c(librosa.midi_to_hz(self.midi_centers)) | |
| self.f0_tolerance_bins = int(f0_tolerance_c/self.f0_granularity_c) | |
| self.f0_transition_matrix = gaussian_filter1d(np.eye(2*self.f0_tolerance_bins + 1), 25/self.f0_granularity_c) | |
| def f0_c2label(self, pitch_c): | |
| """ | |
| Convert a single f0 value in cents to a one-hot label vector with smoothing (i.e., create a gaussian blur around | |
| the target f0 bin for regularization and training stability. The blur is controlled by self.f0_smooth_std_c | |
| :param pitch_c: a single pitch value in cents | |
| :return: one-hot label vector with frequency blur | |
| """ | |
| result = norm.pdf((self.f0_centers_c - pitch_c) / self.f0_smooth_std_c).astype(np.float32) | |
| result /= self.pdf_normalizer | |
| return result | |
| def f0_label2c(self, salience, center=None): | |
| """ | |
| Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame! | |
| :param salience: f0 activations | |
| :param center: f0 center bin to calculate the weighted average. Use argmax if empty | |
| :return: f0 array per frame (in cents). | |
| """ | |
| if salience.ndim == 1: | |
| if center is None: | |
| center = int(np.argmax(salience)) | |
| start = max(0, center - 4) | |
| end = min(len(salience), center + 5) | |
| salience = salience[start:end] | |
| product_sum = np.sum(salience * self.f0_centers_c[start:end]) | |
| weight_sum = np.sum(salience) | |
| return product_sum / np.clip(weight_sum, 1e-8, None) | |
| if salience.ndim == 2: | |
| return np.array([self.f0_label2c(salience[i, :]) for i in range(salience.shape[0])]) | |
| raise Exception("label should be either 1d or 2d ndarray") | |
| def fill_onset_matrix(self, onsets, window, feature_rate): | |
| """ | |
| Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time) | |
| so that we can tolerate better the alignment problems. This is similar to the frequency smoothing for the f0. | |
| The temporal smoothing is controlled by the parameter self.onset_smooth_std | |
| :param onsets: A 2d np.array of individual note onsets with their respective time values | |
| (Nx2: time in seconds - midi number) | |
| :param window: Timestamps for the frame centers of the sparse matrix | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return: onset_roll: A sparse matrix filled with temporally blurred onsets. | |
| """ | |
| onsets = self.get_window_feats(onsets, window, feature_rate) | |
| onset_roll = np.zeros((len(window), len(self.midi_centers))) | |
| for onset in onsets: | |
| onset, note = onset # it was a pair with time and midi note | |
| if self.midi_centers[0] < note < self.midi_centers[-1]: # midi note should be in the range defined | |
| note = int(note) - self.midi_centers[0] # find the note index in our range | |
| onset = (onset*feature_rate)-window[0] # onset index (as float but in frames, not in seconds!) | |
| start = max(0, int(onset) - 3) | |
| end = min(len(window) - 1, int(onset) + 3) | |
| try: | |
| vals = norm.pdf(np.linspace(start - onset, end - onset, end - start + 1) / self.onset_smooth_std) | |
| # if you increase 0.7 you smooth the peak | |
| # if you decrease it, e.g., 0.1, it becomes too peaky! around 0.5-0.7 seems ok | |
| vals /= self.pdf_normalizer | |
| onset_roll[start:end + 1, note] += vals | |
| except ValueError: | |
| print('start',start, 'onset', onset, 'end', end) | |
| return onset_roll, onsets | |
| def fill_note_matrix(self, notes, window, feature_rate): | |
| """ | |
| Create the note matrix (piano roll) from window timestamps and note values per frame. | |
| :param notes: A 2d np.array of individual notes with their active time values Nx2 | |
| :param window: Timestamps for the frame centers of the output | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return note_roll: The piano roll in the defined range of [note_min, note_max). | |
| """ | |
| notes = self.get_window_feats(notes, window, feature_rate) | |
| # take the notes in the midi range defined | |
| notes = notes[np.logical_and(notes[:,1]>=self.midi_centers[0], notes[:,1]<=self.midi_centers[-1]),:] | |
| times = (notes[:,0]*feature_rate - window[0]).astype(int) # in feature samples (fs:self.hop/self.sr) | |
| notes = (notes[:,1] - self.midi_centers[0]).astype(int) | |
| note_roll = np.zeros((len(window), len(self.midi_centers))) | |
| note_roll[(times, notes)] = 1 | |
| return note_roll, notes | |
| def fill_f0_matrix(self, f0s, window, feature_rate): | |
| """ | |
| Unlike the labels for onsets and notes, f0 label is only relevant for strictly monophonic regions! Thus, this | |
| function returns a boolean which represents where to apply the given values. | |
| Never back-propagate without the boolean! Empty frames mean that the label is not that reliable. | |
| :param f0s: A 2d np.array of f0 values with the time they belong to (2xN: time in seconds - f0 in Hz) | |
| :param window: Timestamps for the frame centers of the output | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return f0_roll: f0 label matrix and | |
| f0_hz: f0 values in Hz | |
| annotation_bool: A boolean array representing which frames have reliable f0 annotations. | |
| """ | |
| f0s = self.get_window_feats(f0s, window, feature_rate) | |
| f0_cents = np.zeros_like(window, dtype=float) | |
| f0s[:,1] = self.f0_hz2c(f0s[:,1]) # convert f0 in hz to cents | |
| annotation_bool = np.zeros_like(window, dtype=bool) | |
| f0_roll = np.zeros((len(window), len(self.f0_centers_c))) | |
| times_in_frame = f0s[:, 0]*feature_rate - window[0] | |
| for t, f0 in enumerate(f0s): | |
| t = times_in_frame[t] | |
| if t%1 < 0.25: # only consider it as annotation if the f0 values is really close to the frame center | |
| t = int(np.round(t)) | |
| f0_roll[t] = self.f0_c2label(f0[1]) | |
| annotation_bool[t] = True | |
| f0_cents[t] = f0[1] | |
| return f0_roll, f0_cents, annotation_bool | |
| def get_window_feats(time_feature_matrix, window, feature_rate): | |
| """ | |
| Restrict the feature matrix to the features that are inside the window | |
| :param window: Timestamps for the frame centers of the output | |
| :param time_feature_matrix: A 2d array of Nx2 per the entire file. | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return: window_features: the features inside the given window | |
| """ | |
| start = time_feature_matrix[:,0]>(window[0]-0.5)/feature_rate | |
| end = time_feature_matrix[:,0]<(window[-1]+0.5)/feature_rate | |
| window_features = np.logical_and(start, end) | |
| window_features = np.array(time_feature_matrix[window_features,:]) | |
| return window_features | |
| def represent_midi(self, midi, feature_rate): | |
| """ | |
| Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included. | |
| :param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object) | |
| :param feature_rate: The feature rate in Hz | |
| :return: dict {onset, offset, note, time}: Same format with the model's learning and outputs | |
| """ | |
| def _get_onsets_offsets_frames(midi_content): | |
| if isinstance(midi_content, str): | |
| midi_content = pretty_midi.PrettyMIDI(midi_content) | |
| onsets = [] | |
| offsets = [] | |
| frames = [] | |
| for instrument in midi_content.instruments: | |
| for note in instrument.notes: | |
| start = int(np.round(note.start * feature_rate)) | |
| end = int(np.round(note.end * feature_rate)) | |
| note_times = (np.arange(start, end+0.5)/feature_rate)[:, np.newaxis] | |
| note_pitch = np.full_like(note_times, fill_value=note.pitch) | |
| onsets.append([note.start, note.pitch]) | |
| offsets.append([note.end, note.pitch]) | |
| frames.append(np.hstack([note_times, note_pitch])) | |
| onsets = np.vstack(onsets) | |
| offsets = np.vstack(offsets) | |
| frames = np.vstack(frames) | |
| return onsets, offsets, frames, midi_content | |
| onset_array, offset_array, frame_array, midi_object = _get_onsets_offsets_frames(midi) | |
| window = np.arange(frame_array[0, 0]*feature_rate, frame_array[-1, 0]*feature_rate, dtype=int) | |
| onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate) | |
| offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate) | |
| note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate) | |
| start_anchor = onset_array[onset_array[:, 0]==np.min(onset_array[:, 0])] | |
| end_anchor = offset_array[offset_array[:, 0]==np.max(offset_array[:, 0])] | |
| return { | |
| 'midi': midi_object, | |
| 'note': note_roll, | |
| 'onset': onset_roll, | |
| 'offset': offset_roll, | |
| 'time': window/feature_rate, | |
| 'start_anchor': start_anchor, | |
| 'end_anchor': end_anchor | |
| } | |