Spaces:
Running
Running
| from mir_eval import melody | |
| import numpy as np | |
| from scipy.stats import norm | |
| import librosa | |
| import pretty_midi | |
| from scipy.ndimage import gaussian_filter1d | |
class PerformanceLabel:
    """
    The dataset labeling class for performance representations. Currently, includes onset, note, and fine-grained f0
    representations. Note min, note max, and f0_bin_per_semitone values are to be arranged per instrument. The default
    values are for violin performance analysis. Fretted instruments might not require such f0 resolutions per semitone.
    """
| def __init__(self, note_min='F#3', note_max='C8', f0_bins_per_semitone=9, f0_smooth_std_c=None, | |
| onset_smooth_std=0.7, f0_tolerance_c=200): | |
| midi_min = pretty_midi.note_name_to_number(note_min) | |
| midi_max = pretty_midi.note_name_to_number(note_max) | |
| self.midi_centers = np.arange(midi_min, midi_max) | |
| self.onset_smooth_std=onset_smooth_std # onset smoothing along time axis (compensate for alignment) | |
| f0_hz_range = librosa.note_to_hz([note_min, note_max]) | |
| f0_c_min, f0_c_max = melody.hz2cents(f0_hz_range) | |
| self.f0_granularity_c = 100/f0_bins_per_semitone | |
| if not f0_smooth_std_c: | |
| f0_smooth_std_c = self.f0_granularity_c * 5/4 # Keep the ratio from the CREPE paper (20 cents and 25 cents) | |
| self.f0_smooth_std_c = f0_smooth_std_c | |
| self.f0_centers_c = np.arange(f0_c_min, f0_c_max, self.f0_granularity_c) | |
| self.f0_centers_hz = 10 * 2 ** (self.f0_centers_c / 1200) | |
| self.f0_n_bins = len(self.f0_centers_c) | |
| self.pdf_normalizer = norm.pdf(0) | |
| self.f0_c2hz = lambda c: 10*2**(c/1200) | |
| self.f0_hz2c = melody.hz2cents | |
| self.midi_centers_c = self.f0_hz2c(librosa.midi_to_hz(self.midi_centers)) | |
| self.f0_tolerance_bins = int(f0_tolerance_c/self.f0_granularity_c) | |
| self.f0_transition_matrix = gaussian_filter1d(np.eye(2*self.f0_tolerance_bins + 1), 25/self.f0_granularity_c) | |
| def f0_c2label(self, pitch_c): | |
| """ | |
| Convert a single f0 value in cents to a one-hot label vector with smoothing (i.e., create a gaussian blur around | |
| the target f0 bin for regularization and training stability. The blur is controlled by self.f0_smooth_std_c | |
| :param pitch_c: a single pitch value in cents | |
| :return: one-hot label vector with frequency blur | |
| """ | |
| result = norm.pdf((self.f0_centers_c - pitch_c) / self.f0_smooth_std_c).astype(np.float32) | |
| result /= self.pdf_normalizer | |
| return result | |
| def f0_label2c(self, salience, center=None): | |
| """ | |
| Convert the salience predictions to monophonic f0 in cents. Only outputs a single f0 value per frame! | |
| :param salience: f0 activations | |
| :param center: f0 center bin to calculate the weighted average. Use argmax if empty | |
| :return: f0 array per frame (in cents). | |
| """ | |
| if salience.ndim == 1: | |
| if center is None: | |
| center = int(np.argmax(salience)) | |
| start = max(0, center - 4) | |
| end = min(len(salience), center + 5) | |
| salience = salience[start:end] | |
| product_sum = np.sum(salience * self.f0_centers_c[start:end]) | |
| weight_sum = np.sum(salience) | |
| return product_sum / np.clip(weight_sum, 1e-8, None) | |
| if salience.ndim == 2: | |
| return np.array([self.f0_label2c(salience[i, :]) for i in range(salience.shape[0])]) | |
| raise Exception("label should be either 1d or 2d ndarray") | |
| def fill_onset_matrix(self, onsets, window, feature_rate): | |
| """ | |
| Create a sparse onset matrix from window and onsets (per-semitone). Apply a gaussian smoothing (along time) | |
| so that we can tolerate better the alignment problems. This is similar to the frequency smoothing for the f0. | |
| The temporal smoothing is controlled by the parameter self.onset_smooth_std | |
| :param onsets: A 2d np.array of individual note onsets with their respective time values | |
| (Nx2: time in seconds - midi number) | |
| :param window: Timestamps for the frame centers of the sparse matrix | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return: onset_roll: A sparse matrix filled with temporally blurred onsets. | |
| """ | |
| onsets = self.get_window_feats(onsets, window, feature_rate) | |
| onset_roll = np.zeros((len(window), len(self.midi_centers))) | |
| for onset in onsets: | |
| onset, note = onset # it was a pair with time and midi note | |
| if self.midi_centers[0] < note < self.midi_centers[-1]: # midi note should be in the range defined | |
| note = int(note) - self.midi_centers[0] # find the note index in our range | |
| onset = (onset*feature_rate)-window[0] # onset index (as float but in frames, not in seconds!) | |
| start = max(0, int(onset) - 3) | |
| end = min(len(window) - 1, int(onset) + 3) | |
| try: | |
| vals = norm.pdf(np.linspace(start - onset, end - onset, end - start + 1) / self.onset_smooth_std) | |
| # if you increase 0.7 you smooth the peak | |
| # if you decrease it, e.g., 0.1, it becomes too peaky! around 0.5-0.7 seems ok | |
| vals /= self.pdf_normalizer | |
| onset_roll[start:end + 1, note] += vals | |
| except ValueError: | |
| print('start',start, 'onset', onset, 'end', end) | |
| return onset_roll, onsets | |
| def fill_note_matrix(self, notes, window, feature_rate): | |
| """ | |
| Create the note matrix (piano roll) from window timestamps and note values per frame. | |
| :param notes: A 2d np.array of individual notes with their active time values Nx2 | |
| :param window: Timestamps for the frame centers of the output | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return note_roll: The piano roll in the defined range of [note_min, note_max). | |
| """ | |
| notes = self.get_window_feats(notes, window, feature_rate) | |
| # take the notes in the midi range defined | |
| notes = notes[np.logical_and(notes[:,1]>=self.midi_centers[0], notes[:,1]<=self.midi_centers[-1]),:] | |
| times = (notes[:,0]*feature_rate - window[0]).astype(int) # in feature samples (fs:self.hop/self.sr) | |
| notes = (notes[:,1] - self.midi_centers[0]).astype(int) | |
| note_roll = np.zeros((len(window), len(self.midi_centers))) | |
| note_roll[(times, notes)] = 1 | |
| return note_roll, notes | |
| def fill_f0_matrix(self, f0s, window, feature_rate): | |
| """ | |
| Unlike the labels for onsets and notes, f0 label is only relevant for strictly monophonic regions! Thus, this | |
| function returns a boolean which represents where to apply the given values. | |
| Never back-propagate without the boolean! Empty frames mean that the label is not that reliable. | |
| :param f0s: A 2d np.array of f0 values with the time they belong to (2xN: time in seconds - f0 in Hz) | |
| :param window: Timestamps for the frame centers of the output | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return f0_roll: f0 label matrix and | |
| f0_hz: f0 values in Hz | |
| annotation_bool: A boolean array representing which frames have reliable f0 annotations. | |
| """ | |
| f0s = self.get_window_feats(f0s, window, feature_rate) | |
| f0_cents = np.zeros_like(window, dtype=float) | |
| f0s[:,1] = self.f0_hz2c(f0s[:,1]) # convert f0 in hz to cents | |
| annotation_bool = np.zeros_like(window, dtype=bool) | |
| f0_roll = np.zeros((len(window), len(self.f0_centers_c))) | |
| times_in_frame = f0s[:, 0]*feature_rate - window[0] | |
| for t, f0 in enumerate(f0s): | |
| t = times_in_frame[t] | |
| if t%1 < 0.25: # only consider it as annotation if the f0 values is really close to the frame center | |
| t = int(np.round(t)) | |
| f0_roll[t] = self.f0_c2label(f0[1]) | |
| annotation_bool[t] = True | |
| f0_cents[t] = f0[1] | |
| return f0_roll, f0_cents, annotation_bool | |
| def get_window_feats(time_feature_matrix, window, feature_rate): | |
| """ | |
| Restrict the feature matrix to the features that are inside the window | |
| :param window: Timestamps for the frame centers of the output | |
| :param time_feature_matrix: A 2d array of Nx2 per the entire file. | |
| :param feature_rate: Window timestamps are integer, this is to convert them to seconds | |
| :return: window_features: the features inside the given window | |
| """ | |
| start = time_feature_matrix[:,0]>(window[0]-0.5)/feature_rate | |
| end = time_feature_matrix[:,0]<(window[-1]+0.5)/feature_rate | |
| window_features = np.logical_and(start, end) | |
| window_features = np.array(time_feature_matrix[window_features,:]) | |
| return window_features | |
| def represent_midi(self, midi, feature_rate): | |
| """ | |
| Represent a midi file as sparse matrices of onsets, offsets, and notes. No f0 is included. | |
| :param midi: A midi file (either a path or a pretty_midi.PrettyMIDI object) | |
| :param feature_rate: The feature rate in Hz | |
| :return: dict {onset, offset, note, time}: Same format with the model's learning and outputs | |
| """ | |
| def _get_onsets_offsets_frames(midi_content): | |
| if isinstance(midi_content, str): | |
| midi_content = pretty_midi.PrettyMIDI(midi_content) | |
| onsets = [] | |
| offsets = [] | |
| frames = [] | |
| for instrument in midi_content.instruments: | |
| for note in instrument.notes: | |
| start = int(np.round(note.start * feature_rate)) | |
| end = int(np.round(note.end * feature_rate)) | |
| note_times = (np.arange(start, end+0.5)/feature_rate)[:, np.newaxis] | |
| note_pitch = np.full_like(note_times, fill_value=note.pitch) | |
| onsets.append([note.start, note.pitch]) | |
| offsets.append([note.end, note.pitch]) | |
| frames.append(np.hstack([note_times, note_pitch])) | |
| onsets = np.vstack(onsets) | |
| offsets = np.vstack(offsets) | |
| frames = np.vstack(frames) | |
| return onsets, offsets, frames, midi_content | |
| onset_array, offset_array, frame_array, midi_object = _get_onsets_offsets_frames(midi) | |
| window = np.arange(frame_array[0, 0]*feature_rate, frame_array[-1, 0]*feature_rate, dtype=int) | |
| onset_roll, _ = self.fill_onset_matrix(onset_array, window, feature_rate) | |
| offset_roll, _ = self.fill_onset_matrix(offset_array, window, feature_rate) | |
| note_roll, _ = self.fill_note_matrix(frame_array, window, feature_rate) | |
| start_anchor = onset_array[onset_array[:, 0]==np.min(onset_array[:, 0])] | |
| end_anchor = offset_array[offset_array[:, 0]==np.max(offset_array[:, 0])] | |
| return { | |
| 'midi': midi_object, | |
| 'note': note_roll, | |
| 'onset': onset_roll, | |
| 'offset': offset_roll, | |
| 'time': window/feature_rate, | |
| 'start_anchor': start_anchor, | |
| 'end_anchor': end_anchor | |
| } | |