# Spaces:
# Runtime error
# Runtime error
from transformers import AutoProcessor, MusicgenForConditionalGeneration | |
#Andy removed: from datasets import load_dataset | |
import torchaudio | |
import torch | |
#Andy edited: import losses | |
import audio_diffusion_attacks_forhf.src.losses as losses | |
from audiotools import AudioSignal | |
class MusicGenEval:
    """Compare MusicGen generations prompted by an original and a protected clip.

    Both clips are fed to the same MusicGen checkpoint as audio prompts
    (together with the fixed text prompt "music"), and the generations are
    compared with an L1 distance and a multi-scale mel-spectrogram loss.

    NOTE: the model is intentionally left on its default device; earlier
    revisions moved the model, resamplers and inputs to CUDA, and those
    transfers were removed so the class runs without a GPU.
    """

    # Checkpoint loaded for both the processor and the model.
    MODEL_NAME = "facebook/musicgen-stereo-small"
    # Sample rate (Hz) the prompt is resampled to before it is handed to the
    # processor, and that the generated audio is resampled back from.
    MODEL_SAMPLE_RATE = 32000

    def __init__(self, input_sample_rate, audio_steps):
        """Load the MusicGen processor/model and build the mel loss.

        Args:
            input_sample_rate: Sample rate of the audio passed to ``eval`` /
                ``generate_audio``; generated audio is resampled back to it.
            audio_steps: Maximum number of samples (last dimension) of each
                input clip that ``eval`` keeps.
        """
        self.processor = AutoProcessor.from_pretrained(self.MODEL_NAME)
        self.model = MusicgenForConditionalGeneration.from_pretrained(self.MODEL_NAME)
        self.input_sample_rate = input_sample_rate
        self.audio_steps = audio_steps
        self.mel_loss = losses.MelSpectrogramLoss(
            n_mels=[5, 10, 20, 40, 80, 160, 320],
            window_lengths=[32, 64, 128, 256, 512, 1024, 2048],
            mel_fmin=[0, 0, 0, 0, 0, 0, 0],
            pow=1.0,
            clamp_eps=1.0e-5,
            mag_weight=0.0,
        )

    def eval(self, original_audio, protected_audio):
        """Generate from both clips and measure how the results differ.

        Args:
            original_audio: Tensor indexed as ``[:, :, samples]``
                (presumably (batch, channels, samples) -- TODO confirm).
            protected_audio: Tensor with the same layout as
                ``original_audio``.

        Returns:
            A tuple ``(eval_dict, unprotected_gen, protected_gen)``.
            ``eval_dict`` maps ``"<pair>_l1"`` / ``"<pair>_mel"`` to the L1
            distance and mel loss for each compared pair; the two ``*_gen``
            values are the first batch item of each generation.
        """
        original_audio = original_audio[:, :, :self.audio_steps]
        protected_audio = protected_audio[:, :, :self.audio_steps]
        input_len = original_audio.shape[-1]

        # Generation runs wherever the model lives; move the results next to
        # the inputs so the element-wise comparisons below do not mix devices.
        # This is a no-op when everything is already on the CPU.
        device = original_audio.device
        unprotected_gen = self.generate_audio(original_audio)[0].to(device)
        protected_gen = self.generate_audio(protected_audio)[0].to(device)

        # (reference, candidate) pairs. When an input clip is compared with a
        # generation, only the generation's first ``input_len`` samples are
        # used so both sides have the same length.
        # NOTE(review): this assumes each generation is at least
        # ``input_len`` samples long -- confirm.
        comparisons = {
            "original_unprotectedgen": (original_audio, unprotected_gen[:, :input_len]),
            "original_protectedgen": (original_audio, protected_gen[:, :input_len]),
            "protected_protectedgen": (protected_audio, protected_gen[:, :input_len]),
            "protectedgen_unprotectedgen": (protected_gen, unprotected_gen),
        }

        eval_dict = {}
        for pair_name, (reference, candidate) in comparisons.items():
            eval_dict[f"{pair_name}_l1"] = torch.mean(torch.abs(reference - candidate))
            eval_dict[f"{pair_name}_mel"] = self.mel_loss(
                AudioSignal(reference, self.input_sample_rate),
                AudioSignal(candidate, self.input_sample_rate),
            )
        return eval_dict, unprotected_gen, protected_gen

    def generate_audio(self, audio):
        """Run MusicGen with ``audio[0]`` as the audio prompt.

        Args:
            audio: Tensor whose first item along dimension 0 is used as the
                prompt, sampled at ``self.input_sample_rate``.

        Returns:
            The output of ``self.model.generate`` resampled from
            ``MODEL_SAMPLE_RATE`` back to ``self.input_sample_rate``.
        """
        # Re-seed before every call so that sampling (do_sample=True) uses the
        # same random stream for each prompt. This resets torch's global RNG.
        torch.manual_seed(0)

        to_model_rate = torchaudio.transforms.Resample(
            self.input_sample_rate, self.MODEL_SAMPLE_RATE
        )
        # Detach and move to CPU first: the resampler is built on the CPU and
        # the prompt must not carry gradients into generation.
        waveform = to_model_rate(audio[0].detach().cpu())

        inputs = self.processor(
            audio=waveform,
            sampling_rate=self.MODEL_SAMPLE_RATE,
            text=["music"],
            padding=True,
            return_tensors="pt",
        )
        audio_values = self.model.generate(
            **inputs, do_sample=True, guidance_scale=3, max_new_tokens=1024
        )

        to_input_rate = torchaudio.transforms.Resample(
            self.MODEL_SAMPLE_RATE, self.input_sample_rate
        )
        return to_input_rate(audio_values)
# Module-level processor load; this runs at import time.
# NOTE(review): no live code in this view reads `model_name` or `processor`
# (only the disabled script below does). They are kept because other modules
# may import them -- confirm before removing.
model_name = "facebook/musicgen-stereo-small"
processor = AutoProcessor.from_pretrained(model_name)
#Andy commented (hesitant): model = MusicgenForConditionalGeneration.from_pretrained(model_name).to(device='cuda')

# The string below is a disabled one-off script, kept for reference only; as a
# bare string expression it has no effect when the module is imported.
# NOTE(review): if it is ever re-enabled it needs the `model` line above, a
# CUDA device, and a second look at `top_k=0, top_p=250` -- the two values
# look swapped (verify against the generation docs).
'''Andy commented (hesitant):
song_name="Texas Sun"
waveform, sample_rate = torchaudio.load(f"test_audio/{song_name}.mp3")
waveform=waveform[:, :500000]
torch.manual_seed(0)
transform = torchaudio.transforms.Resample(sample_rate, 32000)
waveform=transform(waveform)
# sample = processor(raw_audio=waveform, sampling_rate=48000, return_tensors="pt")
inputs = processor(
    audio=waveform,
    sampling_rate=32000,
    text=["music"],
    padding=True,
    return_tensors="pt",
)
for d in inputs.data:
    inputs.data[d]=inputs.data[d].to(device='cuda')
audio_values = model.generate(**inputs, do_sample=False, guidance_scale=3, max_new_tokens=512, top_k=0, top_p=250)
torchaudio.save(f"test_audio/perturbed/{model_name[9:]}_{song_name}.mp3", audio_values.detach().cpu()[0], 32000)
u=0
'''