import gradio as gr
import torch

from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

# Load the pretrained spectrogram diffusion pipeline in half precision on the GPU.
pipe = SpectrogramDiffusionPipeline.from_pretrained(
    "google/music-spectrogram-diffusion", torch_dtype=torch.float16
).to("cuda")
pipe.enable_xformers_memory_efficient_attention()

# MidiProcessor converts a MIDI file into the note-token sequences the pipeline expects.
processor = MidiProcessor()


def predict(midi_file):
    # Tokenise the uploaded MIDI file, synthesise only its first two token
    # segments, and flatten the result to a 1-D waveform.
    with torch.inference_mode():
        output = pipe(processor(midi_file.name)[:2])
        audio = output.audios[0]

    # The pipeline generates audio at a 16 kHz sample rate.
    return (16000, audio.ravel())


title = "Music Spectrogram Diffusion: Multi-instrument Music Synthesis with Spectrogram Diffusion"

description = """
In this work, the authors focus on a middle ground of neural synthesizers that can generate audio from MIDI sequences with arbitrary combinations of instruments in real time.
This enables training on a wide range of transcription datasets with a single model, which in turn offers note-level control of composition and instrumentation across a wide range of instruments.

They use a simple two-stage process: MIDI to spectrograms with an encoder-decoder Transformer, then spectrograms to audio with a generative adversarial network (GAN) spectrogram inverter.
"""

examples = ["examples/beethoven_mond_2.mid", "examples/beethoven_hammerklavier_2.mid"]

gr.Interface(
    fn=predict,
    inputs=[
        gr.File(label="Upload MIDI", file_count="single", file_types=[".mid"]),
    ],
    outputs=[
        gr.Audio(label="Synthesised Music", type="numpy"),
    ],
    title=title,
    description=description,
    theme="gradio/monochrome",
    examples=examples,
).launch(debug=True)