# Hugging Face Space source file (Space status when captured: Sleeping).
# File size: 4,794 Bytes
# NOTE: the original text here was file-viewer residue, not Python: the
# per-line commit hashes listed below and a line-number gutter (1-84).
# Commit hashes: 0291473 d0713cc 0291473 4af33c5 0291473 64ba1a9 0291473 24b2aa0 0291473 a43e51c 0291473 24b2aa0 805ddf8 4af33c5 d0713cc 0a716a3 431cf64 24b2aa0 431cf64 4518a48 a43e51c 431cf64 ba020f3 5d8dc18 f4a055e f7ae953 3711db1 a111e33 3711db1 d4856a0 3711db1 d4856a0 3711db1 c4f1532 93b34e9 c62407b 3711db1 7764d32 9e7e24e 3711db1 b1d070d
import gradio as gr
from diffusers import AudioLDMControlNetPipeline, ControlNetModel
import os
from pretty_midi import PrettyMIDI
from tempfile import _TemporaryFileWrapper
import torch
import torchaudio
# Choose the compute device and matching tensor precision once, at import
# time: half precision is only selected when CUDA is available, otherwise the
# models run on the CPU in float32.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32
# Load the ControlNet weights and attach them to the AudioLDM pipeline, both
# in the precision chosen for the current device, then move the pipeline there.
controlnet = ControlNetModel.from_pretrained(
    "lauraibnz/midi-audioldm",
    torch_dtype=torch_dtype,
)
pipe = AudioLDMControlNetPipeline.from_pretrained(
    "cvssp/audioldm-m-full",
    controlnet=controlnet,
    torch_dtype=torch_dtype,
).to(device)

# One shared random generator living on the same device as the pipeline.
generator = torch.Generator(device=device)
def predict(midi_file=None, prompt="", negative_prompt="", audio_length_in_s=5, random_seed=0, controlnet_conditioning_scale=1, num_inference_steps=20, guess_mode=False):
    """Run the MIDI-conditioned pipeline and return the generated audio.

    Args:
        midi_file: Path to a MIDI file, or a temporary-file wrapper (as an
            upload widget may provide), in which case its ``.name`` path is
            used. Passed on to ``PrettyMIDI``.
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative text prompt forwarded to the pipeline.
        audio_length_in_s: Requested audio length, forwarded unchanged.
        random_seed: Seed for the shared generator; coerced to ``int``.
        controlnet_conditioning_scale: Conditioning scale; coerced to
            ``float``.
        num_inference_steps: Number of inference steps; coerced to ``int``
            because UI sliders may deliver fractional values.
        guess_mode: Forwarded unchanged to the pipeline.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the transposed
        ``audios`` array of the pipeline output. Presumably the
        ``(sample_rate, data)`` form the audio output component expects.
    """
    # Uploaded files may arrive wrapped in a temp-file object; use its path.
    if isinstance(midi_file, _TemporaryFileWrapper):
        midi_file = midi_file.name
    midi = PrettyMIDI(midi_file)

    result = pipe(
        prompt,
        negative_prompt=negative_prompt,
        midi=midi,
        audio_length_in_s=audio_length_in_s,
        # A step count has to be a whole number; the UI slider can send a float.
        num_inference_steps=int(num_inference_steps),
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        guess_mode=guess_mode,
        # Re-seed the shared generator so the same seed gives the same noise.
        generator=generator.manual_seed(int(random_seed)),
    )
    return (16000, result.audios.T)
# Build the web UI: header, description, input/output panels, advanced
# settings, a generate button and one cached example.
with gr.Blocks(title="🎹 MIDI-AudioLDM", theme=gr.themes.Base(text_size=gr.themes.sizes.text_md, font=[gr.themes.GoogleFont("Nunito Sans")])) as demo:
    # Page heading. The closing parenthesis of this call had slipped inside
    # the HTML string (leaving the call unclosed and a stray ")" on the page).
    gr.HTML(
        '\n<h1 align="center";font size="16">🎹 MIDI-AudioLDM </h1>\n'
    )
    gr.Markdown(
        "\n"
        "MIDI-AudioLDM is a MIDI-conditioned text-to-audio model based on the project [AudioLDM](https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation). "
        "The model has been conditioned using the ControlNet architecture and has been developed within Hugging Face’s [🧨 Diffusers](https://huggingface.co/docs/diffusers/) framework. "
        "Once trained, MIDI-AudioLDM accepts a MIDI file and a text prompt as inputs and returns an audio file, which is an interpretation of the MIDI based on the given text description. "
        "This enables detailed control over different musical aspects such as notes, mood and timbre."
        "\n"
    )

    # NOTE(review): the file's indentation was lost, so the nesting of the
    # layout containers below is reconstructed — confirm the Accordion is
    # meant to sit below the row rather than inside one of the columns.
    with gr.Row():
        with gr.Column(variant='panel'):
            midi = gr.File(label="midi file", file_types=[".mid"])
            prompt = gr.Textbox(label="prompt")
        with gr.Column(variant='panel'):
            audio = gr.Audio(label="audio")

    with gr.Accordion("Advanced Settings", open=False):
        neg_prompt = gr.Textbox(label="negative prompt")
        duration = gr.Slider(0, 30, value=5, step=5, label="duration (seconds)")
        seed = gr.Number(value=42, label="seed")
        cond = gr.Slider(0.0, 1.0, value=1.0, step=0.1, label="conditioning scale")
        # Whole-number steps only: a fractional inference-step count is not
        # meaningful (the slider previously used step=0.1).
        inf = gr.Slider(0, 50, value=20, step=1, label="inference steps")
        guess = gr.Checkbox(label="guess mode")

    btn = gr.Button("Generate")
    # The input order must match the positional parameters of predict().
    btn.click(predict, inputs=[midi, prompt, neg_prompt, duration, seed, cond, inf, guess], outputs=[audio])
    gr.Examples(examples=[["S00.mid", "piano", "", 10, 25, 1.0, 20, False]], inputs=[midi, prompt, neg_prompt, duration, seed, cond, inf, guess], fn=predict, outputs=audio, cache_examples=True)
# demo = gr.Interface(
# fn=predict, inputs=[
# gr.File(label="midi file", file_types=[".mid"]),
# "text",
# gr.Textbox(label="negative prompt"),
# gr.Slider(0, 30, value=5, step=5, label="duration (seconds)"),
# gr.Number(value=42, label="seed"),
# gr.Slider(0.0, 1.0, value=1.0, step=0.1, label="conditioning scale"),
# gr.Slider(0, 50, value=20, step=0.1, label="inference steps"),
# gr.Checkbox(label="guess mode")
# ],
# outputs="audio",
# examples=[["S00.mid", "piano", "", 10, 25, 1.0, 20, False]],
# cache_examples=True,
# title="🎹 MIDI-AudioLDM",
# description="MIDI-AudioLDM is a MIDI-conditioned text-to-audio model based on the project [AudioLDM](https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation). The model has been conditioned using the ControlNet architecture and has been developed within Hugging Face’s [🧨 Diffusers](https://huggingface.co/docs/diffusers/) framework. Once trained, MIDI-AudioLDM accepts a MIDI file and a text prompt as inputs and returns an audio file, which is an interpretation of the MIDI based on the given text description. This enables detailed control over different musical aspects such as notes, mood and timbre.",
# theme=gr.themes.Base(text_size=gr.themes.sizes.text_md, font=[gr.themes.GoogleFont("Nunito Sans")])
# )
# Start the Gradio server for the UI defined above.
demo.launch()