audio-diffusion_style_transfer

Running

File size: 2,552 Bytes

9d749c2
 
 
 
 
 
 
f2582cd
eb4dbcc
7e73b22
 
9d749c2
7e73b22
9d749c2
 
 
 
 
 
 
 
21462bf
 
 
9d749c2
be5bb7c
9d749c2

import argparse

import gradio as gr

from audiodiffusion import AudioDiffusion


def generate_spectrogram_audio_and_loop(audio_file, model_id):
    """Generate a spectrogram, audio and a loop from an input audio clip.

    Args:
        audio_file: The audio input handed over by the UI; it is forwarded
            unchanged to
            ``AudioDiffusion.generate_spectrogram_and_audio_from_audio``.
        model_id: Identifier of the model used to build ``AudioDiffusion``.

    Returns:
        A 3-tuple ``(image, (sample_rate, audio), (sample_rate, loop))``.
        When ``AudioDiffusion.loop_it`` returns ``None``, the generated audio
        itself is used in place of the loop.
    """
    # Echo the request parameters to stdout (shows up in the server log).
    print(audio_file)
    print(model_id)

    pipeline = AudioDiffusion(model_id=model_id)
    generated = pipeline.generate_spectrogram_and_audio_from_audio(audio_file)
    spectrogram, (rate, samples) = generated

    looped = AudioDiffusion.loop_it(samples, rate)
    # Fall back to the plain audio when no loop could be produced.
    loop_samples = samples if looped is None else looped
    return spectrogram, (rate, samples), (rate, loop_samples)


# Gradio interface: one audio upload plus a model selector in, and a
# mel spectrogram image with two audio players (clip and loop) out.
demo = gr.Interface(
    fn=generate_spectrogram_audio_and_loop,
    title="Audio Diffusion",
    # NOTE: the backslash continuations are inside the string literal, so the
    # leading whitespace of the continued lines is part of the description.
    description="Forked from https://huggingface.co/spaces/teticio/audio-diffusion Built to style transfer to audio using Huggingface diffusers.\
        Outputs a 5 second audio clip with elements from the initial audio uploaded. This takes about 2 hours without a GPU, so why not bake a cake in the meantime? (Or try the teticio/audio-diffusion-ddim-256 \
                model which is faster.) The code for doing style transfer method was already into teticio's repo and python notebooks I just, I think hooked it up into a hugging face space. still need some more testing and such but would be cool hook up step number and then to also do inpainting and outpointing In this space and get the api working with the updated pipelines",
    allow_flagging="never",
    inputs=[
        # The uploaded clip is passed to the callback as a file path.
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            label="Model",
            value="teticio/audio-diffusion-256",
            choices=[
                "teticio/audio-diffusion-256",
                "teticio/audio-diffusion-breaks-256",
                "teticio/audio-diffusion-instrumental-hiphop-256",
                "teticio/audio-diffusion-ddim-256",
            ],
        ),
    ],
    outputs=[
        gr.Image(label="Mel spectrogram", image_mode="L"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Loop"),
    ],
)

if __name__ == "__main__":
    # Command-line entry point: parse optional bind address/port and start
    # the Gradio server.
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, help="port to serve the app on")
    # The server name is a host name or IP address, i.e. a string. It was
    # previously declared as ``type=int``, which made argparse reject values
    # such as "127.0.0.1" or "localhost".
    parser.add_argument(
        "--server",
        type=str,
        help="host name or IP address to bind to (default: 0.0.0.0)",
    )
    args = parser.parse_args()
    # Bind to all interfaces when no (or an empty) server name is given.
    demo.launch(server_name=args.server or "0.0.0.0", server_port=args.port)