import os
import random
from glob import glob

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from diffusers import StableAudioPipeline
from translatepy import Translator
translator = Translator()

# Constants
model = "stabilityai/stable-audio-open-1.0"
MAX_SEED = np.iinfo(np.int32).max
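# MAX_SEED is 2_147_483_647 (the largest 32-bit signed integer); it bounds
# both the seed slider below and the random.randint() fallback in main().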
CSS = """
.gradio-container {
  max-width: 690px !important;
}
footer {
  visibility: hidden;
}
"""
JS = """function () {
  const gradioURL = window.location.href;
  if (!gradioURL.endsWith('?__theme=dark')) {
    window.location.replace(gradioURL + '?__theme=dark');
  }
}"""
DESCRIPTION = """
<center>
Stable Audio Open 1.0 generates variable-length (up to 47 s) stereo audio at 44.1 kHz from text prompts. \
It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
a T5-based text embedder for text conditioning, and a transformer-based diffusion (DiT) model that operates \
in the autoencoder's latent space.
</center>
"""
# Initialize the pipeline once at startup; generation itself happens in main()
if torch.cuda.is_available():
    pipe = StableAudioPipeline.from_pretrained(model, torch_dtype=torch.float16)
    pipe = pipe.to("cuda")
else:
    # Fall back to CPU so the app still starts without a GPU (generation will be slow)
    pipe = StableAudioPipeline.from_pretrained(model, torch_dtype=torch.float32)
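# The three components named in DESCRIPTION map onto pipeline attributes
# (an assumption based on the diffusers StableAudioPipeline layout):
#   pipe.vae          -> waveform autoencoder
#   pipe.text_encoder -> T5-based text encoder
#   pipe.transformer  -> diffusion transformer (DiT)
# e.g. print(type(pipe.vae).__name__, type(pipe.transformer).__name__)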
# Function
@spaces.GPU  # allocates a GPU on ZeroGPU Spaces; a no-op on other hardware
def main(
        prompt,
        negative="low quality",
        second: float = 10.0,
        seed: int = -1):
    # seed == -1 means "pick a random seed"
    if seed == -1:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)
    # Translate non-English prompts to English before conditioning
    prompt = str(translator.translate(prompt, 'English'))
    print(f'prompt: {prompt}')
    audio = pipe(
        prompt,
        negative_prompt=negative,
        audio_end_in_s=second,
        num_inference_steps=200,
        num_waveforms_per_prompt=3,
        generator=generator,
    ).audios
    os.makedirs("outputs", exist_ok=True)
    # Number outputs sequentially based on the .wav files already saved
    base_count = len(glob(os.path.join("outputs", "*.wav")))
    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")
    # audios[0] is (channels, samples); transpose to (samples, channels) for soundfile
    sf.write(audio_path, audio[0].T.float().cpu().numpy(), pipe.vae.sampling_rate)
    return audio_path, seed
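# Minimal sketch of calling main() directly, outside Gradio (the argument
# values below are illustrative, not from the original Space):
#   path, used_seed = main("water drops on a tin roof", second=5.0, seed=42)
#   print(path, used_seed)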
# Gradio Interface
with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
    with gr.Accordion(""):
        gr.Markdown(DESCRIPTION)
    output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")  # Player for the generated clip
    prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
    negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
    with gr.Row():
        # The model supports at most ~47 s of audio, per DESCRIPTION above
        second = gr.Slider(5.0, 47.0, value=10.0, label="Duration (s)", step=0.1)
        seed = gr.Slider(-1, MAX_SEED, value=-1, label="Seed (-1 = random)", step=1)
    with gr.Row():
        submit_btn = gr.Button("🚀 Send")  # Submit button
        clear_btn = gr.ClearButton([prompt, seed, output], value="🗑️ Clear")  # Clear button
    # Set up the event listeners
    submit_btn.click(main, inputs=[prompt, negative, second, seed], outputs=[output, seed])

# gr.close_all()
iface.queue().launch(show_api=False)  # Launch the Gradio interface
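# Running locally (an assumption, not part of the original Space):
#   $ python app.py
# queue() serializes requests (default concurrency of 1), so the single
# pipeline instance is never invoked by overlapping generations.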