# Hugging Face Space (status: Running on Zero)
import spaces | |
import os | |
import tempfile | |
import gradio as gr | |
from dotenv import load_dotenv | |
import torch | |
from scipy.io.wavfile import write | |
from diffusers import DiffusionPipeline | |
from transformers import pipeline | |
from pydub import AudioSegment | |
import numpy as np | |
# Load environment variables (python-dotenv) and read the Hugging Face token.
load_dotenv()
hf_token = os.environ.get("HF_TKN")

# Device configuration: half precision is only used when running on CUDA.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
# Initialize models with automatic device detection
def load_models():
    """Build the captioning and audio pipelines and publish them as globals.

    Sets two module-level names:

    * ``captioning_pipeline`` -- an "image-to-text" pipeline for
      ``nlpconnect/vit-gpt2-image-captioning``.
    * ``pipe`` -- the ``cvssp/audioldm2`` diffusion pipeline, moved to
      ``device`` and loaded with ``torch_dtype``.
    """
    global captioning_pipeline, pipe

    # The captioning pipeline is given a device index: 0 when CUDA is
    # available, otherwise -1.
    captioning_device = 0 if torch.cuda.is_available() else -1
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=captioning_device,
    )

    # NOTE(review): recent diffusers releases prefer `token=` over
    # `use_auth_token=` -- confirm against the pinned version before changing.
    audio_pipeline = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        use_auth_token=hf_token,
        torch_dtype=torch_dtype,
    )
    pipe = audio_pipeline.to(device)


# Both models are loaded once, when the module is imported.
load_models()
def analyze_image(image_file):
    """Return a caption for ``image_file``; never raises.

    The first entry of the captioning pipeline's result list supplies the
    caption (its ``"generated_text"`` value, stripped). An empty or non-list
    result yields ``"Could not generate caption"``; any exception is turned
    into the string ``"Error: <message>"``.
    """
    try:
        predictions = captioning_pipeline(image_file)
        usable = bool(predictions) and isinstance(predictions, list)
        if not usable:
            return "Could not generate caption"
        first_prediction = predictions[0]
        return first_prediction.get("generated_text", "").strip()
    except Exception as exc:
        return f"Error: {exc}"
def generate_audio(prompt):
    """Run the audio pipeline on ``prompt`` and return its first clip.

    The pipeline is called with 50 inference steps and a guidance scale of
    7.5. Any failure is printed and reported to the caller as ``None``.
    """
    try:
        result = pipe(prompt=prompt, num_inference_steps=50, guidance_scale=7.5)
        first_clip = result.audios[0]
    except Exception as exc:
        print(f"Audio generation error: {exc}")
        return None
    return first_clip
def blend_audios(audio_list, sample_rate=16000):
    """Mix several waveforms into one peak-normalised WAV file.

    Args:
        audio_list: Iterable of 1-D sample arrays; ``None`` entries are
            skipped. Shorter clips are treated as zero-padded at the end.
        sample_rate: Sample rate written to the WAV header. Defaults to
            16000, the value this function previously hard-coded.

    Returns:
        Path of a newly created temporary ``.wav`` file (the caller owns it),
        or ``None`` when there is nothing to mix or an error occurs. Errors
        are printed rather than raised.
    """
    try:
        valid_audios = [np.asarray(arr) for arr in audio_list if arr is not None]
        if not valid_audios:
            return None

        max_length = max(arr.shape[0] for arr in valid_audios)
        if max_length == 0:
            return None

        # Sum each clip into the head of the buffer; the tail it does not
        # reach stays untouched, which is equivalent to zero-padding the clip.
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            mixed[: arr.shape[0]] += arr

        # Normalise to a peak of 1.0, but leave pure silence alone: dividing
        # by a zero peak would fill the output with NaN.
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak

        # The context manager closes the handle (mkstemp's raw descriptor was
        # never closed); delete=False keeps the file for the caller.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path = tmp_file.name

        # Write 32-bit float samples: the float64 accumulator would otherwise
        # produce a 64-bit float WAV, which many players cannot decode.
        write(tmp_path, sample_rate, mixed.astype(np.float32))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None
# Custom stylesheet passed to gr.Blocks; one rule per line.
css = (
    "\n"
    "#col-container { max-width: 800px; margin: 0 auto; }\n"
    ".toggle-row { margin: 1rem 0; }\n"
    ".prompt-box { margin-bottom: 0.5rem; }\n"
    ".danger { color: #ff4444; font-weight: bold; }\n"
)
# Prompt boxes: two are always shown, the rest are revealed on demand.
_INITIAL_PROMPT_COUNT = 2
_MAX_PROMPT_COUNT = 5


def _caption_failed(caption):
    """Tell whether analyze_image() returned a failure message, not a caption."""
    # analyze_image() reports failures in-band as strings, so they have to be
    # recognised by their text before being used as an audio prompt.
    return (
        not caption
        or caption == "Could not generate caption"
        or caption.startswith("Error:")
    )


# Dynamic Prompt Addition
def add_prompt(current_count):
    """Reveal the next optional prompt box, up to ``_MAX_PROMPT_COUNT``.

    Args:
        current_count: Number of prompt boxes currently visible.

    Returns:
        A tuple of the new visible count followed by one ``gr.update`` per
        optional prompt box (prompts 3..5), setting its visibility.
    """
    new_count = min(current_count + 1, _MAX_PROMPT_COUNT)
    visibility_updates = [
        gr.update(visible=index <= new_count)
        for index in range(_INITIAL_PROMPT_COUNT + 1, _MAX_PROMPT_COUNT + 1)
    ]
    return (new_count, *visibility_updates)


# Sound Generation Handler
def process_inputs(mode, image_file, caption, *prompts):
    """Generate one blended sound effect from an image or from text prompts.

    Args:
        mode: ``"Image Input"`` or ``"Text Input"`` (the radio selection).
        image_file: Path of the uploaded image; required in image mode.
        caption: Current content of the caption box. It is not used: in image
            mode the caption is regenerated from ``image_file``.
        *prompts: Values of all prompt boxes; blank or ``None`` entries are
            ignored.

    Returns:
        Path of the WAV file produced by ``blend_audios``.

    Raises:
        gr.Error: On missing input, a failed caption, failed audio generation,
            or any unexpected exception (reported as "Processing error: ...").
    """
    try:
        if mode == "Image Input":
            if not image_file:
                raise gr.Error("Please upload an image")
            caption = analyze_image(image_file)
            # Do not feed a failure message to the audio model as a prompt.
            if _caption_failed(caption):
                raise gr.Error(caption or "Could not generate caption")
            prompts = [caption]
        else:
            prompts = [p.strip() for p in prompts if p and p.strip()]
            if not prompts:
                raise gr.Error("Please enter at least one valid prompt")

        # Generate individual audio tracks, dropping the ones that failed.
        audio_tracks = [
            audio for audio in map(generate_audio, prompts) if audio is not None
        ]
        if not audio_tracks:
            raise gr.Error("Audio generation failed, please try again")

        # Blend audio tracks
        blended_path = blend_audios(audio_tracks)
        if blended_path is None:
            raise gr.Error("Could not blend the generated audio")
        return blended_path
    except gr.Error:
        # Already a user-facing message; re-wrapping it would prefix it with
        # "Processing error:".
        raise
    except Exception as e:
        raise gr.Error(f"Processing error: {str(e)}") from e


with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header Section
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)

        # Input Mode Toggle
        input_mode = gr.Radio(
            choices=["Image Input", "Text Input"],
            value="Image Input",
            label="Select Input Mode",
            elem_classes="toggle-row",
        )

        # Image Input Section
        with gr.Column(visible=True) as image_col:
            image_upload = gr.Image(type="filepath", label="Upload Image")
            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
            caption_display = gr.Textbox(label="Generated Description", interactive=False)

        # Text Input Section
        with gr.Column(visible=False) as text_col:
            with gr.Row():
                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
            # The optional prompts are created up front and hidden: an event
            # handler can change a component's visibility, but returning a new
            # Textbox to a Column output does not add it to the page.
            with gr.Column() as additional_prompts:
                extra_prompts = [
                    gr.Textbox(
                        label=f"Sound Prompt {index}",
                        lines=2,
                        visible=False,
                        placeholder="Enter sound description...",
                    )
                    for index in range(_INITIAL_PROMPT_COUNT + 1, _MAX_PROMPT_COUNT + 1)
                ]
            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")

        # Generation Controls
        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

        # Documentation Section
        # NOTE(review): the e-mail address below appears to have been replaced
        # by an e-mail-obfuscation placeholder -- restore the real address.
        gr.Markdown(
            "\n"
            "## 👥 How You Can Contribute\n"
            "We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).\n"
            "Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua\n"
        )

        # Visitor Badge
        gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
                </a>
            </div>
        """)

    # Input Mode Toggle Handler: show exactly one of the two input sections.
    input_mode.change(
        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
        inputs=input_mode,
        outputs=[image_col, text_col],
        concurrency_limit=1,
    )

    # Image Description Generation
    generate_desc_btn.click(
        analyze_image,
        inputs=image_upload,
        outputs=caption_display,
        concurrency_limit=2,
    )

    # Number of prompt boxes currently visible.
    prompt_count = gr.State(_INITIAL_PROMPT_COUNT)
    add_prompt_btn.click(
        add_prompt,
        inputs=prompt_count,
        outputs=[prompt_count, *extra_prompts],
        concurrency_limit=1,
    )

    # Every prompt box is passed in, including the optional ones, so revealed
    # prompts actually take part in generation.
    generate_sound_btn.click(
        process_inputs,
        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2, *extra_prompts],
        outputs=audio_output,
        concurrency_limit=2,
    )

if __name__ == "__main__":
    demo.launch(max_threads=4)