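"""Gradio Space: generate sound effects from an image or from text prompts.

An uploaded image is captioned with ViT-GPT2, and the caption (or the
user-supplied text prompts) is fed to AudioLDM2; multiple generated tracks
are blended into a single WAV file.
"""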

import os
import tempfile

import gradio as gr
import numpy as np
import spaces
import torch
from dotenv import load_dotenv
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline

# Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
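# float16 roughly halves GPU memory use for the diffusion pipeline; CPU
# inference stays in float32, where half precision is poorly supported.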

# Initialize models with automatic device detection
@spaces.GPU(duration=120)
def load_models():
    global captioning_pipeline, pipe
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1
    )
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        use_auth_token=hf_token,
        torch_dtype=torch_dtype
    ).to(device)

load_models()
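
# On ZeroGPU Spaces, @spaces.GPU(duration=...) holds a GPU for at most
# `duration` seconds per call, so each handler below requests a budget
# sized to its expected runtime.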

@spaces.GPU(duration=60)
def analyze_image(image_file):
    """Generate caption from image with error handling"""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"

@spaces.GPU(duration=120)
def generate_audio(prompt):
    """Generate audio from text prompt"""
    try:
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None
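
# Note: the settings above (50 steps, guidance_scale 7.5) are common
# diffusion defaults; fewer steps reduce latency at some cost in fidelity,
# and higher guidance trades artifacts for prompt adherence.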

def blend_audios(audio_list):
    """Mix multiple audio arrays into one normalized track"""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None
        # Zero-pad (or truncate) every track to the longest one, then sum
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            if arr.shape[0] < max_length:
                padded = np.pad(arr, (0, max_length - arr.shape[0]))
            else:
                padded = arr[:max_length]
            mixed += padded
        # Peak-normalize, guarding against an all-silent mix
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak
        # mkstemp returns an open descriptor; close it so write() can reopen
        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        # Convert to 16-bit PCM so the WAV plays in browsers; AudioLDM2
        # outputs audio at 16 kHz
        write(tmp_path, 16000, (mixed * 32767).astype(np.int16))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None
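
# Example (hypothetical arrays): blend_audios([a1, a2]) pads a1/a2 to the
# same length, sums them, and returns the path to a 16 kHz mono WAV mix.
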
css = """
#col-container { max-width: 800px; margin: 0 auto; }
.toggle-row { margin: 1rem 0; }
.prompt-box { margin-bottom: 0.5rem; }
.danger { color: #ff4444; font-weight: bold; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header Section
        gr.HTML("""
        <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
        <p style="text-align: center;">
            ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
        </p>
        """)

        # Input Mode Toggle
        input_mode = gr.Radio(
            choices=["Image Input", "Text Input"],
            value="Image Input",
            label="Select Input Mode",
            elem_classes="toggle-row"
        )

        # Image Input Section
        with gr.Column(visible=True) as image_col:
            image_upload = gr.Image(type="filepath", label="Upload Image")
            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
            caption_display = gr.Textbox(label="Generated Description", interactive=False)

        # Text Input Section
        with gr.Column(visible=False) as text_col:
            with gr.Row():
                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
            # Prompts 3-5 are pre-created hidden and revealed on demand, since
            # components returned from a callback are not mounted into a Column
            extra_prompts = [
                gr.Textbox(label=f"Sound Prompt {i}", lines=2, visible=False, placeholder="Enter sound description...")
                for i in range(3, 6)
            ]
            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")

        # Generation Controls
        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

        # Documentation Section
        gr.Markdown("""
        ## 👥 How You Can Contribute
        We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
        """)

        # Visitor Badge
        gr.HTML("""
        <div style="text-align: center;">
            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
            </a>
        </div>
        """)

    # Input Mode Toggle Handler
    input_mode.change(
        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
        inputs=input_mode,
        outputs=[image_col, text_col],
        concurrency_limit=1
    )
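
    # gr.update(visible=...) swaps which input column is shown in place,
    # rather than rebuilding the layout on every toggle.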

    # Image Description Generation
    generate_desc_btn.click(
        analyze_image,
        inputs=image_upload,
        outputs=caption_display,
        concurrency_limit=2
    )

    # Dynamic Prompt Addition: reveal the next hidden prompt box (max 5)
    def add_prompt(current_count):
        new_count = min(current_count + 1, 5)
        visibility = [gr.update(visible=(i <= new_count)) for i in range(3, 6)]
        return [new_count] + visibility

    prompt_count = gr.State(2)
    add_prompt_btn.click(
        add_prompt,
        inputs=prompt_count,
        outputs=[prompt_count] + extra_prompts,
        concurrency_limit=1
    )
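
    # gr.State stores a per-session value (the number of visible prompt
    # boxes) without rendering any visible component.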

    # Sound Generation Handler
    def process_inputs(mode, image_file, caption, *prompts):
        try:
            if mode == "Image Input":
                if not image_file:
                    raise gr.Error("Please upload an image")
                # Reuse an existing caption when available to skip a second
                # captioning pass
                caption = (caption or "").strip() or analyze_image(image_file)
                prompts = [caption]
            else:
                prompts = [p.strip() for p in prompts if p and p.strip()]
                if not prompts:
                    raise gr.Error("Please enter at least one valid prompt")

            # Generate individual audio tracks
            audio_tracks = []
            for prompt in prompts:
                audio = generate_audio(prompt)
                if audio is not None:
                    audio_tracks.append(audio)

            # Blend audio tracks
            if not audio_tracks:
                return None
            return blend_audios(audio_tracks)
        except gr.Error:
            # Re-raise user-facing errors without double-wrapping the message
            raise
        except Exception as e:
            raise gr.Error(f"Processing error: {str(e)}")

    generate_sound_btn.click(
        process_inputs,
        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2] + extra_prompts,
        outputs=audio_output,
        concurrency_limit=2
    )
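
    # concurrency_limit caps simultaneous runs of each event so long GPU
    # calls don't exhaust the Space's ZeroGPU time budget.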

if __name__ == "__main__":
    demo.launch(max_threads=4)