import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
import numpy as np
# Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TKN")
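# NOTE: assumes an HF access token in the HF_TKN environment variable,
# e.g. a Space secret or a local .env entry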
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
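# float16 halves model memory on GPU; CPU inference requires float32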
# Initialize models with automatic device detection
@spaces.GPU(duration=120)
def load_models():
    global captioning_pipeline, pipe
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1
    )
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        use_auth_token=hf_token,
        torch_dtype=torch_dtype
    ).to(device)

load_models()
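# Both models are loaded once at import time and shared via globals;
# @spaces.GPU(duration=120) grants the call up to 120 s of ZeroGPU time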
@spaces.GPU(duration=60)
def analyze_image(image_file):
    """Generate caption from image with error handling"""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"
@spaces.GPU(duration=120)
def generate_audio(prompt):
    """Generate audio from text prompt"""
    try:
        # The pipeline returns a batch of waveforms; take the first one
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None
def blend_audios(audio_list):
    """Mix multiple audio arrays into one normalized track"""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None

        # Pad shorter tracks with silence so all arrays share the same length
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            if arr.shape[0] < max_length:
                padded = np.pad(arr, (0, max_length - arr.shape[0]))
            else:
                padded = arr[:max_length]
            mixed += padded

        # Peak-normalize to avoid clipping, guarding against all-silent input
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak

        # AudioLDM2 generates 16 kHz audio; close the descriptor from mkstemp
        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        write(tmp_path, 16000, mixed.astype(np.float32))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None
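# The returned .wav path is consumed directly by the gr.Audio output below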
css = """
#col-container { max-width: 800px; margin: 0 auto; }
.toggle-row { margin: 1rem 0; }
.prompt-box { margin-bottom: 0.5rem; }
.danger { color: #ff4444; font-weight: bold; }
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header Section
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        # Input Mode Toggle
        input_mode = gr.Radio(
            choices=["Image Input", "Text Input"],
            value="Image Input",
            label="Select Input Mode",
            elem_classes="toggle-row"
        )
        # Image Input Section
        with gr.Column(visible=True) as image_col:
            image_upload = gr.Image(type="filepath", label="Upload Image")
            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
            caption_display = gr.Textbox(label="Generated Description", interactive=False)
        # Text Input Section
        with gr.Column(visible=False) as text_col:
            with gr.Row():
                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
            # Gradio event handlers cannot create new components at runtime, so
            # prompts 3-5 are pre-created hidden and revealed one at a time
            extra_prompts = [
                gr.Textbox(label=f"Sound Prompt {i}", lines=2, visible=False,
                           placeholder="Enter sound description...")
                for i in range(3, 6)
            ]
            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
        # Generation Controls
        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
        # Documentation Section
        gr.Markdown("""
        ## 👥 How You Can Contribute
        We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
        """)
        # Visitor Badge
        gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
                </a>
            </div>
        """)
    # Input Mode Toggle Handler
    input_mode.change(
        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
        inputs=input_mode,
        outputs=[image_col, text_col],
        concurrency_limit=1
    )
    # Image Description Generation
    generate_desc_btn.click(
        analyze_image,
        inputs=image_upload,
        outputs=caption_display,
        concurrency_limit=2
    )
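    # concurrency_limit on the GPU-backed events above and below keeps the
    # number of simultaneous ZeroGPU calls bounded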
    # Dynamic Prompt Addition: reveal the next hidden textbox (5 prompts max)
    def add_prompt(current_count):
        if current_count >= 5:
            return [current_count] + [gr.update() for _ in extra_prompts]
        new_count = current_count + 1
        # Prompts 3..5 live in extra_prompts[0..2]; show those up to the new count
        updates = [gr.update(visible=(i + 3 <= new_count)) for i in range(len(extra_prompts))]
        return [new_count] + updates

    prompt_count = gr.State(2)
    add_prompt_btn.click(
        add_prompt,
        inputs=prompt_count,
        outputs=[prompt_count] + extra_prompts,
        concurrency_limit=1
    )
    # Sound Generation Handler
    def process_inputs(mode, image_file, caption, *prompts):
        try:
            if mode == "Image Input":
                if not image_file:
                    raise gr.Error("Please upload an image")
                # Reuse an existing caption if one was already generated
                if not caption or not caption.strip():
                    caption = analyze_image(image_file)
                prompts = [caption]
            else:
                prompts = [p.strip() for p in prompts if p and p.strip()]
                if not prompts:
                    raise gr.Error("Please enter at least one valid prompt")

            # Generate an individual audio track per prompt
            audio_tracks = []
            for prompt in prompts:
                audio = generate_audio(prompt)
                if audio is not None:
                    audio_tracks.append(audio)

            # Blend the tracks into a single output file
            if not audio_tracks:
                return None
            return blend_audios(audio_tracks)
        except gr.Error:
            raise  # Keep user-facing validation messages intact
        except Exception as e:
            raise gr.Error(f"Processing error: {str(e)}")
    generate_sound_btn.click(
        process_inputs,
        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2] + extra_prompts,
        outputs=audio_output,
        concurrency_limit=2
    )
if __name__ == "__main__":
    demo.launch(max_threads=4)
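# Local usage sketch (assumes the same dependencies the imports above imply):
#   pip install gradio spaces torch diffusers transformers scipy python-dotenv
#   python app.py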