import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
import numpy as np

# Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32


def load_models():
    """Load both models once at startup.

    Loading happens in the main process rather than inside a @spaces.GPU
    function: on ZeroGPU, GPU-decorated functions run in a separate worker,
    so globals assigned there would not persist for later requests.
    """
    global captioning_pipeline, pipe
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1,
    )
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=hf_token,  # `use_auth_token` is deprecated in recent diffusers
        torch_dtype=torch_dtype,
    ).to(device)


load_models()


@spaces.GPU(duration=60)
def analyze_image(image_file):
    """Generate a caption for an image, with error handling."""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"


@spaces.GPU(duration=120)
def generate_audio(prompt):
    """Generate audio from a text prompt."""
    try:
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5,
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None


def blend_audios(audio_list):
    """Mix multiple audio arrays into a single WAV file and return its path."""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None

        # Pad every track to the longest one, then sum them
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            if arr.shape[0] < max_length:
                padded = np.pad(arr, (0, max_length - arr.shape[0]))
            else:
                padded = arr[:max_length]
            mixed += padded

        # Peak-normalize, guarding against an all-silent mix
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak

        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # scipy reopens the path itself; close the raw descriptor
        write(tmp_path, 16000, mixed.astype(np.float32))  # AudioLDM2 outputs 16 kHz
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None


css = """
#col-container { max-width: 800px; margin: 0 auto; }
.toggle-row { margin: 1rem 0; }
.prompt-box { margin-bottom: 0.5rem; }
.danger { color: #ff4444; font-weight: bold; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header Section
        gr.HTML("""
        <div style="text-align: center;">
            <h1>🎶 Generate Sound Effects from Image or Text</h1>
            <p>⚡ Powered by Bilsimaging</p>
        </div>
        """)
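        # Two mutually exclusive input modes follow: an uploaded image that is
        # captioned into a text prompt, or up to five free-text prompts whose
        # generated tracks are blended into a single effect.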
""") # Input Mode Toggle input_mode = gr.Radio( choices=["Image Input", "Text Input"], value="Image Input", label="Select Input Mode", elem_classes="toggle-row" ) # Image Input Section with gr.Column(visible=True) as image_col: image_upload = gr.Image(type="filepath", label="Upload Image") generate_desc_btn = gr.Button("Generate Description from Image", variant="primary") caption_display = gr.Textbox(label="Generated Description", interactive=False) # Text Input Section with gr.Column(visible=False) as text_col: with gr.Row(): prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...") prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...") additional_prompts = gr.Column() add_prompt_btn = gr.Button("âž• Add Another Prompt", variant="secondary") gr.Markdown("
Max 5 prompts for stability
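            # All five prompt boxes (visible or hidden) are passed to the
            # generation handler; empty values are filtered out there.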
") # Generation Controls generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary") audio_output = gr.Audio(label="Generated Sound Effect", interactive=False) # Documentation Section gr.Markdown(""" ## 👥 How You Can Contribute We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com). Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua """) # Visitor Badge gr.HTML("""
""") # Input Mode Toggle Handler input_mode.change( lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")), inputs=input_mode, outputs=[image_col, text_col], concurrency_limit=1 ) # Image Description Generation generate_desc_btn.click( analyze_image, inputs=image_upload, outputs=caption_display, concurrency_limit=2 ) # Dynamic Prompt Addition def add_prompt(current_count): if current_count >= 5: return current_count, gr.update() new_count = current_count + 1 new_prompt = gr.Textbox( label=f"Sound Prompt {new_count}", lines=2, visible=True, placeholder="Enter sound description..." ) return new_count, new_prompt prompt_count = gr.State(2) add_prompt_btn.click( add_prompt, inputs=prompt_count, outputs=[prompt_count, additional_prompts], concurrency_limit=1 ) # Sound Generation Handler def process_inputs(mode, image_file, caption, *prompts): try: if mode == "Image Input": if not image_file: raise gr.Error("Please upload an image") caption = analyze_image(image_file) prompts = [caption] else: prompts = [p.strip() for p in prompts if p.strip()] if not prompts: raise gr.Error("Please enter at least one valid prompt") # Generate individual audio tracks audio_tracks = [] for prompt in prompts: if not prompt: continue audio = generate_audio(prompt) if audio is not None: audio_tracks.append(audio) # Blend audio tracks if not audio_tracks: return None return blend_audios(audio_tracks) except Exception as e: raise gr.Error(f"Processing error: {str(e)}") generate_sound_btn.click( process_inputs, inputs=[input_mode, image_upload, caption_display, prompt1, prompt2], outputs=audio_output, concurrency_limit=2 ) if __name__ == "__main__": demo.launch(max_threads=4)