import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pydub import AudioSegment
import numpy as np

# Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Initialize models with automatic device detection
@spaces.GPU(duration=120)
def load_models():
    global captioning_pipeline, pipe
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1
    )
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        use_auth_token=hf_token,
        torch_dtype=torch_dtype
    ).to(device)

load_models()


@spaces.GPU(duration=60)
def analyze_image(image_file):
    """Generate a caption for an image, with error handling."""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"


@spaces.GPU(duration=120)
def generate_audio(prompt):
    """Generate an audio array from a text prompt with AudioLDM2."""
    try:
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None


def blend_audios(audio_list):
    """Mix multiple audio arrays into one normalized WAV file."""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None

        # Pad every clip to the longest length, then sum them.
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            if arr.shape[0] < max_length:
                padded = np.pad(arr, (0, max_length - arr.shape[0]))
            else:
                padded = arr[:max_length]
            mixed += padded

        # Peak-normalize, guarding against division by zero on an all-silent mix.
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak

        # Write a 16 kHz WAV to a temp file, closing the descriptor mkstemp opens.
        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        write(tmp_path, 16000, mixed.astype(np.float32))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None


css = """
#col-container { max-width: 800px; margin: 0 auto; }
.toggle-row { margin: 1rem 0; }
.prompt-box { margin-bottom: 0.5rem; }
.danger { color: #ff4444; font-weight: bold; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header Section
        gr.HTML("""
            ⚡ Powered by Bilsimaging
""") # Input Mode Toggle input_mode = gr.Radio( choices=["Image Input", "Text Input"], value="Image Input", label="Select Input Mode", elem_classes="toggle-row" ) # Image Input Section with gr.Column(visible=True) as image_col: image_upload = gr.Image(type="filepath", label="Upload Image") generate_desc_btn = gr.Button("Generate Description from Image", variant="primary") caption_display = gr.Textbox(label="Generated Description", interactive=False) # Text Input Section with gr.Column(visible=False) as text_col: with gr.Row(): prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...") prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...") additional_prompts = gr.Column() add_prompt_btn = gr.Button("âž• Add Another Prompt", variant="secondary") gr.Markdown("