# Spaces: Bils/Generate-Sound-Effects-from-Image
# Runtime error
# File size: 4,432 Bytes

import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
from pathlib import Path

# Load variables from a .env file (when present) before reading the token,
# so that HF_TKN may come from either the file or the real environment.
load_dotenv()
hf_token = os.environ.get("HF_TKN")

# Device index handed to the captioning pipeline: 0 when CUDA is available,
# -1 otherwise.
if torch.cuda.is_available():
    device_id = 0
else:
    device_id = -1

# Image -> text model used to describe the uploaded picture.
captioning_pipeline = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text -> audio diffusion model; it is moved between devices per request by
# the audio-generation function further down in this file.
pipe = DiffusionPipeline.from_pretrained("cvssp/audioldm2", use_auth_token=hf_token)

@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image with the module-level captioning pipeline.

    Args:
        image_file: Raw bytes of the uploaded image.

    Returns:
        A ``(text, is_error)`` tuple. When ``is_error`` is False, ``text`` is
        the generated caption; otherwise it is a human-readable error message.
        This function never raises: every failure is reported via the tuple.
    """
    # Nothing uploaded (None) or an empty payload: report it directly instead
    # of letting the file write / image decoding fail with an obscure message.
    if not image_file:
        return "Error: No image was provided.", True

    temp_image_path = None
    try:
        # The pipeline is given a path, so the bytes are written to disk first.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            # Record the path before writing so cleanup also covers a failed write.
            temp_image_path = temp_file.name
            temp_file.write(image_file)

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False

    except Exception as e:
        return f"Error analyzing image: {e}", True

    finally:
        # delete=False means the file outlives the ``with`` block; remove it
        # here so repeated requests do not accumulate temp files.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # successful caption into a failure.
                pass

@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a sound clip for ``caption`` and save it as a WAV file.

    The module-level ``pipe`` is moved to the compute device for the run and
    always moved back to the CPU afterwards, even when generation fails.

    Args:
        caption: Text prompt passed to the audio diffusion pipeline.

    Returns:
        Path of the written ``.wav`` file, or ``None`` if anything failed
        (the error is printed).
    """
    try:
        # Fall back to the CPU when no GPU is present, mirroring how the
        # captioning pipeline's device is chosen at module level.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            pipe.to(device)
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5
            )
        finally:
            # Release the accelerator regardless of whether inference succeeded.
            pipe.to("cpu")
        audio = audio_output.audios[0]

        # Only reserve a unique file name here; the handle is closed before
        # scipy reopens the path, which also works on platforms that forbid
        # opening a file twice.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name

        try:
            # 16000 is the sample rate (Hz) written into the WAV header.
            write(wav_path, 16000, audio)
        except Exception:
            # Do not leave an empty or partial file behind on a failed write.
            try:
                os.remove(wav_path)
            except OSError:
                pass
            raise
        return wav_path

    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None

# Gradio interface: upload an image, caption it, then turn the caption into sound.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Header row: logo on the left, title and tagline on the right.
    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): the logo is a remote placeholder URL; presumably it
            # should be replaced by a real asset — confirm it still resolves.
            gr.Image(value="https://via.placeholder.com/150", interactive=False, label="App Logo", elem_id="app-logo")
        with gr.Column(scale=5):
            gr.HTML("""
            <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 10px;">🎶 Image-to-Sound Generator</div>
            <div style="text-align: center; font-size: 16px; color: #6c757d;">Transform your images into descriptive captions and immersive soundscapes.</div>
            """)

    # Usage instructions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### How It Works
            1. **Upload an Image**: Select an image to analyze.
            2. **Generate Description**: Get a detailed caption describing your image.
            3. **Generate Sound**: Create an audio representation based on the caption.
            """)

    # Main working row: upload -> caption -> audio.
    with gr.Row():
        with gr.Column(scale=1):
            # type="binary" hands the callback the uploaded file's raw bytes.
            image_upload = gr.File(label="Upload Image", type="binary")
            generate_description_button = gr.Button("Generate Description", variant="primary")
        with gr.Column(scale=2):
            caption_display = gr.Textbox(label="Generated Caption", interactive=False, placeholder="Your image caption will appear here.")
            generate_sound_button = gr.Button("Generate Sound", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    # NOTE(review): the contact address in the text below looks like it was
    # mangled by e-mail obfuscation ("[email protected]"); restore the real
    # address when it is known.
    with gr.Row():
        gr.Markdown("""
        ## About This App
        This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.

        For inquiries, contact us at [[email protected]](mailto:[email protected]).
        """)

    # Texts that analyze_image_with_free_model places in the caption box when
    # it fails. Its error flag is not carried through the Textbox, so failures
    # are recognised by these prefixes instead.
    caption_failure_prefixes = ("Error", "No caption was generated.")

    def update_caption(image_file):
        """Return the caption (or the error message) to show for ``image_file``."""
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Return the path of a generated WAV for ``description``, or None.

        None is returned, without running the audio model, when the caption
        box is empty/blank or holds one of the captioning failure messages.
        """
        if not description or not description.strip():
            return None
        if description.startswith(caption_failure_prefixes):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display
    )

    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output
    )

# Start the app with debug output and a share link requested.
demo.launch(debug=True, share=True)