# Image-to-Sound Generator — Hugging Face Spaces app (runs on ZeroGPU).
import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline

# Pull HF_TKN from a local .env so the gated/remote model weights can be fetched.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a CUDA ordinal, or -1 for CPU.
device_id = 0 if torch.cuda.is_available() else -1

# Image -> text: ViT-GPT2 captioner used to describe the uploaded image.
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text -> audio: AudioLDM2 diffusion pipeline used to synthesize the sound.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image with the ViT-GPT2 captioning pipeline.

    Args:
        image_file: Raw image bytes (as delivered by ``gr.File(type="binary")``).

    Returns:
        A ``(message, is_error)`` tuple: the caption and ``False`` on success,
        or an error description and ``True`` on failure.
    """
    temp_image_path = None
    try:
        # The pipeline wants a file path (or PIL image), so spill the uploaded
        # bytes to a temporary .jpg first.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # Bug fix: the original created the temp file with delete=False and
        # never removed it, leaking one file per call.
        if temp_image_path:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass
def get_audioldm_from_caption(caption):
    """Synthesize a sound effect for *caption* with the AudioLDM2 pipeline.

    Args:
        caption: Text prompt describing the desired sound.

    Returns:
        Path to a 16 kHz WAV file on success, or ``None`` on failure.
    """
    try:
        # Bug fix: the original unconditionally called pipe.to("cuda"), which
        # crashes on CPU-only hosts; pick the device that actually exists
        # (mirrors the device_id selection done at import time).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        pipe.to(device)
        try:
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Always park the model back on CPU so GPU memory is released
            # even when generation raises.
            pipe.to("cpu")

        audio = audio_output.audios[0]
        # delete=False is intentional: the caller (Gradio) reads this path
        # after we return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
            return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
# Styling for the Gradio Blocks UI defined below.
css = """
#header-container {
    text-align: center;
    margin: 20px 0;
}
#header-title {
    font-size: 36px;
    font-weight: bold;
    margin-bottom: 10px;
}
#header-subtitle {
    font-size: 18px;
    margin-bottom: 20px;
    color: #6c757d;
}
#main-container {
    max-width: 900px;
    margin: 0 auto;
    padding: 20px;
    border-radius: 12px;
    background-color: #f9f9f9;
    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
button.primary-button {
    background-color: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 5px;
    font-size: 16px;
    cursor: pointer;
}
button.primary-button:hover {
    background-color: #0056b3;
}
"""
with gr.Blocks(css=css) as demo:
    # Header banner.
    with gr.Column(elem_id="header-container"):
        gr.HTML("""
        <div id="header-title">🎶 Image-to-Sound Generator</div>
        <div id="header-subtitle">Transform your images into descriptive captions and immersive soundscapes.</div>
        """)

    # Main workflow: upload -> caption -> sound.
    with gr.Group(elem_id="main-container"):
        gr.Markdown("""
        ### How It Works
        1. **Upload an Image**: Select an image to analyze.
        2. **Generate Description**: Get a detailed caption describing your image.
        3. **Generate Sound**: Create an audio representation based on the caption.
        """)

        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", elem_classes="primary-button")
        caption_display = gr.Textbox(label="Generated Caption", interactive=False, placeholder="Your image caption will appear here.")
        generate_sound_button = gr.Button("Generate Sound", elem_classes="primary-button")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    with gr.Group():
        gr.Markdown("""
        ## About This App
        This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.
        ### Powered By
        - [Hugging Face](https://huggingface.co)
        - [Diffusion Models](https://huggingface.co/models)
        For inquiries, contact us at [[email protected]](mailto:[email protected]).
        """)

    def update_caption(image_file):
        """Click handler: caption the uploaded image, discarding the error flag
        (the error text itself is shown in the caption textbox)."""
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Click handler: turn the displayed caption into audio.

        Skips generation when the caption is empty or is an error message
        produced by update_caption.
        """
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

# Guarding launch keeps the module importable (e.g. for tests); Spaces runs
# app.py as __main__, so behavior there is unchanged.
if __name__ == "__main__":
    demo.launch(debug=True, share=True)