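"""Image-to-Sound Generator.

Gradio Space that captions an uploaded image with a ViT-GPT2 model and then
synthesizes a matching sound effect from the caption with AudioLDM 2.
"""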
import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline

load_dotenv()  # pull HF_TKN from a local .env file, if present
hf_token = os.getenv("HF_TKN")

device_id = 0 if torch.cuda.is_available() else -1  # 0 = first GPU, -1 = CPU
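
# Image-captioning model: ViT image encoder with a GPT-2 text decoder.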
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
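
# The AudioLDM 2 pipeline stays on the CPU at startup; get_audioldm_from_caption
# moves it onto the GPU for each request and back again afterwards.
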
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption the uploaded image bytes. Returns (message, error_flag)."""
    temp_image_path = None
    try:
        # gr.File(type="binary") supplies raw bytes; write them to a temp file
        # so the captioning pipeline can read them as an image.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # Remove the temporary image once captioning is done.
        if temp_image_path and os.path.exists(temp_image_path):
            os.remove(temp_image_path)

@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a sound effect for the caption; returns a .wav path, or None on failure."""
    try:
        # Move the diffusion pipeline to the GPU for this request only, then
        # back to the CPU so GPU memory is freed between calls.
        pipe.to("cuda")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5,
        )
        pipe.to("cpu")

        # AudioLDM 2 generates 16 kHz audio.
        audio = audio_output.audios[0]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
        return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
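
# Hypothetical direct usage of the two steps above, outside the Gradio UI
# (the file name is an assumption for illustration):
#   image_bytes = open("example.jpg", "rb").read()
#   caption, failed = analyze_image_with_free_model(image_bytes)
#   if not failed:
#       wav_path = get_audioldm_from_caption(caption)
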
css = """
#header-container {
text-align: center;
margin: 20px 0;
}
#header-title {
font-size: 36px;
font-weight: bold;
margin-bottom: 10px;
}
#header-subtitle {
font-size: 18px;
margin-bottom: 20px;
color: #6c757d;
}
#main-container {
max-width: 900px;
margin: 0 auto;
padding: 20px;
border-radius: 12px;
background-color: #f9f9f9;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
button.primary-button {
background-color: #007bff;
color: white;
border: none;
padding: 10px 20px;
border-radius: 5px;
font-size: 16px;
cursor: pointer;
}
button.primary-button:hover {
background-color: #0056b3;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="header-container"):
        gr.HTML("""
            <div id="header-title">🎶 Image-to-Sound Generator</div>
            <div id="header-subtitle">Transform your images into descriptive captions and immersive soundscapes.</div>
        """)

    with gr.Group(elem_id="main-container"):
        gr.Markdown("""
            ### How It Works
            1. **Upload an Image**: Select an image to analyze.
            2. **Generate Description**: Get a detailed caption describing your image.
            3. **Generate Sound**: Create an audio representation based on the caption.
        """)

        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", elem_classes="primary-button")
        caption_display = gr.Textbox(
            label="Generated Caption",
            interactive=False,
            placeholder="Your image caption will appear here.",
        )
        generate_sound_button = gr.Button("Generate Sound", elem_classes="primary-button")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    with gr.Group():
        gr.Markdown("""
            ## About This App
            This application uses machine learning models to turn an image into a
            descriptive caption and then into a matching sound effect, combining
            visual and auditory generation in a single workflow.

            ### Powered By
            - [Hugging Face](https://huggingface.co)
            - [Diffusion Models](https://huggingface.co/models)

            For inquiries, contact us at [[email protected]](mailto:[email protected]).
        """)

    def update_caption(image_file):
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        # Don't attempt audio generation when captioning failed or was empty.
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )
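
# Note: share=True only matters for local runs; on Hugging Face Spaces it is
# not supported and Gradio falls back to the Space's own URL.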
demo.launch(debug=True, share=True)