Spaces: Running on Zero
import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline

# Pull HF_TKN from a local .env file so the token never lives in source control.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a device index: GPU 0 when available, -1 for CPU.
device_id = 0 if torch.cuda.is_available() else -1

# Image -> text captioner (ViT encoder + GPT-2 decoder).
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text -> audio latent-diffusion pipeline. `token` replaces the deprecated
# `use_auth_token` argument; behavior is unchanged.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    token=hf_token,
)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image with the ViT-GPT2 pipeline.

    Args:
        image_file: Raw image bytes, as delivered by ``gr.File(type="binary")``.

    Returns:
        tuple[str, bool]: ``(caption_or_error_message, error_flag)``. The flag
        is True when captioning failed, so callers can skip audio generation.
    """
    temp_image_path = None
    try:
        # The HF pipeline expects a path (or PIL image), not raw bytes, so
        # spill the upload to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # The original code leaked the delete=False temp file on every call;
        # remove it once the pipeline has consumed it.
        if temp_image_path:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass
def get_audioldm_from_caption(caption):
    """Generate a sound clip for *caption* with the AudioLDM2 pipeline.

    Args:
        caption: Text prompt describing the desired sound.

    Returns:
        str | None: Path to a 16 kHz WAV file, or None if generation failed.
    """
    try:
        # Only move the pipeline to CUDA when a GPU exists; the original
        # unconditional pipe.to("cuda") raised on CPU-only hosts.
        if torch.cuda.is_available():
            pipe.to("cuda")
        try:
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Always park the pipeline back on CPU so an inference error
            # does not leave it pinned to GPU memory.
            pipe.to("cpu")

        audio = audio_output.audios[0]
        # delete=False: the file must outlive this function so Gradio can
        # serve it to the browser.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
            return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
# Page styling: centered header, card-like main panel, blue primary buttons.
css = """
#header-container {
    text-align: center;
    margin: 20px 0;
}
#header-title {
    font-size: 36px;
    font-weight: bold;
    margin-bottom: 10px;
}
#header-subtitle {
    font-size: 18px;
    margin-bottom: 20px;
    color: #6c757d;
}
#main-container {
    max-width: 900px;
    margin: 0 auto;
    padding: 20px;
    border-radius: 12px;
    background-color: #f9f9f9;
    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
}
button.primary-button {
    background-color: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    border-radius: 5px;
    font-size: 16px;
    cursor: pointer;
}
button.primary-button:hover {
    background-color: #0056b3;
}
"""
with gr.Blocks(css=css) as demo:
    # --- Header banner ---------------------------------------------------
    with gr.Column(elem_id="header-container"):
        gr.HTML("""
<div id="header-title">🎶 Image-to-Sound Generator</div>
<div id="header-subtitle">Transform your images into descriptive captions and immersive soundscapes.</div>
""")

    # --- Main workflow panel ---------------------------------------------
    with gr.Group(elem_id="main-container"):
        gr.Markdown("""
### How It Works
1. **Upload an Image**: Select an image to analyze.
2. **Generate Description**: Get a detailed caption describing your image.
3. **Generate Sound**: Create an audio representation based on the caption.
""")
        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button(
            "Generate Description", elem_classes="primary-button"
        )
        caption_display = gr.Textbox(
            label="Generated Caption",
            interactive=False,
            placeholder="Your image caption will appear here.",
        )
        generate_sound_button = gr.Button(
            "Generate Sound", elem_classes="primary-button"
        )
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    # --- About section ---------------------------------------------------
    with gr.Group():
        gr.Markdown("""
## About This App
This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.
### Powered By
- [Hugging Face](https://huggingface.co)
- [Diffusion Models](https://huggingface.co/models)
For inquiries, contact us at [[email protected]](mailto:[email protected]).
""")

    def update_caption(image_file):
        """Return a caption for the uploaded image, or a user-facing error."""
        if image_file is None:
            # Clicking "Generate Description" before uploading previously
            # crashed inside tempfile.write(None); report it instead.
            return "Error: Please upload an image first."
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Generate audio for a valid caption; return None to keep the player empty."""
        # Also reject "No caption was generated." — the original guard only
        # matched "Error" prefixes and would send that error text to the
        # audio model as a prompt.
        if not description or description.startswith(("Error", "No caption")):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

demo.launch(debug=True, share=True)