napstablook911 committed
Commit c5c0d60 · verified · 1 Parent(s): 3490480

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +148 -106
src/streamlit_app.py CHANGED
@@ -1,119 +1,161 @@
  import streamlit as st
  import torch
- import torchaudio
- from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
- from einops import rearrange
- from stable_audio_tools import get_pretrained_model
- from stable_audio_tools.inference.generation import generate_diffusion_cond
- import io  # To save the audio in memory for Streamlit
-
- st.set_page_config(layout="wide")
-
- st.title("Image Captioning and Soundscape Generation")
-
- # Function to load the models and cache them
- @st.cache_resource
- def load_models():
-     # Set the device to "cpu", as required for this Space
-     device = "cpu"
-     st.write(f"Using device: {device}")
-
-     # Load the ViT-GPT2 image-captioning model
-     st.write("Loading the ViT-GPT2 image-captioning model...")
      try:
-         vit_gpt2_feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning", cache_dir="/app/hf_cache")
-         vit_gpt2_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning", cache_dir="/app/hf_cache")
-         vit_gpt2_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning", cache_dir="/app/hf_cache").to(device)
-         st.write("ViT-GPT2 model loaded.")
      except Exception as e:
-         st.error(f"Error while loading the ViT-GPT2 model: {e}")
-         st.stop()  # Stop the app if this essential model fails to load
-
-     # Load the Stable Audio Open Small model for soundscape generation
-     st.write("Loading the Stable Audio Open Small model for soundscape generation...")
      try:
-         # Load the Stable Audio model using stable_audio_tools
-         stable_audio_model, stable_audio_config = get_pretrained_model("stabilityai/stable-audio-open-small", cache_dir="/app/hf_cache")
-         stable_audio_model = stable_audio_model.to(device)
-         st.write("Stable Audio Open Small model loaded.")
-         return vit_gpt2_feature_extractor, vit_gpt2_model, vit_gpt2_tokenizer, stable_audio_model, stable_audio_config
      except Exception as e:
-         st.error(f"Error while loading the Stable Audio Open Small model: {e}")
-         st.stop()  # Stop the app if this essential model fails to load
-
-
- # Load the models at app startup
- vit_gpt2_feature_extractor, vit_gpt2_model, vit_gpt2_tokenizer, stable_audio_model, stable_audio_config = load_models()
-
- # Function to generate the image caption
- def generate_caption(image_pil):
-     pixel_values = vit_gpt2_feature_extractor(images=image_pil, return_tensors="pt").pixel_values
-     output_ids = vit_gpt2_model.generate(pixel_values, max_new_tokens=16)
-     caption = vit_gpt2_tokenizer.decode(output_ids[0], skip_special_tokens=True)
-     return caption
-
- # Function to generate the soundscape
- def generate_soundscape(prompt_text):
-     sample_size = stable_audio_config["sample_size"]
-     sample_rate = stable_audio_config["sample_rate"]
-
-     # Keep the model on the CPU for generation
-     device = "cpu"
-
-     conditioning = [{
-         "prompt": prompt_text,
-     }]
-
-     # Generate the audio
-     with st.spinner("Generating audio... (this may take a while)"):
-         output = generate_diffusion_cond(
-             stable_audio_model,
-             conditioning=conditioning,
-             sample_size=sample_size,
-             device=device,
-             steps=100,  # Number of diffusion steps (could be made configurable)
-             cfg_scale=7,  # Classifier-free guidance scale
-             sigma_min=0.03,
-             sigma_max=500,
-             sampler_type="dpmpp-3m-sde"  # Sampler type
-         )
-
-     # Rearrange the audio batch into a single sequence
-     output = rearrange(output, "b d n -> d (b n)")
-
-     # Peak-normalize, clip, convert to int16, and prepare for playback
-     output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
-
-     # Save the audio to an in-memory buffer for Streamlit
-     buffer = io.BytesIO()
-     torchaudio.save(buffer, output, sample_rate, format="wav")
-     return buffer.getvalue(), sample_rate
-
- # Streamlit UI
- uploaded_file = st.file_uploader("Upload an image for captioning:", type=["png", "jpg", "jpeg"])
-
- caption = ""
  if uploaded_file is not None:
-     from PIL import Image
-     image = Image.open(uploaded_file).convert("RGB")
-     st.image(image, caption="Uploaded image.", use_column_width=True)
-
-     with st.spinner("Generating the caption..."):
-         caption = generate_caption(image)
-     st.success(f"Generated caption: **{caption}**")
-
- # Input field for the soundscape prompt
- st.header("Soundscape Generation")
- soundscape_prompt_input = st.text_input(
-     "Enter a prompt for the soundscape (e.g. 'A gentle rain with thunder and distant birds'):",
-     value=caption if caption else "A natural outdoor soundscape"  # Pre-fill with the caption if available
- )
-
- if st.button("Generate Soundscape Audio"):
-     if soundscape_prompt_input:
-         audio_bytes, sr = generate_soundscape(soundscape_prompt_input)
-         st.audio(audio_bytes, format='audio/wav', sample_rate=sr)
-     else:
-         st.warning("Please enter a prompt to generate the soundscape.")
-
- st.info("Note: soundscape generation may take some time depending on prompt complexity and available resources.")
  import streamlit as st
+ from PIL import Image
+ import io
+ import soundfile as sf
+ import numpy as np
  import torch
+ from transformers import pipeline
+ from diffusers import StableAudioPipeline
+
+ # --- Configuration ---
+ # Pick the best device for model inference:
+ # prefer CUDA (NVIDIA GPUs), then MPS (Apple Silicon), and fall back to CPU.
+ DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
+
+ # Use float16 for lower memory use and faster inference on compatible hardware (GPU/MPS);
+ # fall back to float32 on CPU for better stability.
+ TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32
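+ # Note (assumption): float16 on MPS can still be flaky for some diffusion ops;
+ # if generation yields NaNs or silence there, forcing torch.float32 is a safe fallback.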
+
+ # --- Cached Model Loading Functions ---
+ @st.cache_resource(show_spinner="Loading Image Captioning Model (BLIP)...")
+ def load_blip_model():
+     """
+     Load the BLIP image-captioning model via the Hugging Face transformers pipeline.
+     The model is cached to prevent reloading on every Streamlit rerun.
+     """
      try:
+         captioner = pipeline(
+             "image-to-text",
+             model="Salesforce/blip-image-captioning-base",
+             torch_dtype=TORCH_DTYPE,
+             device=DEVICE
+         )
+         return captioner
      except Exception as e:
+         st.error(f"Failed to load BLIP model: {e}")
+         return None
+
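+ # Reminder: an "image-to-text" pipeline accepts PIL images directly and returns a
+ # list of dicts like [{"generated_text": "..."}]; callers take element [0].
+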
+ @st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open 1.0)...")
+ def load_stable_audio_model():
+     """
+     Load the Stable Audio Open 1.0 pipeline via Hugging Face diffusers.
+     The pipeline is cached to prevent reloading on every Streamlit rerun.
+     """
      try:
+         audio_pipeline = StableAudioPipeline.from_pretrained(
+             "stabilityai/stable-audio-open-1.0",
+             torch_dtype=TORCH_DTYPE
+         ).to(DEVICE)
+         return audio_pipeline
      except Exception as e:
+         st.error(f"Failed to load Stable Audio model: {e}")
+         return None
+
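+ # Note (assumption): stabilityai/stable-audio-open-1.0 is a gated repository on the Hub,
+ # so the first download may require accepting the model license and authenticating
+ # (e.g. via `huggingface-cli login` or an HF_TOKEN environment variable).
+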
+ # --- Audio Conversion Utility ---
+ def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
+     """
+     Convert a NumPy audio array to an in-memory WAV byte stream.
+     This avoids writing temporary files to disk, which is efficient and
+     suits ephemeral environments like Hugging Face Spaces.
+     """
+     byte_io = io.BytesIO()
+
+     # Stable Audio's diffusers output is (channels, frames), while soundfile
+     # expects (frames, channels) for multi-channel audio, so transpose 2-D input.
+     if audio_array.ndim == 2 and audio_array.shape[0] == 2:  # stereo: channels-first
+         audio_array = audio_array.T  # transpose to (frames, channels)
+
+     # Write the NumPy array into the in-memory BytesIO object as a WAV file
+     sf.write(byte_io, audio_array, sample_rate, format='WAV', subtype='FLOAT')
+
+     # IMPORTANT: reset the stream position to the beginning before reading
+     byte_io.seek(0)
+     return byte_io.read()
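+
+ # Example (hypothetical): round-trip a one-second stereo test tone to sanity-check
+ # the (channels, frames) -> (frames, channels) handling:
+ #     t = np.linspace(0, 1.0, 44100, dtype=np.float32)
+ #     tone = np.stack([np.sin(2 * np.pi * 440 * t), np.sin(2 * np.pi * 660 * t)])  # (2, 44100)
+ #     wav_bytes = convert_numpy_to_wav_bytes(tone, 44100)  # playable via st.audio(wav_bytes)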
+
+ # --- Streamlit App Layout ---
+ st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")
+ st.title("🏞️ Image-to-Soundscape Generator 🎶")
+ st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")
+
+ # Initialize session state for persistence across reruns
+ if "audio_bytes" not in st.session_state:
+     st.session_state.audio_bytes = None
+ if "image_uploaded" not in st.session_state:
+     st.session_state.image_uploaded = False
+
+ # --- UI Components ---
+ uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])
+
  if uploaded_file is not None:
+     st.session_state.image_uploaded = True
+     image = Image.open(uploaded_file).convert("RGB")  # Ensure the image is in RGB format
+     st.image(image, caption="Uploaded Image", use_column_width=True)
+
+     # Button that triggers the generation pipeline
+     if st.button("Generate Soundscape"):
+         st.session_state.audio_bytes = None  # Clear previous audio
+
+         with st.spinner("Generating soundscape... This may take a moment."):
+             try:
+                 # 1. Load the BLIP model and generate a caption (hidden from the user)
+                 captioner = load_blip_model()
+                 if captioner is None:
+                     st.error("Image captioning model could not be loaded. Please try again.")
+                     st.session_state.image_uploaded = False  # Reset to allow re-upload
+                     st.stop()
+
+                 # Generate the caption. The BLIP pipeline accepts a PIL Image directly
+                 # and returns a list of dicts, so take the first result's text.
+                 caption_results = captioner(image)
+                 generated_caption = caption_results[0]['generated_text']
+
+                 # Optional: rephrase the caption as a soundscape prompt to steer
+                 # the audio model towards environmental sounds.
+                 soundscape_prompt = f"A soundscape of {generated_caption}"
+
+                 # 2. Load the Stable Audio model and generate audio
+                 audio_pipeline = load_stable_audio_model()
+                 if audio_pipeline is None:
+                     st.error("Audio generation model could not be loaded. Please try again.")
+                     st.session_state.image_uploaded = False  # Reset to allow re-upload
+                     st.stop()
+
+                 # Generate audio with parameters tuned for speed:
+                 # num_inference_steps: lower is faster, higher gives better quality;
+                 # audio_end_in_s: shorter clips generate faster;
+                 # negative_prompt: helps improve perceived quality.
+                 audio_output = audio_pipeline(
+                     prompt=soundscape_prompt,
+                     num_inference_steps=50,  # tuned for faster generation
+                     audio_end_in_s=10.0,  # 10 seconds of audio
+                     negative_prompt="low quality, average quality, distorted"
+                 )
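+
+                 # Assumption: for reproducible output, a seeded generator could be passed
+                 # to the pipeline call, e.g. generator=torch.Generator(device=DEVICE).manual_seed(0).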
+
+                 # Extract the audio and sample rate from the diffusers output.
+                 # `audios` is a torch tensor of shape (batch, channels, samples), so take
+                 # the first batch item and convert it to a float32 NumPy array.
+                 audio_numpy_array = audio_output.audios[0].to(torch.float32).cpu().numpy()
+                 sample_rate = audio_pipeline.vae.sampling_rate
+
+                 # 3. Convert the NumPy array to WAV bytes and store it in session state
+                 st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
+
+                 st.success("Soundscape generated successfully!")
+
+             except Exception as e:
+                 st.error(f"An error occurred during generation: {e}")
+                 st.session_state.audio_bytes = None  # Clear any partial audio
+                 st.session_state.image_uploaded = False  # Reset to allow re-upload
+                 st.exception(e)  # Show the full traceback for debugging
+
+ # Display the generated soundscape if it is available in session state
+ if st.session_state.audio_bytes:
+     st.subheader("Generated Soundscape:")
+     st.audio(st.session_state.audio_bytes, format='audio/wav')
+     st.markdown("You can download the audio using the controls above.")
+
+ # Reset button for uploading a new image
+ if st.session_state.image_uploaded and st.button("Upload New Image"):
+     st.session_state.audio_bytes = None
+     st.session_state.image_uploaded = False
+     st.rerun()  # Rerun the app to clear the file uploader
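
To try the updated app locally, a minimal sketch (assuming the imports above are satisfied by streamlit, pillow, soundfile, numpy, torch, transformers, and diffusers):

    streamlit run src/streamlit_app.py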