Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +17 -16
src/streamlit_app.py
CHANGED
@@ -35,15 +35,16 @@ def load_blip_model():
|
|
35 |
st.error(f"Failed to load BLIP model: {e}")
|
36 |
return None
|
37 |
|
38 |
-
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open
|
39 |
def load_stable_audio_model():
|
40 |
"""
|
41 |
-
Loads the Stable Audio Open
|
42 |
The pipeline is cached to prevent reloading on every Streamlit rerun.
|
43 |
"""
|
44 |
try:
|
|
|
45 |
audio_pipeline = StableAudioPipeline.from_pretrained(
|
46 |
-
"stabilityai/stable-audio-open-
|
47 |
torch_dtype=TORCH_DTYPE
|
48 |
).to(DEVICE)
|
49 |
return audio_pipeline
|
@@ -90,7 +91,7 @@ uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jp
|
|
90 |
if uploaded_file is not None:
|
91 |
st.session_state.image_uploaded = True
|
92 |
image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
|
93 |
-
st.image(image, caption="Uploaded Image", use_container_width=True)
|
94 |
|
95 |
# Button to trigger the generation pipeline
|
96 |
if st.button("Generate Soundscape"):
|
@@ -108,9 +109,9 @@ if uploaded_file is not None:
|
|
108 |
# Generate caption
|
109 |
# The BLIP pipeline expects a PIL Image object directly
|
110 |
caption_results = captioner(image)
|
111 |
-
# Extract the generated text from the pipeline's output
|
112 |
generated_caption = caption_results[0]['generated_text']
|
113 |
-
|
114 |
# Optional: Enhance prompt for soundscape generation
|
115 |
# This helps guide the audio model towards environmental sounds
|
116 |
soundscape_prompt = f"A soundscape of {generated_caption}"
|
@@ -122,18 +123,18 @@ if uploaded_file is not None:
|
|
122 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
123 |
st.stop()
|
124 |
|
125 |
-
# Generate audio with optimized parameters for speed [
|
126 |
# num_inference_steps: Lower for faster generation, higher for better quality
|
127 |
-
# audio_end_in_s: Shorter audio for faster generation
|
128 |
-
# negative_prompt: Helps improve perceived quality [
|
129 |
audio_output = audio_pipeline(
|
130 |
prompt=soundscape_prompt,
|
131 |
-
num_inference_steps=50, # Tuned for faster generation [
|
132 |
-
audio_end_in_s=10.0, # 10 seconds audio length [
|
133 |
-
negative_prompt="low quality, average quality, distorted" # [
|
134 |
)
|
135 |
|
136 |
-
# Extract the NumPy array and sample rate [
|
137 |
audio_numpy_array = audio_output.audios
|
138 |
sample_rate = audio_pipeline.config.sampling_rate
|
139 |
|
@@ -143,15 +144,15 @@ if uploaded_file is not None:
|
|
143 |
st.success("Soundscape generated successfully!")
|
144 |
|
145 |
except Exception as e:
|
146 |
-
st.error(f"An error occurred during generation: {e}") #
|
147 |
st.session_state.audio_bytes = None # Clear any partial audio
|
148 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
149 |
-
st.exception(e) # Display full traceback for debugging
|
150 |
|
151 |
# Display generated soundscape if available in session state
|
152 |
if st.session_state.audio_bytes:
|
153 |
st.subheader("Generated Soundscape:")
|
154 |
-
st.audio(st.session_state.audio_bytes, format='audio/wav') #
|
155 |
st.markdown("You can download the audio using the controls above.")
|
156 |
|
157 |
# Reset button for new image upload
|
|
|
35 |
st.error(f"Failed to load BLIP model: {e}")
|
36 |
return None
|
37 |
|
38 |
+
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
|
39 |
def load_stable_audio_model():
|
40 |
"""
|
41 |
+
Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
|
42 |
The pipeline is cached to prevent reloading on every Streamlit rerun.
|
43 |
"""
|
44 |
try:
|
45 |
+
# Changed model to stabilityai/stable-audio-open-small
|
46 |
audio_pipeline = StableAudioPipeline.from_pretrained(
|
47 |
+
"stabilityai/stable-audio-open-small", # <--- MODEL CHANGED HERE
|
48 |
torch_dtype=TORCH_DTYPE
|
49 |
).to(DEVICE)
|
50 |
return audio_pipeline
|
|
|
91 |
if uploaded_file is not None:
|
92 |
st.session_state.image_uploaded = True
|
93 |
image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
|
94 |
+
st.image(image, caption="Uploaded Image", use_container_width=True) # Updated deprecated parameter [6]
|
95 |
|
96 |
# Button to trigger the generation pipeline
|
97 |
if st.button("Generate Soundscape"):
|
|
|
109 |
# Generate caption
|
110 |
# The BLIP pipeline expects a PIL Image object directly
|
111 |
caption_results = captioner(image)
|
112 |
+
# Extract the generated text from the pipeline's output
|
113 |
generated_caption = caption_results[0]['generated_text']
|
114 |
+
|
115 |
# Optional: Enhance prompt for soundscape generation
|
116 |
# This helps guide the audio model towards environmental sounds
|
117 |
soundscape_prompt = f"A soundscape of {generated_caption}"
|
|
|
123 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
124 |
st.stop()
|
125 |
|
126 |
+
# Generate audio with optimized parameters for speed [7, 8]
|
127 |
# num_inference_steps: Lower for faster generation, higher for better quality
|
128 |
+
# audio_end_in_s: Shorter audio for faster generation (max 11s for stable-audio-open-small) [10, 11, 12]
|
129 |
+
# negative_prompt: Helps improve perceived quality [8]
|
130 |
audio_output = audio_pipeline(
|
131 |
prompt=soundscape_prompt,
|
132 |
+
num_inference_steps=50, # Tuned for faster generation [8]
|
133 |
+
audio_end_in_s=10.0, # 10 seconds audio length (within 11s limit for small model) [10, 11, 12]
|
134 |
+
negative_prompt="low quality, average quality, distorted" # [8]
|
135 |
)
|
136 |
|
137 |
+
# Extract the NumPy array and sample rate [9]
|
138 |
audio_numpy_array = audio_output.audios
|
139 |
sample_rate = audio_pipeline.config.sampling_rate
|
140 |
|
|
|
144 |
st.success("Soundscape generated successfully!")
|
145 |
|
146 |
except Exception as e:
|
147 |
+
st.error(f"An error occurred during generation: {e}") #
|
148 |
st.session_state.audio_bytes = None # Clear any partial audio
|
149 |
st.session_state.image_uploaded = False # Reset to allow re-upload
|
150 |
+
st.exception(e) # Display full traceback for debugging
|
151 |
|
152 |
# Display generated soundscape if available in session state
|
153 |
if st.session_state.audio_bytes:
|
154 |
st.subheader("Generated Soundscape:")
|
155 |
+
st.audio(st.session_state.audio_bytes, format='audio/wav') #
|
156 |
st.markdown("You can download the audio using the controls above.")
|
157 |
|
158 |
# Reset button for new image upload
|