napstablook911 committed
Commit 8e93ddc · verified · 1 Parent(s): 463aeec

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +17 -16
src/streamlit_app.py CHANGED
@@ -35,15 +35,16 @@ def load_blip_model():
         st.error(f"Failed to load BLIP model: {e}")
         return None
 
-@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open 1.0)...")
+@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
 def load_stable_audio_model():
     """
-    Loads the Stable Audio Open 1.0 pipeline using Hugging Face diffusers.
+    Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
     The pipeline is cached to prevent reloading on every Streamlit rerun.
     """
     try:
+        # Changed model to stabilityai/stable-audio-open-small
         audio_pipeline = StableAudioPipeline.from_pretrained(
-            "stabilityai/stable-audio-open-1.0",
+            "stabilityai/stable-audio-open-small", # <--- MODEL CHANGED HERE
             torch_dtype=TORCH_DTYPE
         ).to(DEVICE)
         return audio_pipeline
@@ -90,7 +91,7 @@ uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jp
 if uploaded_file is not None:
     st.session_state.image_uploaded = True
     image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
-    st.image(image, caption="Uploaded Image", use_container_width=True)
+    st.image(image, caption="Uploaded Image", use_container_width=True) # Updated deprecated parameter [6]
 
     # Button to trigger the generation pipeline
     if st.button("Generate Soundscape"):
@@ -108,9 +109,9 @@ if uploaded_file is not None:
             # Generate caption
             # The BLIP pipeline expects a PIL Image object directly
             caption_results = captioner(image)
-            # Extract the generated text from the pipeline's output [7]
+            # Extract the generated text from the pipeline's output
            generated_caption = caption_results[0]['generated_text']
-
+
             # Optional: Enhance prompt for soundscape generation
             # This helps guide the audio model towards environmental sounds
             soundscape_prompt = f"A soundscape of {generated_caption}"
@@ -122,18 +123,18 @@ if uploaded_file is not None:
                 st.session_state.image_uploaded = False # Reset to allow re-upload
                 st.stop()
 
-            # Generate audio with optimized parameters for speed [8, 9]
+            # Generate audio with optimized parameters for speed [7, 8]
             # num_inference_steps: Lower for faster generation, higher for better quality
-            # audio_end_in_s: Shorter audio for faster generation
-            # negative_prompt: Helps improve perceived quality [9]
+            # audio_end_in_s: Shorter audio for faster generation (max 11s for stable-audio-open-small) [10, 11, 12]
+            # negative_prompt: Helps improve perceived quality [8]
             audio_output = audio_pipeline(
                 prompt=soundscape_prompt,
-                num_inference_steps=50, # Tuned for faster generation [9]
-                audio_end_in_s=10.0, # 10 seconds audio length [8]
-                negative_prompt="low quality, average quality, distorted" # [9]
+                num_inference_steps=50, # Tuned for faster generation [8]
+                audio_end_in_s=10.0, # 10 seconds audio length (within 11s limit for small model) [10, 11, 12]
+                negative_prompt="low quality, average quality, distorted" # [8]
             )
 
-            # Extract the NumPy array and sample rate [10]
+            # Extract the NumPy array and sample rate [9]
             audio_numpy_array = audio_output.audios
             sample_rate = audio_pipeline.config.sampling_rate
 
@@ -143,15 +144,15 @@ if uploaded_file is not None:
             st.success("Soundscape generated successfully!")
 
         except Exception as e:
-            st.error(f"An error occurred during generation: {e}") # [11]
+            st.error(f"An error occurred during generation: {e}") #
             st.session_state.audio_bytes = None # Clear any partial audio
             st.session_state.image_uploaded = False # Reset to allow re-upload
-            st.exception(e) # Display full traceback for debugging [11]
+            st.exception(e) # Display full traceback for debugging
 
     # Display generated soundscape if available in session state
     if st.session_state.audio_bytes:
         st.subheader("Generated Soundscape:")
-        st.audio(st.session_state.audio_bytes, format='audio/wav') # [6, 12]
+        st.audio(st.session_state.audio_bytes, format='audio/wav') #
         st.markdown("You can download the audio using the controls above.")
 
     # Reset button for new image upload
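
For reference, here is a minimal, self-contained sketch (not part of the commit) of how the updated generation path is expected to behave end to end. It assumes the `stabilityai/stable-audio-open-small` checkpoint loads through the same diffusers `StableAudioPipeline` interface the app already uses, mirrors the app's `DEVICE`/`TORCH_DTYPE` constants with placeholder values, and uses an illustrative prompt; the sample rate is read from `pipe.vae.sampling_rate` as in the diffusers documentation, whereas the app reads `audio_pipeline.config.sampling_rate`.

```python
# Sketch only: end-to-end audio generation with the small checkpoint,
# assuming it is compatible with diffusers' StableAudioPipeline.
import io

import soundfile as sf
import torch
from diffusers import StableAudioPipeline

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"            # placeholder for the app's DEVICE
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # placeholder for TORCH_DTYPE

pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-small",  # assumption: same pipeline layout as stable-audio-open-1.0
    torch_dtype=TORCH_DTYPE,
).to(DEVICE)

result = pipe(
    prompt="A soundscape of a misty mountain lake at dawn",  # illustrative prompt
    negative_prompt="low quality, average quality, distorted",
    num_inference_steps=50,
    audio_end_in_s=10.0,  # stays under the ~11 s limit cited in the commit comments
)

# .audios is a (batch, channels, samples) tensor; transpose to (samples, channels)
# and take the sample rate from the VAE, following the diffusers docs.
waveform = result.audios[0].T.float().cpu().numpy()
sample_rate = pipe.vae.sampling_rate

# Serialize to in-memory WAV bytes, which is what st.audio(..., format='audio/wav') expects.
buffer = io.BytesIO()
sf.write(buffer, waveform, sample_rate, format="WAV")
wav_bytes = buffer.getvalue()
```

Keeping `audio_end_in_s` at 10.0 leaves headroom against the small model's stated length limit, and writing through an in-memory buffer avoids touching disk inside the Streamlit rerun loop.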