quazim committed on
Commit
bb422a5
·
1 Parent(s): 3d157c8
Files changed (1) hide show
  1. app.py +29 -17
app.py CHANGED
@@ -132,30 +132,43 @@ def generate_music(text_prompt, duration=10, guidance_scale=3.0):
132
  output = outputs[0]
133
  audio_data = output['audio']
134
  sample_rate = output['sampling_rate']
135
-
136
  print(f"[GENERATION] Audio shape: {audio_data.shape}")
137
  print(f"[GENERATION] Sample rate: {sample_rate}")
138
-
139
- if len(audio_data.shape) > 1:
140
- audio_data = audio_data[0] if audio_data.shape[0] < audio_data.shape[1] else audio_data[:, 0]
141
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  audio_data = audio_data.flatten()
143
-
 
 
144
  max_val = np.max(np.abs(audio_data))
145
  if max_val > 0:
146
  audio_data = audio_data / max_val * 0.95 # Scale to 95% to avoid clipping
147
-
148
  audio_data = audio_data.astype(np.float32)
149
-
150
  print(f"[GENERATION] Final audio shape: {audio_data.shape}")
151
  print(f"[GENERATION] Audio range: [{np.min(audio_data):.3f}, {np.max(audio_data):.3f}]")
 
152
 
153
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
154
- sf.write(tmp_file.name, audio_data, sample_rate)
155
- temp_path = tmp_file.name
156
-
157
- print(f"[GENERATION] Audio saved to: {temp_path}")
158
- return temp_path
159
 
160
  except Exception as e:
161
  print(f"[ERROR] Generation failed: {str(e)}")
@@ -198,9 +211,8 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
198
  with gr.Column():
199
  audio_output = gr.Audio(
200
  label="Generated Music",
201
- type="filepath",
202
- format="wav",
203
- interactive=False
204
  )
205
 
206
  with gr.Accordion("Tips", open=False):
 
132
  output = outputs[0]
133
  audio_data = output['audio']
134
  sample_rate = output['sampling_rate']
135
+
136
  print(f"[GENERATION] Audio shape: {audio_data.shape}")
137
  print(f"[GENERATION] Sample rate: {sample_rate}")
138
+ print(f"[GENERATION] Audio dtype: {audio_data.dtype}")
139
+ print(f"[GENERATION] Audio is numpy: {type(audio_data)}")
140
+
141
+ if hasattr(audio_data, 'cpu'):
142
+ audio_data = audio_data.cpu().numpy()
143
+
144
+ print(f"[GENERATION] Audio shape after tensor conversion: {audio_data.shape}")
145
+
146
+ if len(audio_data.shape) == 3:
147
+ audio_data = audio_data[0]
148
+
149
+ if len(audio_data.shape) == 2:
150
+ if audio_data.shape[0] < audio_data.shape[1]:
151
+ audio_data = audio_data.T
152
+ if audio_data.shape[1] > 1:
153
+ audio_data = audio_data[:, 0]
154
+ else:
155
+ audio_data = audio_data.flatten()
156
+
157
  audio_data = audio_data.flatten()
158
+
159
+ print(f"[GENERATION] Audio shape after flattening: {audio_data.shape}")
160
+
161
  max_val = np.max(np.abs(audio_data))
162
  if max_val > 0:
163
  audio_data = audio_data / max_val * 0.95 # Scale to 95% to avoid clipping
164
+
165
  audio_data = audio_data.astype(np.float32)
166
+
167
  print(f"[GENERATION] Final audio shape: {audio_data.shape}")
168
  print(f"[GENERATION] Audio range: [{np.min(audio_data):.3f}, {np.max(audio_data):.3f}]")
169
+ print(f"[GENERATION] Sample rate: {sample_rate}")
170
 
171
+ return (sample_rate, audio_data)
 
 
 
 
 
172
 
173
  except Exception as e:
174
  print(f"[ERROR] Generation failed: {str(e)}")
 
211
  with gr.Column():
212
  audio_output = gr.Audio(
213
  label="Generated Music",
214
+ type="numpy",
215
+ interactive=False,
 
216
  )
217
 
218
  with gr.Accordion("Tips", open=False):