Spaces:
Running
on
L40S
Running
on
L40S
updated
Browse files
app.py
CHANGED
@@ -12,7 +12,6 @@ os.environ['ELASTIC_LOG_LEVEL'] = 'DEBUG'
|
|
12 |
from transformers import AutoProcessor, pipeline
|
13 |
from elastic_models.transformers import MusicgenForConditionalGeneration
|
14 |
|
15 |
-
|
16 |
MODEL_CONFIG = {
|
17 |
'cost_per_hour': 1.8, # $1.8 per hour
|
18 |
}
|
@@ -207,9 +206,9 @@ def generate_music(text_prompt, duration=10, guidance_scale=3.0):
|
|
207 |
max_val = np.max(np.abs(audio_data))
|
208 |
if max_val > 0:
|
209 |
audio_data = audio_data / max_val * 0.95
|
210 |
-
|
211 |
audio_data = (audio_data * 32767).astype(np.int16)
|
212 |
-
|
213 |
print(f"[GENERATION] Final audio shape: {audio_data.shape}")
|
214 |
print(f"[GENERATION] Audio range: [{np.min(audio_data)}, {np.max(audio_data)}]")
|
215 |
print(f"[GENERATION] Audio dtype: {audio_data.dtype}")
|
@@ -225,7 +224,7 @@ def generate_music(text_prompt, duration=10, guidance_scale=3.0):
|
|
225 |
file_size = os.path.getsize(temp_path)
|
226 |
print(f"[GENERATION] Audio saved to: {temp_path}")
|
227 |
print(f"[GENERATION] File size: {file_size} bytes")
|
228 |
-
|
229 |
# Try returning numpy format instead
|
230 |
print(f"[GENERATION] Returning numpy tuple: ({sample_rate}, audio_array)")
|
231 |
return (sample_rate, audio_data)
|
@@ -265,7 +264,7 @@ def get_cache_key(prompt, duration, guidance_scale):
|
|
265 |
def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mode="compressed"):
|
266 |
try:
|
267 |
cache_key = get_cache_key(text_prompt, duration, guidance_scale)
|
268 |
-
|
269 |
generator, processor = load_model()
|
270 |
model_name = "Compressed (S)"
|
271 |
|
@@ -301,18 +300,18 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
|
|
301 |
|
302 |
audio_variants = []
|
303 |
sample_rate = outputs[0]['sampling_rate']
|
304 |
-
|
305 |
for i, output in enumerate(outputs):
|
306 |
audio_data = output['audio']
|
307 |
-
|
308 |
-
print(f"[GENERATION] Processing variant {i+1} audio shape: {audio_data.shape}")
|
309 |
-
|
310 |
if hasattr(audio_data, 'cpu'):
|
311 |
audio_data = audio_data.cpu().numpy()
|
312 |
|
313 |
if len(audio_data.shape) == 3:
|
314 |
audio_data = audio_data[0]
|
315 |
-
|
316 |
if len(audio_data.shape) == 2:
|
317 |
if audio_data.shape[0] < audio_data.shape[1]:
|
318 |
audio_data = audio_data.T
|
@@ -320,31 +319,31 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
|
|
320 |
audio_data = audio_data[:, 0]
|
321 |
else:
|
322 |
audio_data = audio_data.flatten()
|
323 |
-
|
324 |
audio_data = audio_data.flatten()
|
325 |
-
|
326 |
max_val = np.max(np.abs(audio_data))
|
327 |
if max_val > 0:
|
328 |
audio_data = audio_data / max_val * 0.95
|
329 |
-
|
330 |
audio_data = (audio_data * 32767).astype(np.int16)
|
331 |
audio_variants.append((sample_rate, audio_data))
|
332 |
-
|
333 |
-
print(f"[GENERATION] Variant {i+1} final shape: {audio_data.shape}")
|
334 |
|
335 |
comparison_message = ""
|
336 |
-
|
337 |
if cache_key in original_time_cache:
|
338 |
original_time = original_time_cache[cache_key]
|
339 |
cost_info = calculate_cost_savings(generation_time, original_time)
|
340 |
-
|
341 |
comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
|
342 |
print(f"[COST] Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%)")
|
343 |
else:
|
344 |
try:
|
345 |
print(f"[TIMING] Measuring original model speed for comparison...")
|
346 |
original_generator, original_processor = load_original_model()
|
347 |
-
|
348 |
original_start = time.time()
|
349 |
original_outputs = original_generator(
|
350 |
prompts,
|
@@ -352,25 +351,26 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
|
|
352 |
generate_kwargs=generation_params
|
353 |
)
|
354 |
original_time = time.time() - original_start
|
355 |
-
|
356 |
original_time_cache[cache_key] = original_time
|
357 |
-
|
358 |
cost_info = calculate_cost_savings(generation_time, original_time)
|
359 |
comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
|
360 |
-
print(
|
|
|
361 |
print(f"[TIMING] Original: {original_time:.2f}s, Compressed: {generation_time:.2f}s")
|
362 |
-
|
363 |
del original_generator, original_processor
|
364 |
cleanup_gpu()
|
365 |
print(f"[CLEANUP] Original model cleaned up after timing measurement")
|
366 |
-
|
367 |
except Exception as e:
|
368 |
print(f"[WARNING] Could not measure original timing: {e}")
|
369 |
compressed_cost = calculate_generation_cost(generation_time, 'S')
|
370 |
comparison_message = f"💸 Compressed Cost: ${compressed_cost:.4f} (could not compare with original)"
|
371 |
|
372 |
generation_info = f"✅ Generated 4 variants in {generation_time:.2f}s\n{comparison_message}"
|
373 |
-
|
374 |
return audio_variants[0], audio_variants[1], audio_variants[2], audio_variants[3], generation_info
|
375 |
|
376 |
except Exception as e:
|
@@ -382,7 +382,8 @@ def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mod
|
|
382 |
|
383 |
with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
|
384 |
gr.Markdown("# 🎵 MusicGen Large Music Generator")
|
385 |
-
gr.Markdown(
|
|
|
386 |
|
387 |
with gr.Row():
|
388 |
with gr.Column():
|
@@ -392,7 +393,7 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
|
|
392 |
lines=3,
|
393 |
value="A groovy funk bassline with a tight drum beat"
|
394 |
)
|
395 |
-
|
396 |
with gr.Row():
|
397 |
duration = gr.Slider(
|
398 |
minimum=5,
|
@@ -410,15 +411,15 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
|
|
410 |
info="Higher values follow prompt more closely"
|
411 |
)
|
412 |
|
413 |
-
generate_btn = gr.Button("🎵 Generate
|
414 |
|
415 |
with gr.Column():
|
416 |
generation_info = gr.Markdown("Ready to generate music variants with cost comparison vs original model")
|
417 |
-
|
418 |
with gr.Row():
|
419 |
audio_output1 = gr.Audio(label="Variant 1", type="numpy")
|
420 |
audio_output2 = gr.Audio(label="Variant 2", type="numpy")
|
421 |
-
|
422 |
with gr.Row():
|
423 |
audio_output3 = gr.Audio(label="Variant 3", type="numpy")
|
424 |
audio_output4 = gr.Audio(label="Variant 4", type="numpy")
|
@@ -431,9 +432,11 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
|
|
431 |
- Duration is limited to 30 seconds for faster generation
|
432 |
""")
|
433 |
|
|
|
434 |
def generate_simple(text_prompt, duration, guidance_scale):
|
435 |
return generate_music_batch(text_prompt, duration, guidance_scale, "compressed")
|
436 |
|
|
|
437 |
generate_btn.click(
|
438 |
fn=generate_simple,
|
439 |
inputs=[text_input, duration, guidance_scale],
|
|
|
12 |
from transformers import AutoProcessor, pipeline
|
13 |
from elastic_models.transformers import MusicgenForConditionalGeneration
|
14 |
|
|
|
15 |
MODEL_CONFIG = {
|
16 |
'cost_per_hour': 1.8, # $1.8 per hour
|
17 |
}
|
|
|
206 |
max_val = np.max(np.abs(audio_data))
|
207 |
if max_val > 0:
|
208 |
audio_data = audio_data / max_val * 0.95
|
209 |
+
|
210 |
audio_data = (audio_data * 32767).astype(np.int16)
|
211 |
+
|
212 |
print(f"[GENERATION] Final audio shape: {audio_data.shape}")
|
213 |
print(f"[GENERATION] Audio range: [{np.min(audio_data)}, {np.max(audio_data)}]")
|
214 |
print(f"[GENERATION] Audio dtype: {audio_data.dtype}")
|
|
|
224 |
file_size = os.path.getsize(temp_path)
|
225 |
print(f"[GENERATION] Audio saved to: {temp_path}")
|
226 |
print(f"[GENERATION] File size: {file_size} bytes")
|
227 |
+
|
228 |
# Try returning numpy format instead
|
229 |
print(f"[GENERATION] Returning numpy tuple: ({sample_rate}, audio_array)")
|
230 |
return (sample_rate, audio_data)
|
|
|
264 |
def generate_music_batch(text_prompt, duration=10, guidance_scale=3.0, model_mode="compressed"):
|
265 |
try:
|
266 |
cache_key = get_cache_key(text_prompt, duration, guidance_scale)
|
267 |
+
|
268 |
generator, processor = load_model()
|
269 |
model_name = "Compressed (S)"
|
270 |
|
|
|
300 |
|
301 |
audio_variants = []
|
302 |
sample_rate = outputs[0]['sampling_rate']
|
303 |
+
|
304 |
for i, output in enumerate(outputs):
|
305 |
audio_data = output['audio']
|
306 |
+
|
307 |
+
print(f"[GENERATION] Processing variant {i + 1} audio shape: {audio_data.shape}")
|
308 |
+
|
309 |
if hasattr(audio_data, 'cpu'):
|
310 |
audio_data = audio_data.cpu().numpy()
|
311 |
|
312 |
if len(audio_data.shape) == 3:
|
313 |
audio_data = audio_data[0]
|
314 |
+
|
315 |
if len(audio_data.shape) == 2:
|
316 |
if audio_data.shape[0] < audio_data.shape[1]:
|
317 |
audio_data = audio_data.T
|
|
|
319 |
audio_data = audio_data[:, 0]
|
320 |
else:
|
321 |
audio_data = audio_data.flatten()
|
322 |
+
|
323 |
audio_data = audio_data.flatten()
|
324 |
+
|
325 |
max_val = np.max(np.abs(audio_data))
|
326 |
if max_val > 0:
|
327 |
audio_data = audio_data / max_val * 0.95
|
328 |
+
|
329 |
audio_data = (audio_data * 32767).astype(np.int16)
|
330 |
audio_variants.append((sample_rate, audio_data))
|
331 |
+
|
332 |
+
print(f"[GENERATION] Variant {i + 1} final shape: {audio_data.shape}")
|
333 |
|
334 |
comparison_message = ""
|
335 |
+
|
336 |
if cache_key in original_time_cache:
|
337 |
original_time = original_time_cache[cache_key]
|
338 |
cost_info = calculate_cost_savings(generation_time, original_time)
|
339 |
+
|
340 |
comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
|
341 |
print(f"[COST] Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%)")
|
342 |
else:
|
343 |
try:
|
344 |
print(f"[TIMING] Measuring original model speed for comparison...")
|
345 |
original_generator, original_processor = load_original_model()
|
346 |
+
|
347 |
original_start = time.time()
|
348 |
original_outputs = original_generator(
|
349 |
prompts,
|
|
|
351 |
generate_kwargs=generation_params
|
352 |
)
|
353 |
original_time = time.time() - original_start
|
354 |
+
|
355 |
original_time_cache[cache_key] = original_time
|
356 |
+
|
357 |
cost_info = calculate_cost_savings(generation_time, original_time)
|
358 |
comparison_message = f"💰 Cost Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%) - Compressed: ${cost_info['compressed_cost']:.4f} vs Original: ${cost_info['original_cost']:.4f}"
|
359 |
+
print(
|
360 |
+
f"[COST] First comparison - Savings: ${cost_info['savings']:.4f} ({cost_info['savings_percent']:.1f}%)")
|
361 |
print(f"[TIMING] Original: {original_time:.2f}s, Compressed: {generation_time:.2f}s")
|
362 |
+
|
363 |
del original_generator, original_processor
|
364 |
cleanup_gpu()
|
365 |
print(f"[CLEANUP] Original model cleaned up after timing measurement")
|
366 |
+
|
367 |
except Exception as e:
|
368 |
print(f"[WARNING] Could not measure original timing: {e}")
|
369 |
compressed_cost = calculate_generation_cost(generation_time, 'S')
|
370 |
comparison_message = f"💸 Compressed Cost: ${compressed_cost:.4f} (could not compare with original)"
|
371 |
|
372 |
generation_info = f"✅ Generated 4 variants in {generation_time:.2f}s\n{comparison_message}"
|
373 |
+
|
374 |
return audio_variants[0], audio_variants[1], audio_variants[2], audio_variants[3], generation_info
|
375 |
|
376 |
except Exception as e:
|
|
|
382 |
|
383 |
with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
|
384 |
gr.Markdown("# 🎵 MusicGen Large Music Generator")
|
385 |
+
gr.Markdown(
|
386 |
+
"Generate music from text descriptions using Facebook's MusicGen Large model accelerated by TheStage for 2.3x faster performance")
|
387 |
|
388 |
with gr.Row():
|
389 |
with gr.Column():
|
|
|
393 |
lines=3,
|
394 |
value="A groovy funk bassline with a tight drum beat"
|
395 |
)
|
396 |
+
|
397 |
with gr.Row():
|
398 |
duration = gr.Slider(
|
399 |
minimum=5,
|
|
|
411 |
info="Higher values follow prompt more closely"
|
412 |
)
|
413 |
|
414 |
+
generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg")
|
415 |
|
416 |
with gr.Column():
|
417 |
generation_info = gr.Markdown("Ready to generate music variants with cost comparison vs original model")
|
418 |
+
|
419 |
with gr.Row():
|
420 |
audio_output1 = gr.Audio(label="Variant 1", type="numpy")
|
421 |
audio_output2 = gr.Audio(label="Variant 2", type="numpy")
|
422 |
+
|
423 |
with gr.Row():
|
424 |
audio_output3 = gr.Audio(label="Variant 3", type="numpy")
|
425 |
audio_output4 = gr.Audio(label="Variant 4", type="numpy")
|
|
|
432 |
- Duration is limited to 30 seconds for faster generation
|
433 |
""")
|
434 |
|
435 |
+
|
436 |
def generate_simple(text_prompt, duration, guidance_scale):
|
437 |
return generate_music_batch(text_prompt, duration, guidance_scale, "compressed")
|
438 |
|
439 |
+
|
440 |
generate_btn.click(
|
441 |
fn=generate_simple,
|
442 |
inputs=[text_input, duration, guidance_scale],
|