Spaces:

TheStageAI
/

Elastic-musicgen-large

Running on L40S

App Files Files Community

quazim committed on Jun 27

Commit

836dde3

1 Parent(s): 9c28790

updated

Browse files

Files changed (1) hide show

app.py +155 -153

app.py CHANGED Viewed

@@ -1,134 +1,139 @@
 import gradio as gr
 import torch
-from transformers import AutoProcessor
-from elastic_models.transformers import MusicgenForConditionalGeneration
-import scipy.io.wavfile
 import numpy as np
-import subprocess
-import sys
 import os
-# def setup_flash_attention():
-#     """One-time setup for flash-attention with special flags"""
-#     # Check if flash-attn is already installed
-#     try:
-#         import flash_attn
-#         print("flash-attn already installed")
-#         return
-#     except ImportError:
-#         pass
-#     # Check if we've already tried to install it in this session
-#     if os.path.exists("/tmp/flash_attn_installed"):
-#         return
-#     try:
-#         print("Installing flash-attn with --no-build-isolation...")
-#         subprocess.run([
-#             sys.executable, "-m", "pip", "install",
-#             "flash-attn==2.7.3", "--no-build-isolation"
-#         ], check=True)
-#         # Uninstall apex if it exists
-#         subprocess.run([
-#             sys.executable, "-m", "pip", "uninstall", "apex", "-y"
-#         ], check=False)  # Don't fail if apex isn't installed
-#         # Mark as installed
-#         with open("/tmp/flash_attn_installed", "w") as f:
-#             f.write("installed")
-#         print("flash-attn installation completed")
-#     except subprocess.CalledProcessError as e:
-#         print(f"Warning: Failed to install flash-attn: {e}")
-#         # Continue anyway - the model might work without it
-# Run setup once when the module is imported
-# setup_flash_attention()
-# Load model and processor
-# @gr.cache()
-# def load_model():
-#     """Load the musicgen model and processor"""
-#     processor = AutoProcessor.from_pretrained("facebook/musicgen-large")
-#     model = MusicgenForConditionalGeneration.from_pretrained(
-#                 "facebook/musicgen-large",
-#                 torch_dtype=torch.float16,
-#                 device="cuda",
-#                 mode="S",
-#                 __paged=True,
-#             )
-#     return processor, model
_processor, _model = None, None


def load_model():
    """Return the cached MusicGen processor and model, loading them on first use.

    Both objects live in module-level globals so that the checkpoint is
    only loaded once per process; later calls hand back the same objects.

    Returns:
        tuple: ``(processor, model)``.
    """
    global _processor, _model

    # Fast path: an earlier call already populated the cache.
    if _model is not None:
        return _processor, _model

    print("Initial model loading...")
    checkpoint = "facebook/musicgen-large"
    _processor = AutoProcessor.from_pretrained(checkpoint)

    load_options = {
        "torch_dtype": torch.float16,
        "device": "cuda",
        "mode": "S",
        "__paged": True,
    }
    _model = MusicgenForConditionalGeneration.from_pretrained(checkpoint, **load_options)
    _model.eval()
    return _processor, _model
def generate_music(text_prompt, duration=10, temperature=1.0, top_k=250, top_p=0.0):
    """Generate music based on text prompt.

    Args:
        text_prompt: Text description of the music to generate.
        duration: Requested clip length in seconds; converted to a token
            budget at roughly 50 tokens per second.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        top_p: Top-p sampling cutoff forwarded to ``model.generate``.

    Returns:
        tuple | None: ``(sample_rate, audio_data)`` where ``audio_data`` is a
        float32 NumPy array, peak-normalised unless the clip is silent.
        ``None`` when anything in the pipeline raises.
    """
    try:
        processor, model = load_model()

        # Process the text prompt
        print("Processor start")
        inputs = processor(
            text=[text_prompt],
            padding=True,
            return_tensors="pt",
        ).to("cuda")
        print("Processor end")
        print(inputs)

        # Generate audio
        with torch.no_grad():
            audio_values = model.generate(
                **inputs,
                # Approximate tokens per second; cast so a float duration
                # never reaches generate() as a non-integer token count.
                max_new_tokens=int(duration * 50),
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                cache_implementation="paged",
            )

        audio_data = audio_values[0, 0].cpu().numpy().astype(np.float32)
        # NOTE(review): assumes the model config exposes `sample_rate`
        # directly - confirm against the model class in use.
        sample_rate = model.config.sample_rate

        # Normalize audio. A silent clip has a peak of zero; dividing by it
        # would fill the array with NaN, so leave such clips untouched.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak
        return sample_rate, audio_data
    except Exception as e:
        # Top-level UI boundary: report the failure and signal "no audio".
        print(f"Error: {str(e)}")
        return None
-# Create Gradio interface
-with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
     gr.Markdown("# 🎵 MusicGen Large Music Generator")
-    gr.Markdown("Generate music from text descriptions using Facebook's MusicGen Large model.")
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
                 label="Music Description",
-                placeholder="Enter a description of the music you want to generate (e.g., 'upbeat jazz with piano and drums')",
-                lines=3
             )
             with gr.Row():
                 duration = gr.Slider(
                     minimum=5,
@@ -137,66 +142,63 @@ with gr.Blocks(title="MusicGen Large - Music Generation") as demo:
                     step=1,
                     label="Duration (seconds)"
                 )
-                temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=2.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Temperature (creativity)"
-                )
-            with gr.Row():
-                top_k = gr.Slider(
-                    minimum=1,
-                    maximum=500,
-                    value=250,
-                    step=1,
-                    label="Top-k"
                 )
-                top_p = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.0,
-                    step=0.1,
-                    label="Top-p"
-                )
-            generate_btn = gr.Button("🎵 Generate Music", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(
                 label="Generated Music",
                 type="numpy"
             )
-            gr.Markdown("### Tips:")
-            gr.Markdown("""
-            - Be specific in your descriptions (e.g., "slow blues guitar with harmonica")
-            - Higher temperature = more creative/random results
-            - Lower temperature = more predictable results
-            - Duration is limited to 30 seconds for faster generation
-            """)
-    # Example prompts
     gr.Examples(
         examples=[
-            ["upbeat jazz with piano and drums"],
-            ["relaxing acoustic guitar melody"],
-            ["electronic dance music with heavy bass"],
-            ["classical violin concerto"],
-            ["reggae with steel drums and bass"],
-            ["rock ballad with electric guitar solo"],
         ],
-        inputs=text_input,
         label="Example Prompts"
     )
-    # Connect the generate button to the function
-    generate_btn.click(
-        fn=generate_music,
-        inputs=[text_input, duration, temperature, top_k, top_p],
-        outputs=audio_output
-    )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import torch
+import gc
 import numpy as np
+import random
+from transformers import AutoProcessor, pipeline
 import os
+os.environ['ELASTIC_LOG_LEVEL'] = 'DEBUG'
+from elastic_models.transformers import MusicgenForConditionalGeneration
def set_seed(seed: int = 42):
    """Seed every random number generator used by the app.

    Applies the same seed to Python's ``random`` module, NumPy's global
    generator and PyTorch (the default generator plus the CUDA seeding
    calls), then sets ``cudnn.deterministic = True`` and
    ``cudnn.benchmark = False``.

    Args:
        seed: Value handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
def cleanup_gpu():
    """Clean up GPU memory to avoid TensorRT conflicts.

    Does nothing when CUDA is unavailable. Otherwise runs a garbage
    collection pass, waits for outstanding CUDA work, and then releases
    the cached allocator blocks.
    """
    if not torch.cuda.is_available():
        return

    # Collect first: tensors that are only kept alive by reference cycles
    # must be destroyed before empty_cache() can hand their blocks back.
    gc.collect()
    # Let queued kernels finish so their buffers are no longer in use.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
_generator = None
_processor = None


def load_model():
    """Return the cached text-to-audio pipeline and processor.

    On the first call this loads the processor and the MusicGen model,
    wraps the model in a ``text-to-audio`` pipeline and stores both in
    module-level globals. Later calls return the stored objects.

    Returns:
        tuple: ``(generator, processor)``.
    """
    global _generator, _processor

    # Fast path: an earlier call already built the pipeline.
    if _generator is not None:
        return _generator, _processor

    print("[MODEL] Starting model initialization...")
    cleanup_gpu()

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"[MODEL] Using device: {device}")

    checkpoint = "facebook/musicgen-large"

    print("[MODEL] Loading processor...")
    _processor = AutoProcessor.from_pretrained(
        checkpoint,
        cache_dir="/mnt/fs/huggingface_cache/",
    )

    print("[MODEL] Loading model...")
    load_options = {
        "torch_dtype": torch.float16,
        "device": device,
        "mode": "S",
        "__paged": True,
    }
    musicgen = MusicgenForConditionalGeneration.from_pretrained(checkpoint, **load_options)
    musicgen.eval()

    print("[MODEL] Creating pipeline...")
    _generator = pipeline(
        task="text-to-audio",
        model=musicgen,
        tokenizer=_processor.tokenizer,
        device=device,
    )
    print("[MODEL] Model initialization completed successfully")
    return _generator, _processor
def calculate_max_tokens(duration_seconds, token_rate=50):
    """Convert a clip length in seconds into a generation token budget.

    Args:
        duration_seconds: Requested audio length in seconds (int or float).
        token_rate: Tokens generated per second of audio. Defaults to 50,
            the value that used to be hard-coded in this function.

    Returns:
        int: ``duration_seconds * token_rate`` truncated to an integer.
    """
    max_new_tokens = int(duration_seconds * token_rate)
    print(f"[MODEL] Duration: {duration_seconds}s -> Tokens: {max_new_tokens} (rate: {token_rate})")
    return max_new_tokens
def generate_music(text_prompt, duration=10, guidance_scale=3.0):
    """Generate music based on text prompt using pipeline.

    Args:
        text_prompt: Text description of the music to generate.
        duration: Requested clip length in seconds; converted to a token
            budget by ``calculate_max_tokens``.
        guidance_scale: Guidance scale forwarded to generation.

    Returns:
        tuple | None: ``(sample_rate, audio_data)`` with ``audio_data`` cast
        to float32 on success, or ``None`` when generation fails.
    """
    try:
        # Only the pipeline is needed here; the processor is unused.
        generator, _ = load_model()

        print("[GENERATION] Starting generation...")
        print(f"[GENERATION] Prompt: '{text_prompt}'")
        print(f"[GENERATION] Duration: {duration}s")
        print(f"[GENERATION] Guidance scale: {guidance_scale}")

        cleanup_gpu()
        # Fixed seed: the same prompt and settings reproduce the same clip.
        set_seed(42)

        max_new_tokens = calculate_max_tokens(duration)
        generation_params = {
            'do_sample': True,
            'guidance_scale': guidance_scale,
            'max_new_tokens': max_new_tokens,
            # Same value as the maximum, so the token count is fixed.
            'min_new_tokens': max_new_tokens,
            'cache_implementation': 'paged',
        }

        prompts = [text_prompt]
        outputs = generator(
            prompts,
            batch_size=1,
            generate_kwargs=generation_params,
        )
        print("[GENERATION] Generation completed successfully")

        output = outputs[0]
        audio_data = output['audio']
        sample_rate = output['sampling_rate']
        print(f"[GENERATION] Audio shape: {audio_data.shape}")
        print(f"[GENERATION] Sample rate: {sample_rate}")

        audio_data = audio_data.astype(np.float32)
        return sample_rate, audio_data
    except Exception as e:
        # Top-level UI boundary: report the failure and free GPU memory.
        print(f"[ERROR] Generation failed: {str(e)}")
        cleanup_gpu()
        # This function feeds a single gr.Audio output. A bare None means
        # "no audio"; a (None, None) tuple would instead be treated as a
        # (sample_rate, data) pair and fail while being encoded.
        return None
+with gr.Blocks(title="MusicGen Large - Music Generation", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎵 MusicGen Large Music Generator")
+    gr.Markdown("Generate music from text descriptions using Facebook's MusicGen Large model with elastic compression.")
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
                 label="Music Description",
+                placeholder="Enter a description of the music you want to generate",
+                lines=3,
+                value="A groovy funk bassline with a tight drum beat"
             )
             with gr.Row():
                 duration = gr.Slider(
                     minimum=5,
                     step=1,
                     label="Duration (seconds)"
                 )
+                guidance_scale = gr.Slider(
+                    minimum=1.0,
+                    maximum=10.0,
+                    value=3.0,
+                    step=0.5,
+                    label="Guidance Scale",
+                    info="Higher values follow prompt more closely"
                 )
+            generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg")
         with gr.Column():
             audio_output = gr.Audio(
                 label="Generated Music",
                 type="numpy"
             )
+            with gr.Accordion("Tips", open=False):
+                gr.Markdown("""
+                - Be specific in your descriptions (e.g., "slow blues guitar with harmonica")
+                - Higher guidance scale = follows prompt more closely
+                - Lower guidance scale = more creative/varied results
+                - Duration is limited to 30 seconds for faster generation
+                """)
+    generate_btn.click(
+        fn=generate_music,
+        inputs=[text_input, duration, guidance_scale],
+        outputs=audio_output
+    )
     gr.Examples(
         examples=[
+            ["A groovy funk bassline with a tight drum beat", 10, 3.0],
+            ["Relaxing acoustic guitar melody", 15, 3.0],
+            ["Electronic dance music with heavy bass", 10, 4.0],
+            ["Classical violin concerto", 20, 3.5],
+            ["Reggae with steel drums and bass", 12, 3.0],
+            ["Rock ballad with electric guitar solo", 15, 3.5],
+            ["Jazz piano improvisation with brushed drums", 18, 3.0],
+            ["Ambient synthwave with retro vibes", 25, 2.5],
         ],
+        inputs=[text_input, duration, guidance_scale],
         label="Example Prompts"
     )
+    gr.Markdown("---")
+    gr.Markdown("""
+    <div style="text-align: center; color: #666; font-size: 12px; margin-top: 2rem;">
+        <strong>Limitations:</strong><br>
+        • The model is not able to generate realistic vocals.<br>
+        • The model has been trained with English descriptions and will not perform as well in other languages.<br>
+        • The model does not perform equally well for all music styles and cultures.<br>
+        • The model sometimes generates end of songs, collapsing to silence.<br>
+        • It is sometimes difficult to assess what types of text descriptions provide the best generations. Prompt engineering may be required to obtain satisfying results.
+    </div>
+    """)
 if __name__ == "__main__":
     demo.launch()