import gradio as gr
import torch
import soundfile as sf
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Load the MusicGen checkpoint with its dedicated generation class and processor
model_name = "facebook/musicgen-small"
model = MusicgenForConditionalGeneration.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# Define a function to generate audio from text
def generate_audio(text):
    # Tokenize the input text
    inputs = processor(text=[text], padding=True, return_tensors="pt")

    # Generate audio tokens and decode them to a waveform
    with torch.no_grad():
        generated_audio = model.generate(**inputs, max_new_tokens=256)

    # Output shape is (batch, channels, samples); take the first sample's mono channel
    audio_data = generated_audio[0, 0].cpu().numpy()

    # Save the waveform at the model's native sampling rate (32 kHz for musicgen-small)
    sampling_rate = model.config.audio_encoder.sampling_rate
    audio_path = "/tmp/generated_audio.wav"
    sf.write(audio_path, audio_data, sampling_rate)

    return audio_path

# Set up the Gradio interface
iface = gr.Interface(
    fn=generate_audio,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Text-to-Audio Chatbot",
    description="Enter a text prompt and get a music clip generated by the MusicGen model."
)

iface.launch()