import gradio as gr
import torch
import numpy as np
from diffusers import I2VGenXLPipeline
from transformers import MusicgenForConditionalGeneration, AutoProcessor
from PIL import Image
from moviepy.editor import ImageSequenceClip
import ffmpeg
import scipy.io.wavfile


def generate_video(image, prompt, negative_prompt, video_length):
    generator = torch.manual_seed(8888)

    # Set the device to CPU or a non-NVIDIA GPU
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the pipeline and move it to the selected device
    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
    pipeline.to(device)

    # Generate all frames in a single pipeline call; generating one frame per
    # call would not produce a temporally coherent clip.
    total_frames = int(video_length * 30)  # Assuming 30 frames per second
    result = pipeline(
        prompt=prompt,
        image=image,
        num_inference_steps=5,
        negative_prompt=negative_prompt,
        guidance_scale=9.0,
        generator=generator,
        num_frames=total_frames,
    )
    frames = [np.array(frame) for frame in result.frames[0]]

    # Create a video clip from the frames
    output_file = "output_video.mp4"
    clip = ImageSequenceClip(frames, fps=30)  # Set the frames per second
    clip.write_videofile(output_file, codec='libx264', audio=False)
    return output_file


def generate_music(prompt, unconditional=False):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Generate music
    if unconditional:
        unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
        audio_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=256)
    else:
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        inputs = processor(
            text=prompt,
            padding=True,
            return_tensors="pt",
        )
        audio_values = model.generate(
            **inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=256
        )

    # Save the generated audio (audio_values has shape [batch, channels, samples])
    sampling_rate = model.config.audio_encoder.sampling_rate
    audio_file = "musicgen_out.wav"
    scipy.io.wavfile.write(audio_file, sampling_rate, audio_values[0, 0].cpu().numpy())
    return audio_file


def combine_audio_video(audio_file, video_file):
    output_file = "combined_output.mp4"
    audio = ffmpeg.input(audio_file)
    video = ffmpeg.input(video_file)
    output = ffmpeg.output(video, audio, output_file, vcodec='copy', acodec='aac')
    ffmpeg.run(output, overwrite_output=True)
    return output_file


# Gradio interface
def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
    # Convert the uploaded image path to a PIL Image
    image = Image.open(image_path)

    # Generate the video
    video_file = generate_video(image, prompt, negative_prompt, video_length)

    # Generate the music
    audio_file = generate_music(music_prompt, unconditional)

    # Combine audio and video
    combined_file = combine_audio_video(audio_file, video_file)
    return combined_file


# Create Gradio Blocks
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video and Music Generation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        prompt_input = gr.Textbox(label="Enter the Video Prompt")
        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
        video_length_input = gr.Number(label="Video Length (seconds)", value=10, precision=0)
        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")
    generate_button = gr.Button("Generate Video and Music")
    output_video = gr.Video(label="Output Video with Sound")

    # Define the button action
    generate_button.click(
        interface,
        inputs=[
            image_input,
            prompt_input,
            negative_prompt_input,
            video_length_input,
            music_prompt_input,
            unconditional_checkbox,
        ],
        outputs=output_video,
        show_progress=True,  # Show progress bar
    )

# Launch the Gradio app
demo.launch()
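For reference, the script expects the libraries imported above to be installed, along with an FFmpeg binary on the PATH (used by both ffmpeg-python and MoviePy). The package list below is inferred from the imports, not taken from the original; pinning MoviePy below 2.0 is an assumption to keep the moviepy.editor import available.

pip install gradio torch numpy diffusers transformers "moviepy<2" ffmpeg-python scipy pillow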