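"""Gradio demo: animate an uploaded image into a short video with I2VGen-XL,
generate a soundtrack with MusicGen, and mux the two into one MP4 via ffmpeg.

Requires gradio, torch, diffusers, transformers, moviepy, scipy, Pillow, and
ffmpeg-python (plus an ffmpeg binary on the PATH).
"""
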
import gradio as gr
import torch
import numpy as np
from diffusers import I2VGenXLPipeline
from transformers import MusicgenForConditionalGeneration, AutoProcessor
from PIL import Image
from moviepy.editor import ImageSequenceClip
import ffmpeg
import scipy.io.wavfile

def generate_video(image, prompt, negative_prompt, video_length):
    generator = torch.manual_seed(8888)

    # Prefer Apple's Metal backend (MPS) when available; otherwise run on CPU
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the image-to-video pipeline in float32, the safest dtype on MPS/CPU
    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
    pipeline.to(device)

    # Generate every frame in a single pipeline call. Calling the pipeline once
    # per frame with the same seed would produce identical frames; one call with
    # num_frames set yields a temporally coherent clip.
    fps = 16  # modest frame rate keeps the single diffusion call tractable
    result = pipeline(
        prompt=prompt,
        image=image,
        num_inference_steps=5,  # intentionally low for speed; raise for quality
        negative_prompt=negative_prompt,
        guidance_scale=9.0,
        generator=generator,
        num_frames=int(video_length) * fps,
    )

    # The pipeline returns a batch of videos; frames[0] is the list of PIL
    # frames for our single prompt. MoviePy expects numpy arrays.
    frames = [np.array(frame) for frame in result.frames[0]]

    # Write the frames to an H.264 video file
    output_file = "output_video.mp4"
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(output_file, codec='libx264', audio=False)

    return output_file

def generate_music(prompt, unconditional=False):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Generate music; 256 new tokens is roughly five seconds of audio at
    # MusicGen's 50 Hz token rate
    if unconditional:
        unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
        audio_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=256)
    else:
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        inputs = processor(
            text=prompt,
            padding=True,
            return_tensors="pt",
        )
        audio_values = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=256)

    sampling_rate = model.config.audio_encoder.sampling_rate
    audio_file = "musicgen_out.wav"
    # audio_values has shape (batch, channels, samples); scipy expects a 1-D
    # array of samples for a mono WAV, hence the [0, 0] indexing
    scipy.io.wavfile.write(audio_file, sampling_rate, audio_values[0, 0].cpu().numpy())

    return audio_file

def combine_audio_video(audio_file, video_file):
    output_file = "combined_output.mp4"
    audio = ffmpeg.input(audio_file)
    video = ffmpeg.input(video_file)
    # Copy the video stream as-is, encode the audio to AAC, and stop at the
    # shorter of the two streams so their lengths match
    output = ffmpeg.output(video, audio, output_file, vcodec='copy', acodec='aac', shortest=None)
    ffmpeg.run(output, overwrite_output=True)
    return output_file
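# For reference, the graph above compiles to roughly this ffmpeg invocation
# (the exact input order may vary):
#   ffmpeg -i output_video.mp4 -i musicgen_out.wav -c:v copy -c:a aac -shortest combined_output.mp4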

# Gradio interface
def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
    # Convert the uploaded image path to a PIL Image
    image = Image.open(image_path)
    
    # Generate the silent video clip
    video_file = generate_video(image, prompt, negative_prompt, video_length)
    
    # Generate music
    audio_file = generate_music(music_prompt, unconditional)
    
    # Combine audio and video
    combined_file = combine_audio_video(audio_file, video_file)
    
    return combined_file

# Create Gradio Blocks
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video and Music Generation")
    
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        prompt_input = gr.Textbox(label="Enter the Video Prompt")
        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
        video_length_input = gr.Number(label="Video Length (seconds)", value=10, precision=0)
        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")

    generate_button = gr.Button("Generate Video and Music")
    output_video = gr.Video(label="Output Video with Sound")

    # Define the button action
    generate_button.click(
        interface,
        inputs=[image_input, prompt_input, negative_prompt_input, video_length_input, music_prompt_input, unconditional_checkbox],
        outputs=output_video,
        show_progress=True  # Show progress bar
    )

# Launch the Gradio app (serves on http://127.0.0.1:7860 by default)
if __name__ == "__main__":
    demo.launch()