ruslanmv committed on
Commit 7f0496b · verified · 1 Parent(s): deebc86

Update app.py

Files changed (1)
  1. app.py +203 -220
app.py CHANGED
@@ -1,297 +1,280 @@
  import gradio as gr
  import torch
- import moviepy.editor as mpe
- from PIL import Image, ImageDraw, ImageFont
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  from min_dalle import MinDalle
  from gtts import gTTS
  from pydub import AudioSegment
- import nltk
- import textwrap
  import os
  import glob
  import subprocess
- import imageio_ffmpeg
-
-
- # Define a fallback for environments without GPU
- if os.environ.get("SPACES_ZERO_GPU") is not None:
-     import spaces
- else:
-     class spaces:
-         @staticmethod
-         def GPU(func):
-             def wrapper(*args, **kwargs):
-                 return func(*args, **kwargs)
-             return wrapper
-
- # Ensure 'punkt' is downloaded for nltk
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

- # Download FFmpeg using imageio_ffmpeg (more robust)
- try:
-     imageio_ffmpeg.get_ffmpeg_exe()
-     print("FFmpeg downloaded successfully (if not already present).")
- except Exception as e:
-     print(f"Error downloading FFmpeg using imageio_ffmpeg: {e}")
-     raise
-
- description = "Video Story Generator with Audio \n PS: Generation of video by using Artificial Intelligence by dalle-mini and distilbart and gtss "
- title = "Video Story Generator with Audio by using dalle-mini and distilbart and gtss "

  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- model.to(device)
  print(f"Using device: {device}")

- def get_output_video(text):
-     print("Starting get_output_video function...")
-     inputs = tokenizer(text,
-                        max_length=1024,
-                        truncation=True,
-                        return_tensors="pt").to(device)
-     summary_ids = model.generate(inputs["input_ids"])
-     summary = tokenizer.batch_decode(summary_ids,
-                                      skip_special_tokens=True,
-                                      clean_up_tokenization_spaces=False)
-     plot = list(summary[0].split('.'))
-     print(f"Summarized plot: {plot}")
-
-     '''
-     The required models will be downloaded to models_root if they are not already there.
-     Set the dtype to torch.float16 to save GPU memory.
-     If you have an Ampere architecture GPU you can use torch.bfloat16.
-     Set the device to either "cuda" or "cpu". Once everything has finished initializing,
-     float32 is faster than float16 but uses more GPU memory.
-     '''
-
-     #@spaces.GPU(duration=60 * 3)
-     def generate_image(
-         is_mega: bool,
-         text: str,
-         seed: int,
-         grid_size: int,
-         top_k: int,
-         image_path: str,
-         models_root: str,
-         fp16: bool,
-     ):
-         print(f"Generating image for: {text}")
-         model = MinDalle(
-             is_mega=is_mega,
-             models_root=models_root,
-             is_reusable=True,
-             is_verbose=True,
-             dtype=torch.float16 if fp16 else torch.float32,  # ensures correct data type
-             device=device
-         )

-         # Ensure correct dtype for inputs
          image = model.generate_image(
              text,
              seed,
              grid_size,
-             top_k=top_k,
-             is_verbose=True
          )
-         print(f"Image generated successfully.")
-         return image

      generated_images = []
      for i, senten in enumerate(plot[:-1]):
-         print(f"Generating image {i+1} of {len(plot)-1}...")
-         try:
-             image = generate_image(
-                 is_mega=True,
-                 text=senten,
-                 seed=1,
-                 grid_size=1,  # param {type:"integer"}
-                 top_k=256,  # param {type:"integer"}
-                 image_path='generated',
-                 models_root='pretrained',
-                 fp16=True,
-             )
-             generated_images.append(image)
-             print(f"Image {i+1} generated and appended.")
-         except Exception as e:
-             print(f"Error generating image {i+1}: {e}")
-             raise
-
-     # Step 4- Creation of the subtitles
      sentences = plot[:-1]
-     num_sentences = len(sentences)
-     assert len(generated_images) == len(sentences), print('Something is wrong')
-
-     # We can generate our list of subtitles
-     from nltk import tokenize
-     c = 0
-     sub_names = []
-     for k in range(len(generated_images)):
-         subtitles = tokenize.sent_tokenize(sentences[k])
-         sub_names.append(subtitles)
-         print(f"Subtitles generated for image {k+1}: {subtitles}")
-
-     # Step 5- Adding Subtitles to the Images
      def draw_multiple_line_text(image, text, font, text_color, text_start_height):
          draw = ImageDraw.Draw(image)
          image_width, image_height = image.size
          y_text = text_start_height
          lines = textwrap.wrap(text, width=40)
          for line in lines:
-             line_width, line_height = font.getbbox(line)[2:4]  # Use getbbox for better size calculation
              draw.text(((image_width - line_width) / 2, y_text),
                        line, font=font, fill=text_color)
              y_text += line_height

      def add_text_to_img(text1, image_input):
-         '''
-         Testing draw_multiple_line_text
-         '''
-         image = image_input
-         fontsize = 20  # Increased font size
          path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
          if not os.path.exists(path_font):
-             # Try alternative location on different systems
              path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-             if not os.path.exists(path_font):
-                 print("Font file not found. Subtitles might not be rendered correctly.")
-                 path_font = None
-         if path_font is not None:
-             try:
-                 font = ImageFont.truetype(path_font, fontsize)
-                 text_color = (255, 255, 0)
-                 text_start_height = 200
-                 draw_multiple_line_text(image, text1, font, text_color, text_start_height)
-             except Exception as e:
-                 print(f"Error loading or using font: {e}")
-
-         return image

      generated_images_sub = []
-     for k in range(len(generated_images)):
-         imagenes = generated_images[k].copy()
          text_to_add = sub_names[k][0]
-         result = add_text_to_img(text_to_add, imagenes)
          generated_images_sub.append(result)
-         print(f"Subtitles added to image {k+1}.")

-     # Step 7 - Creation of audio
-     c = 0
      mp3_names = []
      mp3_lengths = []
-     for k in range(len(generated_images)):
-         text_to_add = sub_names[k][0]
-         print(f"Generating audio for: {text_to_add}")
-         f_name = 'audio_' + str(c) + '.mp3'
          mp3_names.append(f_name)
-         # The text that you want to convert to audio
-         mytext = text_to_add
-         # Language in which you want to convert
-         language = 'en'
-         # Passing the text and language to the engine,
-         # here we have marked slow=False. Which tells
-         # the module that the converted audio should
-         # have a high speed
-         myobj = gTTS(text=mytext, lang=language, slow=False)
-         # Saving the converted audio in a mp3 file named
-         sound_file = f_name
-         myobj.save(sound_file)
-         audio = AudioSegment.from_file(sound_file, format="mp3")
-         duration = len(audio) / 1000
-         mp3_lengths.append(duration)
-         print(f"Audio duration: {duration} seconds")
-         c += 1
-
-     # Step 8 - Merge audio files
-     cwd = os.getcwd().replace(chr(92), '/')
-     export_path = 'result.mp3'
-     silence = AudioSegment.silent(duration=500)
-     full_audio = AudioSegment.empty()
-     for n, mp3_file in enumerate(mp3_names):
-         mp3_file = mp3_file.replace(chr(92), '/')
-         print(f"Merging audio file: {mp3_file}")
-         # Load the current mp3 into `audio_segment`
-         audio_segment = AudioSegment.from_mp3(mp3_file)
-         # Just accumulate the new `audio_segment` + `silence`
-         full_audio += audio_segment + silence
-         print(f'Merging audio {n+1} completed.')
-     # The loop will exit once all files in the list have been used
-     # Then export
-     full_audio.export(export_path, format='mp3')
-     print('\nAudio merging done!')
-
-     # Step 9 - Creation of the video with adjusted times of the sound
-     c = 0
-     file_names = []
-     for img in generated_images_sub:
-         f_name = 'img_' + str(c) + '.jpg'
-         file_names.append(f_name)
-         img.save(f_name)
-         print(f"Saving image: {f_name}")
-         c += 1
-     print(f"Image file names: {file_names}")

      clips = []
-     d = 0
-     for m in file_names:
-         duration = mp3_lengths[d]
-         print(f"Creating video clip {d+1} with duration: {duration} seconds")
-         clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
-         d += 1
-     concat_clip = mpe.concatenate_videoclips(clips, method="compose")
-     concat_clip.write_videofile("result_new.mp4", fps=24)
-     print("Video clips concatenated and saved as result_new.mp4")
-
-     # Step 10 - Merge Video + Audio
-     movie_name = 'result_new.mp4'
-     export_path = 'result.mp3'
      movie_final = 'result_final.mp4'

      def combine_audio(vidname, audname, outname, fps=24):
-         my_clip = mpe.VideoFileClip(vidname)
-         audio_background = mpe.AudioFileClip(audname)
          final_clip = my_clip.set_audio(audio_background)
          final_clip.write_videofile(outname, fps=fps)

-     combine_audio(movie_name, export_path, movie_final)  # create a new file
-     print("Video and audio merged successfully!")

-     # Cleanup intermediate files
-     for f in file_names:
-         os.remove(f)
-     for f in mp3_names:
-         os.remove(f)
-     os.remove("result_new.mp4")
      os.remove("result.mp3")
-     print("Intermediate files cleaned up.")

-     print("Finished get_output_video function.")
      return 'result_final.mp4'

  text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'
- demo = gr.Blocks()

  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
-     gr.Markdown(
-         "A story can be input by user. The story is summarized using DistillBART model. Then, then it is generated the images by using Dalle-mini and created the subtitles and audio gtts. These are generated as a video.")
      with gr.Row():
-         # Left column (inputs)
          with gr.Column():
-             input_start_text = gr.Textbox(value=text,
-                                           label="Type your story here, for now a sample story is added already!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
-         # Right column (outputs)
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
              gr.Markdown("<h3>Future Works </h3>")
-             gr.Markdown(
-                 "This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

- demo.launch(debug=True)

+ import moviepy.editor as mpy
+ from PIL import Image
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import gradio as gr
  import torch
  from min_dalle import MinDalle
+ from huggingface_hub import snapshot_download
+ from PIL import Image, ImageDraw, ImageFont
+ import textwrap
+ from mutagen.mp3 import MP3
  from gtts import gTTS
  from pydub import AudioSegment
  import os
  import glob
+ import nltk
  import subprocess
+ import shutil
+ import matplotlib.pyplot as plt
+ import gc  # Import the garbage collector
+ from audio import *
+ # Download necessary NLTK data
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

+ description = "Video Story Generator with Audio \n PS: Generation of video by using Artificial Intelligence by dalle-mini and distilbart and gTTS "
+ title = "Video Story Generator with Audio by using dalle-mini and distilbart and gTTS "

+ # Load tokenizer and model for text summarization
  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+
+ # Check for CUDA availability and set device
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")
+ model.to(device)

+ # Function to log GPU memory (optional, for debugging)
+ def log_gpu_memory():
+     if torch.cuda.is_available():
+         print(subprocess.check_output('nvidia-smi').decode('utf-8'))
+     else:
+         print("CUDA is not available. Cannot log GPU memory.")
+
+ # --------- MinDalle Image Generation Functions ---------
+
+ # Load MinDalle model once
+ def load_min_dalle_model(models_root: str = 'pretrained', fp16: bool = True):
+     """
+     Load the MinDalle model.
+
+     Args:
+         models_root: Path to the directory containing MinDalle models.
+         fp16: Whether to use float16 for faster generation (requires CUDA).
+
+     Returns:
+         An instance of the MinDalle model.
+     """
+     print("DEBUG: Loading MinDalle model...")
+     return MinDalle(
+         is_mega=True,
+         models_root=models_root,
+         is_reusable=False,  # Set is_reusable to False
+         is_verbose=True,
+         dtype=torch.float16 if fp16 else torch.float32,
+         device=device
+     )
+
+ # Initialize the MinDalle model
+ min_dalle_model = load_min_dalle_model()
+
+
+
+ def generate_image_with_min_dalle(
+     model: MinDalle,
+     text: str,
+     seed: int = -1,
+     grid_size: int = 1
+ ):
+     """
+     Generates an image from text using MinDalle.
+
+     Args:
+         model: The preloaded MinDalle model.
+         text: The text prompt to generate the image from.
+         seed: The random seed for image generation. -1 for random.
+         grid_size: The grid size for multiple image generation.
+
+     Returns:
+         A PIL Image object.
+     """
+     print(f"DEBUG: Generating image with MinDalle for text: '{text}'")
+     model.is_reusable = False
+     with torch.no_grad():
          image = model.generate_image(
              text,
              seed,
              grid_size,
+             is_verbose=False
          )

+     # Clear GPU memory after generation
+     torch.cuda.empty_cache()
+     gc.collect()
+
+     print("DEBUG: Image generated successfully.")
+     return image
+
+
+ # --------- End of MinDalle Functions ---------
+ # Merge audio files
+
+ from pydub import AudioSegment
+ import os
+
+
+
+
+
+ # Function to generate video from text
+ def get_output_video(text):
+     print("DEBUG: Starting get_output_video function...")
+
+     # Summarize the input text
+     print("DEBUG: Summarizing text...")
+     inputs = tokenizer(
+         text,
+         max_length=1024,
+         truncation=True,
+         return_tensors="pt"
+     ).to(device)
+     summary_ids = model.generate(inputs["input_ids"])
+     summary = tokenizer.batch_decode(
+         summary_ids,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False
+     )
+     plot = list(summary[0].split('.'))
+     print(f"DEBUG: Summary generated: {plot}")
+
+     # Generate images for each sentence in the plot
      generated_images = []
      for i, senten in enumerate(plot[:-1]):
+         print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
+         image_dir = f"image_{i}"
+         os.makedirs(image_dir, exist_ok=True)
+
+         min_dalle_model = load_min_dalle_model()
+
+         image = generate_image_with_min_dalle(
+             min_dalle_model,
+             text=senten,
+             seed=1,
+             grid_size=1
+         )
+         generated_images.append(image)
+         image_path = os.path.join(image_dir, "generated_image.png")
+         image.save(image_path)
+         print(f"DEBUG: Image generated and saved to {image_path}")
+
+         del min_dalle_model
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     # Create subtitles from the plot
      sentences = plot[:-1]
+     print("DEBUG: Creating subtitles...")
+     assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
+     sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
+
+     # Add subtitles to images
      def draw_multiple_line_text(image, text, font, text_color, text_start_height):
          draw = ImageDraw.Draw(image)
          image_width, image_height = image.size
          y_text = text_start_height
          lines = textwrap.wrap(text, width=40)
          for line in lines:
+             line_width, line_height = font.getbbox(line)[2:]
              draw.text(((image_width - line_width) / 2, y_text),
                        line, font=font, fill=text_color)
              y_text += line_height

      def add_text_to_img(text1, image_input):
+         print(f"DEBUG: Adding text to image: '{text1}'")
+         fontsize = 30
          path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
          if not os.path.exists(path_font):
              path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
+
+         font = ImageFont.truetype(path_font, fontsize)
+         text_color = (255, 255, 0)
+         text_start_height = image_input.height - (fontsize * len(textwrap.wrap(text1, width=40))) - 20
+         draw_multiple_line_text(image_input, text1, font, text_color, text_start_height)
+         return image_input

      generated_images_sub = []
+     for k, image in enumerate(generated_images):
          text_to_add = sub_names[k][0]
+         result = add_text_to_img(text_to_add, image.copy())
          generated_images_sub.append(result)
+         result.save(f"image_{k}/generated_image_with_subtitles.png")

+     # Generate audio for each subtitle
      mp3_names = []
      mp3_lengths = []
+     for k, text_to_add in enumerate(sub_names):
+         print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
+         f_name = f'audio_{k}.mp3'
          mp3_names.append(f_name)
+         myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
+         myobj.save(f_name)
+         audio = MP3(f_name)
+         mp3_lengths.append(audio.info.length)
+         print(f"DEBUG: Audio duration: {audio.info.length} seconds")

+     # Merge audio files
+     export_path = merge_audio_files(mp3_names)
+
+     # Create video clips from images
      clips = []
+     for k, img in enumerate(generated_images_sub):
+         duration = mp3_lengths[k]
+         print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
+         clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
+         clips.append(clip)
+
+     # Concatenate video clips
+     print("DEBUG: Concatenating video clips...")
+     concat_clip = mpy.concatenate_videoclips(clips, method="compose")
+     concat_clip.write_videofile("result_no_audio.mp4", fps=24)
+
+     # Combine video and audio
+     movie_name = 'result_no_audio.mp4'
      movie_final = 'result_final.mp4'

      def combine_audio(vidname, audname, outname, fps=24):
+         print(f"DEBUG: Combining audio for video: '{vidname}'")
+         my_clip = mpy.VideoFileClip(vidname)
+         audio_background = mpy.AudioFileClip(audname)
          final_clip = my_clip.set_audio(audio_background)
          final_clip.write_videofile(outname, fps=fps)

+     combine_audio(movie_name, export_path, movie_final)

+     # Clean up
+     print("DEBUG: Cleaning up files...")
+     for i in range(len(generated_images_sub)):
+         shutil.rmtree(f"image_{i}")
+         os.remove(f"audio_{i}.mp3")
      os.remove("result.mp3")
+     os.remove("result_no_audio.mp4")

+     print("DEBUG: Cleanup complete.")
+     print("DEBUG: get_output_video function completed successfully.")
      return 'result_final.mp4'

+
+
+ # Example text (can be changed by user in Gradio interface)
  text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'

+ # Create Gradio interface
+ demo = gr.Blocks()
  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
+     gr.Markdown("A story can be input by user. The story is summarized using DistilBART model. Then, the images are generated by using Dalle-mini, and the subtitles and audio are created using gTTS. These are combined to generate a video.")
      with gr.Row():
          with gr.Column():
              input_start_text = gr.Textbox(value=text, label="Type your story here, for now a sample story is added already!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
              gr.Markdown("<h3>Future Works </h3>")
+             gr.Markdown("This program is a text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2. For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

+ # Launch the Gradio app
+ demo.launch(debug=True, share=False)
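
Note: the updated app.py imports a local audio module (`from audio import *`) and calls `merge_audio_files(mp3_names)`, which is not part of this diff. Judging by the merging logic removed from the old version (pydub concatenation with 500 ms of silence between clips, exported to result.mp3, which the cleanup step still deletes), the helper presumably looks roughly like the sketch below; the signature and default arguments here are assumptions, not the committed code.

# Hypothetical sketch of audio.py (not included in this commit).
# Assumed: merge_audio_files takes the list of per-sentence MP3 paths and
# returns the path of the merged file, mirroring the pydub logic removed above.
from pydub import AudioSegment

def merge_audio_files(mp3_names, export_path='result.mp3', gap_ms=500):
    """Concatenate the MP3 files with a short silence between them and export the result."""
    silence = AudioSegment.silent(duration=gap_ms)
    full_audio = AudioSegment.empty()
    for mp3_file in mp3_names:
        full_audio += AudioSegment.from_mp3(mp3_file) + silence
    full_audio.export(export_path, format='mp3')
    return export_path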