ruslanmv committed on
Commit 7f0496b · verified · 1 Parent(s): deebc86

Update app.py

Files changed (1)
  1. app.py +203 -220
app.py CHANGED
@@ -1,297 +1,280 @@
  import gradio as gr
  import torch
- import moviepy.editor as mpe
- from PIL import Image, ImageDraw, ImageFont
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  from min_dalle import MinDalle
  from gtts import gTTS
  from pydub import AudioSegment
- import nltk
- import textwrap
  import os
  import glob
  import subprocess
- import imageio_ffmpeg
-
-
- # Define a fallback for environments without GPU
- if os.environ.get("SPACES_ZERO_GPU") is not None:
-     import spaces
- else:
-     class spaces:
-         @staticmethod
-         def GPU(func):
-             def wrapper(*args, **kwargs):
-                 return func(*args, **kwargs)
-             return wrapper
-
- # Ensure 'punkt' is downloaded for nltk
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

- # Download FFmpeg using imageio_ffmpeg (more robust)
- try:
-     imageio_ffmpeg.get_ffmpeg_exe()
-     print("FFmpeg downloaded successfully (if not already present).")
- except Exception as e:
-     print(f"Error downloading FFmpeg using imageio_ffmpeg: {e}")
-     raise
-
- description = "Video Story Generator with Audio \n PS: Generation of video by using Artificial Intelligence by dalle-mini and distilbart and gtss "
- title = "Video Story Generator with Audio by using dalle-mini and distilbart and gtss "

  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- model.to(device)
  print(f"Using device: {device}")

- def get_output_video(text):
-     print("Starting get_output_video function...")
-     inputs = tokenizer(text,
-                        max_length=1024,
-                        truncation=True,
-                        return_tensors="pt").to(device)
-     summary_ids = model.generate(inputs["input_ids"])
-     summary = tokenizer.batch_decode(summary_ids,
-                                      skip_special_tokens=True,
-                                      clean_up_tokenization_spaces=False)
-     plot = list(summary[0].split('.'))
-     print(f"Summarized plot: {plot}")
-
-     '''
-     The required models will be downloaded to models_root if they are not already there.
-     Set the dtype to torch.float16 to save GPU memory.
-     If you have an Ampere architecture GPU you can use torch.bfloat16.
-     Set the device to either "cuda" or "cpu". Once everything has finished initializing,
-     float32 is faster than float16 but uses more GPU memory.
-     '''
-
-     #@spaces.GPU(duration=60 * 3)
-     def generate_image(
-         is_mega: bool,
-         text: str,
-         seed: int,
-         grid_size: int,
-         top_k: int,
-         image_path: str,
-         models_root: str,
-         fp16: bool,
-     ):
-         print(f"Generating image for: {text}")
-         model = MinDalle(
-             is_mega=is_mega,
-             models_root=models_root,
-             is_reusable=True,
-             is_verbose=True,
-             dtype=torch.float16 if fp16 else torch.float32,  # ensures correct data type
-             device=device
-         )

-         # Ensure correct dtype for inputs
          image = model.generate_image(
              text,
              seed,
              grid_size,
-             top_k=top_k,
-             is_verbose=True
          )
-         print(f"Image generated successfully.")
-         return image

      generated_images = []
      for i, senten in enumerate(plot[:-1]):
-         print(f"Generating image {i+1} of {len(plot)-1}...")
-         try:
-             image = generate_image(
-                 is_mega=True,
-                 text=senten,
-                 seed=1,
-                 grid_size=1,  # param {type:"integer"}
-                 top_k=256,  # param {type:"integer"}
-                 image_path='generated',
-                 models_root='pretrained',
-                 fp16=True,
-             )
-             generated_images.append(image)
-             print(f"Image {i+1} generated and appended.")
-         except Exception as e:
-             print(f"Error generating image {i+1}: {e}")
-             raise
-
-     # Step 4- Creation of the subtitles
      sentences = plot[:-1]
-     num_sentences = len(sentences)
-     assert len(generated_images) == len(sentences), print('Something is wrong')
-
-     # We can generate our list of subtitles
-     from nltk import tokenize
-     c = 0
-     sub_names = []
-     for k in range(len(generated_images)):
-         subtitles = tokenize.sent_tokenize(sentences[k])
-         sub_names.append(subtitles)
-         print(f"Subtitles generated for image {k+1}: {subtitles}")
-
-     # Step 5- Adding Subtitles to the Images
      def draw_multiple_line_text(image, text, font, text_color, text_start_height):
          draw = ImageDraw.Draw(image)
          image_width, image_height = image.size
          y_text = text_start_height
          lines = textwrap.wrap(text, width=40)
          for line in lines:
-             line_width, line_height = font.getbbox(line)[2:4]  # Use getbbox for better size calculation
              draw.text(((image_width - line_width) / 2, y_text),
                        line, font=font, fill=text_color)
              y_text += line_height

      def add_text_to_img(text1, image_input):
-         '''
-         Testing draw_multiple_line_text
-         '''
-         image = image_input
-         fontsize = 20  # Increased font size
          path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
          if not os.path.exists(path_font):
-             # Try alternative location on different systems
              path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
-             if not os.path.exists(path_font):
-                 print("Font file not found. Subtitles might not be rendered correctly.")
-                 path_font = None
-         if path_font is not None:
-             try:
-                 font = ImageFont.truetype(path_font, fontsize)
-                 text_color = (255, 255, 0)
-                 text_start_height = 200
-                 draw_multiple_line_text(image, text1, font, text_color, text_start_height)
-             except Exception as e:
-                 print(f"Error loading or using font: {e}")
-
-         return image

      generated_images_sub = []
-     for k in range(len(generated_images)):
-         imagenes = generated_images[k].copy()
          text_to_add = sub_names[k][0]
-         result = add_text_to_img(text_to_add, imagenes)
          generated_images_sub.append(result)
-         print(f"Subtitles added to image {k+1}.")

-     # Step 7 - Creation of audio
-     c = 0
      mp3_names = []
      mp3_lengths = []
-     for k in range(len(generated_images)):
-         text_to_add = sub_names[k][0]
-         print(f"Generating audio for: {text_to_add}")
-         f_name = 'audio_' + str(c) + '.mp3'
          mp3_names.append(f_name)
-         # The text that you want to convert to audio
-         mytext = text_to_add
-         # Language in which you want to convert
-         language = 'en'
-         # Passing the text and language to the engine,
-         # here we have marked slow=False. Which tells
-         # the module that the converted audio should
-         # have a high speed
-         myobj = gTTS(text=mytext, lang=language, slow=False)
-         # Saving the converted audio in a mp3 file named
-         sound_file = f_name
-         myobj.save(sound_file)
-         audio = AudioSegment.from_file(sound_file, format="mp3")
-         duration = len(audio) / 1000
-         mp3_lengths.append(duration)
-         print(f"Audio duration: {duration} seconds")
-         c += 1
-
-     # Step 8 - Merge audio files
-     cwd = os.getcwd().replace(chr(92), '/')
-     export_path = 'result.mp3'
-     silence = AudioSegment.silent(duration=500)
-     full_audio = AudioSegment.empty()
-     for n, mp3_file in enumerate(mp3_names):
-         mp3_file = mp3_file.replace(chr(92), '/')
-         print(f"Merging audio file: {mp3_file}")
-         # Load the current mp3 into `audio_segment`
-         audio_segment = AudioSegment.from_mp3(mp3_file)
-         # Just accumulate the new `audio_segment` + `silence`
-         full_audio += audio_segment + silence
-         print(f'Merging audio {n+1} completed.')
-     # The loop will exit once all files in the list have been used
-     # Then export
-     full_audio.export(export_path, format='mp3')
-     print('\nAudio merging done!')
-
-     # Step 9 - Creation of the video with adjusted times of the sound
-     c = 0
-     file_names = []
-     for img in generated_images_sub:
-         f_name = 'img_' + str(c) + '.jpg'
-         file_names.append(f_name)
-         img.save(f_name)
-         print(f"Saving image: {f_name}")
-         c += 1
-     print(f"Image file names: {file_names}")

      clips = []
-     d = 0
-     for m in file_names:
-         duration = mp3_lengths[d]
-         print(f"Creating video clip {d+1} with duration: {duration} seconds")
-         clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
-         d += 1
-     concat_clip = mpe.concatenate_videoclips(clips, method="compose")
-     concat_clip.write_videofile("result_new.mp4", fps=24)
-     print("Video clips concatenated and saved as result_new.mp4")
-
-     # Step 10 - Merge Video + Audio
-     movie_name = 'result_new.mp4'
-     export_path = 'result.mp3'
      movie_final = 'result_final.mp4'

      def combine_audio(vidname, audname, outname, fps=24):
-         my_clip = mpe.VideoFileClip(vidname)
-         audio_background = mpe.AudioFileClip(audname)
          final_clip = my_clip.set_audio(audio_background)
          final_clip.write_videofile(outname, fps=fps)

-     combine_audio(movie_name, export_path, movie_final)  # create a new file
-     print("Video and audio merged successfully!")

-     # Cleanup intermediate files
-     for f in file_names:
-         os.remove(f)
-     for f in mp3_names:
-         os.remove(f)
-     os.remove("result_new.mp4")
      os.remove("result.mp3")
-     print("Intermediate files cleaned up.")

-     print("Finished get_output_video function.")
      return 'result_final.mp4'

  text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'
- demo = gr.Blocks()

  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
-     gr.Markdown(
-         "A story can be input by user. The story is summarized using DistillBART model. Then, then it is generated the images by using Dalle-mini and created the subtitles and audio gtts. These are generated as a video.")
      with gr.Row():
-         # Left column (inputs)
          with gr.Column():
-             input_start_text = gr.Textbox(value=text,
-                                           label="Type your story here, for now a sample story is added already!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
-         # Right column (outputs)
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
              gr.Markdown("<h3>Future Works </h3>")
-             gr.Markdown(
-                 "This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

- demo.launch(debug=True)

+ import moviepy.editor as mpy
+ from PIL import Image
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import gradio as gr
  import torch
  from min_dalle import MinDalle
+ from huggingface_hub import snapshot_download
+ from PIL import Image, ImageDraw, ImageFont
+ import textwrap
+ from mutagen.mp3 import MP3
  from gtts import gTTS
  from pydub import AudioSegment
  import os
  import glob
+ import nltk
  import subprocess
+ import shutil
+ import matplotlib.pyplot as plt
+ import gc  # Import the garbage collector
+ from audio import *
+ # Download necessary NLTK data
  try:
      nltk.data.find('tokenizers/punkt')
  except LookupError:
      nltk.download('punkt')

+ description = "Video Story Generator with Audio \n PS: Generation of video by using Artificial Intelligence by dalle-mini and distilbart and gTTS "
+ title = "Video Story Generator with Audio by using dalle-mini and distilbart and gTTS "

+ # Load tokenizer and model for text summarization
  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+
+ # Check for CUDA availability and set device
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")
+ model.to(device)

+ # Function to log GPU memory (optional, for debugging)
+ def log_gpu_memory():
+     if torch.cuda.is_available():
+         print(subprocess.check_output('nvidia-smi').decode('utf-8'))
+     else:
+         print("CUDA is not available. Cannot log GPU memory.")
+
+ # --------- MinDalle Image Generation Functions ---------
+
+ # Load MinDalle model once
+ def load_min_dalle_model(models_root: str = 'pretrained', fp16: bool = True):
+     """
+     Load the MinDalle model.
+
+     Args:
+         models_root: Path to the directory containing MinDalle models.
+         fp16: Whether to use float16 for faster generation (requires CUDA).
+
+     Returns:
+         An instance of the MinDalle model.
+     """
+     print("DEBUG: Loading MinDalle model...")
+     return MinDalle(
+         is_mega=True,
+         models_root=models_root,
+         is_reusable=False,  # Set is_reusable to False
+         is_verbose=True,
+         dtype=torch.float16 if fp16 else torch.float32,
+         device=device
+     )
+
+ # Initialize the MinDalle model
+ min_dalle_model = load_min_dalle_model()
+
+
+
+ def generate_image_with_min_dalle(
+     model: MinDalle,
+     text: str,
+     seed: int = -1,
+     grid_size: int = 1
+ ):
+     """
+     Generates an image from text using MinDalle.
+
+     Args:
+         model: The preloaded MinDalle model.
+         text: The text prompt to generate the image from.
+         seed: The random seed for image generation. -1 for random.
+         grid_size: The grid size for multiple image generation.
+
+     Returns:
+         A PIL Image object.
+     """
+     print(f"DEBUG: Generating image with MinDalle for text: '{text}'")
+     model.is_reusable = False
+     with torch.no_grad():
          image = model.generate_image(
              text,
              seed,
              grid_size,
+             is_verbose=False
          )

+     # Clear GPU memory after generation
+     torch.cuda.empty_cache()
+     gc.collect()
+
+     print("DEBUG: Image generated successfully.")
+     return image
+
+
+ # --------- End of MinDalle Functions ---------
+ # Merge audio files
+
+ from pydub import AudioSegment
+ import os
+
+
+
+
+
+ # Function to generate video from text
+ def get_output_video(text):
+     print("DEBUG: Starting get_output_video function...")
+
+     # Summarize the input text
+     print("DEBUG: Summarizing text...")
+     inputs = tokenizer(
+         text,
+         max_length=1024,
+         truncation=True,
+         return_tensors="pt"
+     ).to(device)
+     summary_ids = model.generate(inputs["input_ids"])
+     summary = tokenizer.batch_decode(
+         summary_ids,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False
+     )
+     plot = list(summary[0].split('.'))
+     print(f"DEBUG: Summary generated: {plot}")
+
+     # Generate images for each sentence in the plot
      generated_images = []
      for i, senten in enumerate(plot[:-1]):
+         print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
+         image_dir = f"image_{i}"
+         os.makedirs(image_dir, exist_ok=True)
+
+         min_dalle_model = load_min_dalle_model()
+
+         image = generate_image_with_min_dalle(
+             min_dalle_model,
+             text=senten,
+             seed=1,
+             grid_size=1
+         )
+         generated_images.append(image)
+         image_path = os.path.join(image_dir, "generated_image.png")
+         image.save(image_path)
+         print(f"DEBUG: Image generated and saved to {image_path}")
+
+         del min_dalle_model
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     # Create subtitles from the plot
      sentences = plot[:-1]
+     print("DEBUG: Creating subtitles...")
+     assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
+     sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
+
+     # Add subtitles to images
      def draw_multiple_line_text(image, text, font, text_color, text_start_height):
          draw = ImageDraw.Draw(image)
          image_width, image_height = image.size
          y_text = text_start_height
          lines = textwrap.wrap(text, width=40)
          for line in lines:
+             line_width, line_height = font.getbbox(line)[2:]
              draw.text(((image_width - line_width) / 2, y_text),
                        line, font=font, fill=text_color)
              y_text += line_height

      def add_text_to_img(text1, image_input):
+         print(f"DEBUG: Adding text to image: '{text1}'")
+         fontsize = 30
          path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
          if not os.path.exists(path_font):
              path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
+
+         font = ImageFont.truetype(path_font, fontsize)
+         text_color = (255, 255, 0)
+         text_start_height = image_input.height - (fontsize * len(textwrap.wrap(text1, width=40))) - 20
+         draw_multiple_line_text(image_input, text1, font, text_color, text_start_height)
+         return image_input

      generated_images_sub = []
+     for k, image in enumerate(generated_images):
          text_to_add = sub_names[k][0]
+         result = add_text_to_img(text_to_add, image.copy())
          generated_images_sub.append(result)
+         result.save(f"image_{k}/generated_image_with_subtitles.png")

+     # Generate audio for each subtitle
      mp3_names = []
      mp3_lengths = []
+     for k, text_to_add in enumerate(sub_names):
+         print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
+         f_name = f'audio_{k}.mp3'
          mp3_names.append(f_name)
+         myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
+         myobj.save(f_name)
+         audio = MP3(f_name)
+         mp3_lengths.append(audio.info.length)
+         print(f"DEBUG: Audio duration: {audio.info.length} seconds")

+     # Merge audio files
+     export_path = merge_audio_files(mp3_names)
+
+     # Create video clips from images
      clips = []
+     for k, img in enumerate(generated_images_sub):
+         duration = mp3_lengths[k]
+         print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
+         clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
+         clips.append(clip)
+
+     # Concatenate video clips
+     print("DEBUG: Concatenating video clips...")
+     concat_clip = mpy.concatenate_videoclips(clips, method="compose")
+     concat_clip.write_videofile("result_no_audio.mp4", fps=24)
+
+     # Combine video and audio
+     movie_name = 'result_no_audio.mp4'
      movie_final = 'result_final.mp4'

      def combine_audio(vidname, audname, outname, fps=24):
+         print(f"DEBUG: Combining audio for video: '{vidname}'")
+         my_clip = mpy.VideoFileClip(vidname)
+         audio_background = mpy.AudioFileClip(audname)
          final_clip = my_clip.set_audio(audio_background)
          final_clip.write_videofile(outname, fps=fps)

+     combine_audio(movie_name, export_path, movie_final)

+     # Clean up
+     print("DEBUG: Cleaning up files...")
+     for i in range(len(generated_images_sub)):
+         shutil.rmtree(f"image_{i}")
+         os.remove(f"audio_{i}.mp3")
      os.remove("result.mp3")
+     os.remove("result_no_audio.mp4")

+     print("DEBUG: Cleanup complete.")
+     print("DEBUG: get_output_video function completed successfully.")
      return 'result_final.mp4'

+
+
+ # Example text (can be changed by user in Gradio interface)
  text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'

+ # Create Gradio interface
+ demo = gr.Blocks()
  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
+     gr.Markdown("A story can be input by user. The story is summarized using DistilBART model. Then, the images are generated by using Dalle-mini, and the subtitles and audio are created using gTTS. These are combined to generate a video.")
      with gr.Row():
          with gr.Column():
              input_start_text = gr.Textbox(value=text, label="Type your story here, for now a sample story is added already!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
              gr.Markdown("<h3>Future Works </h3>")
+             gr.Markdown("This program is a text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2. For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

+ # Launch the Gradio app
+ demo.launch(debug=True, share=False)
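
Note: the updated app.py imports a local audio module (`from audio import *`) and calls `merge_audio_files(mp3_names)`, which is not part of this diff. Judging by the merging logic removed from the old version (pydub concatenation with 500 ms of silence between clips, exported to result.mp3, which the cleanup step still deletes), the helper presumably looks roughly like the sketch below; the signature and default arguments here are assumptions, not the committed code.

# Hypothetical sketch of audio.py (not included in this commit).
# Assumed: merge_audio_files takes the list of per-sentence MP3 paths and
# returns the path of the merged file, mirroring the pydub logic removed above.
from pydub import AudioSegment

def merge_audio_files(mp3_names, export_path='result.mp3', gap_ms=500):
    """Concatenate the MP3 files with a short silence between them and export the result."""
    silence = AudioSegment.silent(duration=gap_ms)
    full_audio = AudioSegment.empty()
    for mp3_file in mp3_names:
        full_audio += AudioSegment.from_mp3(mp3_file) + silence
    full_audio.export(export_path, format='mp3')
    return export_path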