ruslanmv committed on
Commit 6668dc9 · verified · 1 Parent(s): ec331dc

Update app.py

Files changed (1):
  1. app.py +238 -213

app.py CHANGED
@@ -1,240 +1,265 @@
- # Step 2 - Importing Libraries
- from moviepy.editor import *
- from PIL import Image
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  import gradio as gr
- import torch, torch.backends.cudnn, torch.backends.cuda
- from min_dalle import MinDalle
- from huggingface_hub import snapshot_download
  from PIL import Image, ImageDraw, ImageFont
- import textwrap
- from mutagen.mp3 import MP3
  from gtts import gTTS
  from pydub import AudioSegment
- from os import getcwd
- import glob
  import nltk
- import subprocess
- nltk.download('punkt')
- description = " Video Story Generator with Audio \n PS: Generation of video by using Artifical Intellingence by dalle-mini and distilbart and gtss "

  title = "Video Story Generator with Audio by using dalle-mini and distilbart and gtss "
  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
-
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(device)

- # def log_gpu_memory():
- #     print(subprocess.check_output('nvidia-smi').decode('utf-8'))
- # log_gpu_memory()
-

  def get_output_video(text):
-     inputs = tokenizer(text,
-                        max_length=1024,
-                        truncation=True,
-                        return_tensors="pt")
-
-     summary_ids = model.generate(inputs["input_ids"])
-     summary = tokenizer.batch_decode(summary_ids,
-                                      skip_special_tokens=True,
-                                      clean_up_tokenization_spaces=False)
-     plot = list(summary[0].split('.'))
-
-     '''
-     The required models will be downloaded to models_root if they are not already there.
-     Set the dtype to torch.float16 to save GPU memory.
-     If you have an Ampere architecture GPU you can use torch.bfloat16.
-     Set the device to either "cuda" or "cpu". Once everything has finished initializing,
-     float32 is faster than float16 but uses more GPU memory.
-     '''
-
-     def generate_image(
-             is_mega: bool,
-             text: str,
-             seed: int,
-             grid_size: int,
-             top_k: int,
-             image_path: str,
-             models_root: str,
-             fp16: bool,):
-         model = MinDalle(
-             is_mega=is_mega,
-             models_root=models_root,
-             is_reusable=True,
-             is_verbose=True,
-             dtype=torch.float16 if fp16 else torch.float32,  # param ["float32", "float16", "bfloat16"]
-             # device='cuda'  # 'cpu'
-         )
-         # log_gpu_memory()
-
-         image = model.generate_image(
-             text,
-             seed,
-             grid_size,
-             top_k=top_k,
-             is_verbose=True
-         )
-
-         return image
-
-     generated_images = []
-     for senten in plot[:-1]:
-         image = generate_image(
-             is_mega=True,
-             text=senten,
-             seed=1,
-             grid_size=1,  # param {type:"integer"}
-             top_k=256,  # param {type:"integer"}
-             image_path='generated',
-             models_root='pretrained',
-             fp16=256,)
-         generated_images.append(image)
-
-     # Step 4 - Creation of the subtitles
-     sentences = plot[:-1]
-     num_sentences = len(sentences)
-     assert len(generated_images) == len(sentences), print('Something is wrong')
-     # We can generate our list of subtitles
-     from nltk import tokenize
-     c = 0
-     sub_names = []
-     for k in range(len(generated_images)):
-         subtitles = tokenize.sent_tokenize(sentences[k])
-         sub_names.append(subtitles)
-
-     # Step 5 - Adding Subtitles to the Images
-     def draw_multiple_line_text(image, text, font, text_color, text_start_height):
-         draw = ImageDraw.Draw(image)
-         image_width, image_height = image.size
-         y_text = text_start_height
-         lines = textwrap.wrap(text, width=40)
-         for line in lines:
-             line_width, line_height = font.getsize(line)
-             draw.text(((image_width - line_width) / 2, y_text),
-                       line, font=font, fill=text_color)
-             y_text += line_height
-
-     def add_text_to_img(text1, image_input):
-         '''
-         Testing draw_multiple_line_text
-         '''
-         image = image_input
-         fontsize = 13  # starting font size
-         path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
-         font = ImageFont.truetype(path_font, fontsize)
-         text_color = (255, 255, 0)
-         text_start_height = 200
-         draw_multiple_line_text(image, text1, font, text_color, text_start_height)
-         return image
-
-     generated_images_sub = []
-     for k in range(len(generated_images)):
-         imagenes = generated_images[k].copy()
-         text_to_add = sub_names[k][0]
-         result = add_text_to_img(text_to_add, imagenes)
-         generated_images_sub.append(result)
-     # Step 7 - Creation of audio
-     c = 0
-     mp3_names = []
-     mp3_lengths = []
-     for k in range(len(generated_images)):
-         text_to_add = sub_names[k][0]
-         print(text_to_add)
-         f_name = 'audio_' + str(c) + '.mp3'
-         mp3_names.append(f_name)
-         # The text that you want to convert to audio
-         mytext = text_to_add
-         # Language in which you want to convert
-         language = 'en'
-         # Passing the text and language to the engine,
-         # here we have marked slow=False. Which tells
-         # the module that the converted audio should
-         # have a high speed
-         myobj = gTTS(text=mytext, lang=language, slow=False)
-         # Saving the converted audio in a mp3 file named
-         sound_file = f_name
-         myobj.save(sound_file)
-         audio = MP3(sound_file)
-         duration = audio.info.length
-         mp3_lengths.append(duration)
-         print(audio.info.length)
-         c += 1
-
-     # Step 8 - Merge audio files
-     cwd = (getcwd()).replace(chr(92), '/')
-     # export_path = f'{cwd}/result.mp3'
-     export_path = 'result.mp3'
-     MP3_FILES = glob.glob(pathname=f'{cwd}/*.mp3', recursive=True)
-     silence = AudioSegment.silent(duration=500)
-     full_audio = AudioSegment.empty()  # this will accumulate the entire mp3 audios
-     for n, mp3_file in enumerate(mp3_names):
-         mp3_file = mp3_file.replace(chr(92), '/')
-         print(n, mp3_file)
-
-         # Load the current mp3 into `audio_segment`
-         audio_segment = AudioSegment.from_mp3(mp3_file)
-
-         # Just accumulate the new `audio_segment` + `silence`
-         full_audio += audio_segment + silence
-         print('Merging ', n)
-
-     # The loop will exit once all files in the list have been used
-     # Then export
-     full_audio.export(export_path, format='mp3')
-     print('\ndone!')
-
-     # Step 9 - Creation of the video with adjusted times of the sound
-     c = 0
-     file_names = []
-     for img in generated_images_sub:
-         f_name = 'img_' + str(c) + '.jpg'
-         file_names.append(f_name)
-         img = img.save(f_name)
-         c += 1
-     print(file_names)
-     clips = []
-     d = 0
-     for m in file_names:
-         duration = mp3_lengths[d]
-         print(d, duration)
-         clips.append(ImageClip(m).set_duration(duration + 0.5))
-         d += 1
-     concat_clip = concatenate_videoclips(clips, method="compose")
-     concat_clip.write_videofile("result_new.mp4", fps=24)
-
-     # Step 10 - Merge Video + Audio
-     movie_name = 'result_new.mp4'
-     export_path = 'result.mp3'
-     movie_final = 'result_final.mp4'
-
-     def combine_audio(vidname, audname, outname, fps=60):
-         import moviepy.editor as mpe
-         my_clip = mpe.VideoFileClip(vidname)
-         audio_background = mpe.AudioFileClip(audname)
-         final_clip = my_clip.set_audio(audio_background)
-         final_clip.write_videofile(outname, fps=fps)
-     combine_audio(movie_name, export_path, movie_final)  # create a new file
-     return 'result_final.mp4'
- text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'

  demo = gr.Blocks()
  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
-     gr.Markdown("A story can be input by user. The story is summarized using DistillBART model. Then, then it is generated the images by using Dalle-mini and created the subtitles and audio gtts. These are generated as a video.")
      with gr.Row():
          # Left column (inputs)
          with gr.Column():
-
-             input_start_text = gr.Textbox(value=text, label="Type your story here, for now a sample story is added already!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
          # Right column (outputs)
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
      gr.Markdown("<h3>Future Works</h3>")
-     gr.Markdown("This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
-
  demo.launch(debug=False)

  import gradio as gr
+ import torch
+ from moviepy.editor import *
  from PIL import Image, ImageDraw, ImageFont
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from min_dalle import MinDalle
  from gtts import gTTS
  from pydub import AudioSegment
  import nltk
+ import textwrap
+ import os
+ import glob
+
+ # Ensure 'punkt' is downloaded for nltk
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+
+ # Download ffmpeg if not found (using a more robust method)
+ try:
+     from imageio_ffmpeg import get_ffmpeg_exe
+     get_ffmpeg_exe()
+ except Exception as e:
+     print(f"Error downloading ffmpeg: {e}")
+     print("Attempting to download ffmpeg using a different method...")
+     try:
+         import imageio
+         imageio.plugins.ffmpeg.download(directory=os.path.join(os.path.expanduser("~"), ".imageio"))
+         print("ffmpeg downloaded successfully.")
+     except Exception as e:
+         print(f"Failed to download ffmpeg: {e}")
+         print("Please ensure you have an internet connection and that imageio and imageio_ffmpeg are installed.")
+         raise
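+ # Note: imageio.plugins.ffmpeg.download() was removed from imageio in 2.5+,
+ # so on recent installs only the imageio_ffmpeg path above can succeed; the
+ # fallback is kept for older imageio versions.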
+
+
+ description = " Video Story Generator with Audio \n PS: Generation of video by using Artificial Intelligence by dalle-mini and distilbart and gtss "
  title = "Video Story Generator with Audio by using dalle-mini and distilbart and gtss "
+
  tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
  model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ model.to(device)
  print(device)
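+ # Everything below runs inside get_output_video: summarize the story,
+ # render one image per sentence, caption it, synthesize narration, and
+ # assemble the final video.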


  def get_output_video(text):
+     inputs = tokenizer(text,
+                        max_length=1024,
+                        truncation=True,
+                        return_tensors="pt").to(device)
+     summary_ids = model.generate(inputs["input_ids"])
+     summary = tokenizer.batch_decode(summary_ids,
+                                      skip_special_tokens=True,
+                                      clean_up_tokenization_spaces=False)
+     plot = list(summary[0].split('.'))
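+     # Splitting on '.' leaves an empty string after the final period,
+     # hence the plot[:-1] slices used below.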
+
+     '''
+     The required models will be downloaded to models_root if they are not already there.
+     Set the dtype to torch.float16 to save GPU memory.
+     If you have an Ampere architecture GPU you can use torch.bfloat16.
+     Set the device to either "cuda" or "cpu". Once everything has finished initializing,
+     float32 is faster than float16 but uses more GPU memory.
+     '''
+
+     def generate_image(
+             is_mega: bool,
+             text: str,
+             seed: int,
+             grid_size: int,
+             top_k: int,
+             image_path: str,
+             models_root: str,
+             fp16: bool,
+     ):
+         model = MinDalle(
+             is_mega=is_mega,
+             models_root=models_root,
+             is_reusable=True,
+             is_verbose=True,
+             dtype=torch.float16 if fp16 else torch.float32,
+             device=device
+         )
+
+         image = model.generate_image(
+             text,
+             seed,
+             grid_size,
+             top_k=top_k,
+             is_verbose=True
+         )
+         return image
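+     # MinDalle fetches its weights into models_root on first use; with
+     # grid_size=1, generate_image returns a single PIL image.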
+
+     generated_images = []
+     for senten in plot[:-1]:
+         image = generate_image(
+             is_mega=True,
+             text=senten,
+             seed=1,
+             grid_size=1,  # param {type:"integer"}
+             top_k=256,  # param {type:"integer"}
+             image_path='generated',
+             models_root='pretrained',
+             fp16=True,
+         )
+         generated_images.append(image)
+
+     # Step 4 - Creation of the subtitles
+     sentences = plot[:-1]
+     num_sentences = len(sentences)
+     assert len(generated_images) == len(sentences), 'Something is wrong'
+     # We can generate our list of subtitles
+     from nltk import tokenize
+     c = 0
+     sub_names = []
+     for k in range(len(generated_images)):
+         subtitles = tokenize.sent_tokenize(sentences[k])
+         sub_names.append(subtitles)
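+     # Each entry of sub_names is a list of sentences; only the first
+     # sentence is used as the caption and TTS text further below.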
+
+     # Step 5 - Adding Subtitles to the Images
+     def draw_multiple_line_text(image, text, font, text_color, text_start_height):
+         draw = ImageDraw.Draw(image)
+         image_width, image_height = image.size
+         y_text = text_start_height
+         lines = textwrap.wrap(text, width=40)
+         for line in lines:
+             line_width, line_height = font.getsize(line)
+             draw.text(((image_width - line_width) / 2, y_text),
+                       line, font=font, fill=text_color)
+             y_text += line_height
+
+     def add_text_to_img(text1, image_input):
+         '''
+         Testing draw_multiple_line_text
+         '''
+         image = image_input
+         fontsize = 13  # starting font size
+         path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
+         if not os.path.exists(path_font):
+             # Try an alternative location on other systems
+             path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
+             if not os.path.exists(path_font):
+                 print("Font file not found. Subtitles might not be rendered correctly.")
+                 path_font = None
+
+         if path_font is not None:
+             try:
+                 font = ImageFont.truetype(path_font, fontsize)
+                 text_color = (255, 255, 0)
+                 text_start_height = 200
+                 draw_multiple_line_text(image, text1, font, text_color, text_start_height)
+             except Exception as e:
+                 print(f"Error loading or using font: {e}")
+
+         return image
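+     # Note: these helpers rely on ImageFont.getsize(), which was removed in
+     # Pillow 10; on newer Pillow, font.getbbox() or draw.textbbox() would be
+     # needed instead.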
+
+     generated_images_sub = []
+     for k in range(len(generated_images)):
+         imagenes = generated_images[k].copy()
+         text_to_add = sub_names[k][0]
+         result = add_text_to_img(text_to_add, imagenes)
+         generated_images_sub.append(result)
+
+     # Step 7 - Creation of audio
+     c = 0
+     mp3_names = []
+     mp3_lengths = []
+     for k in range(len(generated_images)):
+         text_to_add = sub_names[k][0]
+         print(text_to_add)
+         f_name = 'audio_' + str(c) + '.mp3'
+         mp3_names.append(f_name)
+         # The text that you want to convert to audio
+         mytext = text_to_add
+         # Language in which you want to convert
+         language = 'en'
+         # Pass the text and language to the engine; slow=False tells
+         # the module to read the text at normal speed
+         myobj = gTTS(text=mytext, lang=language, slow=False)
+         # Save the converted audio to an mp3 file
+         sound_file = f_name
+         myobj.save(sound_file)
+         audio = AudioSegment.from_file(sound_file, format="mp3")
+         duration = len(audio) / 1000
+         mp3_lengths.append(duration)
+         print(duration)
+         c += 1
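+     # len() of a pydub AudioSegment is its duration in milliseconds, so
+     # len(audio) / 1000 gives seconds; this replaces the old mutagen.mp3
+     # dependency used to read MP3 lengths.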
+
+     # Step 8 - Merge audio files
+     cwd = os.getcwd().replace(chr(92), '/')
+     export_path = 'result.mp3'
+     silence = AudioSegment.silent(duration=500)
+     full_audio = AudioSegment.empty()
+
+     for n, mp3_file in enumerate(mp3_names):
+         mp3_file = mp3_file.replace(chr(92), '/')
+         print(n, mp3_file)
+         # Load the current mp3 into `audio_segment`
+         audio_segment = AudioSegment.from_mp3(mp3_file)
+         # Just accumulate the new `audio_segment` + `silence`
+         full_audio += audio_segment + silence
+         print('Merging ', n)
+     # The loop will exit once all files in the list have been used
+     # Then export
+     full_audio.export(export_path, format='mp3')
+     print('\ndone!')
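+     # The 500 ms of silence appended after each clip mirrors the extra
+     # 0.5 s added to every ImageClip below, keeping audio and video aligned.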
+
+     # Step 9 - Creation of the video with adjusted times of the sound
+     c = 0
+     file_names = []
+     for img in generated_images_sub:
+         f_name = 'img_' + str(c) + '.jpg'
+         file_names.append(f_name)
+         img.save(f_name)
+         c += 1
+     print(file_names)
+
+     clips = []
+     d = 0
+     for m in file_names:
+         duration = mp3_lengths[d]
+         print(d, duration)
+         clips.append(ImageClip(m).set_duration(duration + 0.5))
+         d += 1
+     concat_clip = concatenate_videoclips(clips, method="compose")
+     concat_clip.write_videofile("result_new.mp4", fps=24)
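+     # method="compose" lets concatenate_videoclips handle clips of
+     # different sizes by compositing them on a common canvas.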
+
+     # Step 10 - Merge Video + Audio
+     movie_name = 'result_new.mp4'
+     export_path = 'result.mp3'
+     movie_final = 'result_final.mp4'
+
+     def combine_audio(vidname, audname, outname, fps=24):
+         import moviepy.editor as mpe
+         my_clip = mpe.VideoFileClip(vidname)
+         audio_background = mpe.AudioFileClip(audname)
+         final_clip = my_clip.set_audio(audio_background)
+         final_clip.write_videofile(outname, fps=fps)
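+     # set_audio() attaches the merged narration track to the silent
+     # image-sequence video; the result is re-encoded at the given fps.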
+
+     combine_audio(movie_name, export_path, movie_final)  # create a new file
+     return 'result_final.mp4'
+
+
+ text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'
  demo = gr.Blocks()
  with demo:
      gr.Markdown("# Video Generator from stories with Artificial Intelligence")
+     gr.Markdown(
+         "Type a story and the app turns it into a video: the story is summarized with the DistilBART model, an image is generated for each sentence with dalle-mini, and subtitles and audio are added with gTTS.")
      with gr.Row():
          # Left column (inputs)
          with gr.Column():
+             input_start_text = gr.Textbox(value=text,
+                                           label="Type your story here; a sample story is already filled in!")
              with gr.Row():
                  button_gen_video = gr.Button("Generate Video")
          # Right column (outputs)
          with gr.Column():
              output_interpolation = gr.Video(label="Generated Video")
      gr.Markdown("<h3>Future Works</h3>")
+     gr.Markdown(
+         "This text-to-video program generates a video from any prompt and can be used to build an AI art gallery. A future version will use Dalle-2. For more info visit [ruslanmv.com](https://ruslanmv.com/)")
      button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

  demo.launch(debug=False)