Final updates
app.py
CHANGED
@@ -1,8 +1,6 @@
 import gradio as gr
 import moviepy.video.io.ImageSequenceClip
-#image_folder= '/content/gdrive/My Drive/AI/sample_imgs/'
 from PIL import Image
-#import os, sys
 from pydub import AudioSegment
 # Import everything needed to edit video clips
 from moviepy.editor import *
@@ -11,8 +9,6 @@ import mutagen
 from mutagen.mp3 import MP3
 import cv2
 
-#path = "/content/gdrive/My Drive/AI/sample_imgs/"
-#dirs = os.listdir( path )
 
 def resize(img_list):
   print("** inside resize **")
@@ -25,28 +21,6 @@ def resize(img_list):
   print(type(resize_img_list[0]))
   return resize_img_list
 
-#def resize():
-#    for item in dirs:
-#        if os.path.isfile(path+item):
-#            im = Image.open(path+item)
-#            f, e = os.path.splitext(path+item)
-#            imResize = im.resize((256,256), Image.ANTIALIAS)
-#            imResize.save(f + ' resized.jpg', 'JPEG', quality=90)
-
-#resize_img_list = resize(img_list)
-
-
-#image_files = [os.path.join(image_folder,img)
-#               for img in resize_img_list
-#               if img.endswith(".jpg")]
-#print(image_files)
-
-#def images_to_video(fps, resize_img_list):
-#  clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
-#  return clip
-#clip.write_videofile('/content/gdrive/My Drive/AI/my_vid_20apr.mp4')
-
-#gradio.inputs.Audio(self, source="upload", type="numpy", label=None, optional=False)
 
 def merge_audio_video(entities_num, resize_img_list, text_input):
   print("** inside merge aud vid **")
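Note on the surviving resize(): the block deleted above was the old Colab-era, folder-based version. The live function works on an in-memory image list instead. A minimal sketch of the equivalent logic, assuming img_list holds PIL images (the function name here is illustrative, not from the source):

```python
from PIL import Image

def resize_sketch(img_list, size=(256, 256)):
    # Resize each PIL image to a fixed square size so every video
    # frame has identical dimensions.
    # Image.ANTIALIAS (used in the deleted code) is deprecated in
    # recent Pillow releases; Image.LANCZOS is the same filter.
    return [img.resize(size, Image.LANCZOS) for img in img_list]
```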
@@ -68,7 +42,7 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
   audio_length = int(MP3("audio.mp3").info.length)
 
   #Calculate the desired frame per second based on given audio length and entities identified
-  fps= entities_num / audio_length
+  fps= entities_num / audio_length #length of audio file
   fps = float(format(fps, '.5f'))
   print('fps is: ',fps)
 
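The changed line appears to only add a trailing comment, but the arithmetic is worth spelling out: with one generated image per entity, fps = entities_num / audio_length makes the slideshow exactly as long as the narration, i.e. each image is held for audio_length / entities_num seconds. A quick worked check (the numbers are illustrative):

```python
from mutagen.mp3 import MP3

# Duration comes from the MP3 header via mutagen, as in the hunk above.
audio_length = int(MP3("audio.mp3").info.length)  # say, 13 s of speech

entities_num = 5                                  # say, 5 entity images
fps = float(format(entities_num / audio_length, '.5f'))
# fps = 0.38462 -> 5 frames / 0.38462 fps is roughly 13 s of video,
# so each image stays on screen for about 2.6 s.
```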
@@ -88,8 +62,6 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
   # adding audio to the video clip
   mergedclip = videoclip.set_audio(audioclip)
   print('video and audio merged')
-  # showing video clip
-  #videoclip.ipython_display()
 
   #Getting size and frame count of merged video file
   print('Getting size and frame count of merged video file')
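For context, set_audio() does not mutate videoclip; it returns a new clip carrying the audio track, which is why the result is assigned to mergedclip. A minimal sketch of the whole string-and-dub step under the MoviePy 1.x API imported at the top of this file (file names and the fps value are illustrative):

```python
import moviepy.video.io.ImageSequenceClip
from moviepy.editor import AudioFileClip

# One frame per entity image, timed to match the narration length.
videoclip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(
    ["ent0.jpg", "ent1.jpg"], fps=0.38462)
audioclip = AudioFileClip("audio.mp3")
mergedclip = videoclip.set_audio(audioclip)    # returns a copy with audio attached
mergedclip.write_videofile("mergedvideo.mp4")  # to_videofile() is the older alias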
@@ -120,7 +92,6 @@ def engine(text_input):
   entities = [tupl for tupl in entities if None not in tupl]
   entities_num = len(entities)
 
-  #img = run(text_input,'50','256','256','1',10) #entities[0][0]
   #Generate images using multimodelart's space for each entity identified above
   img_list = []
   for ent in entities:
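The filter on the first context line matters because the NER step can yield incomplete tuples; dropping any tuple containing None keeps entities_num in sync with the number of images actually generated. A small illustration (the sample data is made up):

```python
# Suppose the NER model returned (word, label) pairs, one of them incomplete:
entities = [("George", "PER"), ("Easter", None), ("Canada", "LOC")]
entities = [tupl for tupl in entities if None not in tupl]
entities_num = len(entities)  # 2 -> two entity images will be generated
```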
@@ -132,65 +103,22 @@ def engine(text_input):
   resize_img_list = resize(img_list)
   print('back from resize')
 
-
-  #speech = text2speech(text_input)
-  #print('back in engine')
-
-  #getting audio clip's duration
-  #audio_length = int(WAVE(speech).info.length)
-
-  #Calculate the desired frame per second based on given audio length and entities identified
-  #fps= entities_num / audio_length #19 #length of audio file #13 / 19
-  #fps = float(format(fps, '.5f'))
-  #print('fps is: ',fps)
-
-  #Convert string of images into a video
-  #clip = images_to_video(fps, resize_img_list)
-
+
   #Merge video and audio created above
   mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
   print('Back in engine')
   print(' merged clip type :',type(mergedclip))
   print('Writing the merged video clip to a file')
   mergedclip.to_videofile('mergedvideo.mp4')
-  print('mergedvideo.mp4 created')
-  #{'prompt':text_input,'steps':'50','width':'256','height':'256','images':'1','scale':10}).launch()
-  #img_intfc = gr.Interface.load("spaces/multimodalart/latentdiffusion", inputs=[gr.inputs.Textbox(lines=1, label="Input Text"), gr.inputs.Textbox(lines=1, label="Input Text"), gr.inputs.Textbox(lines=1, label="Input Text"), gr.inputs.Textbox(lines=1, label="Input Text"), gr.inputs.Textbox(lines=1, label="Input Text"), gr.inputs.Textbox(lines=1, label="Input Text")],
-  #outputs=[gr.outputs.Image(type="pil", label="output image"),gr.outputs.Carousel(label="Individual images",components=["image"]),gr.outputs.Textbox(label="Error")], )
-  #title="Convert text to image")
-  #img = img_intfc[0]
-  #img = img_intfc(text_input,'50','256','256','1',10)
-  #print(img)
-  #print(type(img))
-  #print(img)
-  #print(type(img[1][0][0]))
-  #print(img[1])
-  #img = img[0]
-  #inputs=['George',50,256,256,1,10]
-  #run(prompt, steps, width, height, images, scale)
+  print('mergedvideo.mp4 created')
 
-
-  return 'mergedvideo.mp4' #img, entities, speech
+  return 'mergedvideo.mp4'
 
-  #image = gr.outputs.Image(type="pil", label="output image")
 app = gr.Interface(engine,
    gr.inputs.Textbox(lines=5, label="Input Text"),
    gr.outputs.Video(type=None, label='Final Merged video'),
-
-
-   #outputs=[#gr.outputs.Textbox(type="auto", label="Text"),gr.outputs.Audio(type="file", label="Speech Answer"),
-   #outputs= img, #gr.outputs.Carousel(label="Individual images",components=["image"]), #, gr.outputs.Textbox(label="Error")],
-   examples = ['Apple'],
-   description="Takes a text as input and reads it out to you."
-   #examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones."]
+   description="Takes a text as input, extracts the entities in it, generate images using multimodalart space for every entity separately. Also, generates speech from input-text using facebook's fastspeech2-en-ljspeech from hub. Creates a video by stringing all the entity-images together. Fuses the AI generated audio and video together to create a coherent movie for you to watch. A fun little app that lets you turn your text to video (well, in some ways atleast :) )",
+   examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones.", "Apple"]
 ).launch(enable_queue=True, debug=True)
 
-
+
-#get_audio = gr.Button("generate audio")
-#get_audio.click(text2speech, inputs=text, outputs=speech)
-
-#def greet(name):
-#    return "Hello " + name + "!!"
-
-#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-#iface.launch()
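The Interface call above uses the Gradio 2.x-era API (the gr.inputs./gr.outputs. namespaces and enable_queue=). Those names are gone on current Gradio releases; a sketch of the equivalent wiring on Gradio 3+, assuming the same engine() signature (text in, video file path out):

```python
import gradio as gr

app = gr.Interface(
    fn=engine,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs=gr.Video(label="Final Merged video"),
    examples=["Apple"],
)
app.queue().launch(debug=True)  # queue() replaces enable_queue=True
```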