Spaces:
Runtime error
Runtime error
import gradio as gr | |
import moviepy.video.io.ImageSequenceClip | |
#image_folder= '/content/gdrive/My Drive/AI/sample_imgs/' | |
from PIL import Image | |
#import os, sys | |
from pydub import AudioSegment | |
# Import everything needed to edit video clips | |
from moviepy.editor import * | |
import numpy as np | |
import mutagen | |
from mutagen.mp3 import MP3 | |
import cv2 | |
#path = "/content/gdrive/My Drive/AI/sample_imgs/" | |
#dirs = os.listdir( path ) | |
def resize(img_list, size=(256, 256)):
    """Load every image in ``img_list`` and scale it to one common size.

    Args:
        img_list: Iterable of image sources accepted by ``PIL.Image.open``.
        size: Target ``(width, height)`` in pixels. Defaults to ``(256, 256)``.

    Returns:
        A list with one RGB ``numpy.ndarray`` per input image, in input
        order. An empty input yields an empty list.
    """
    print("** inside resize **")
    print(img_list)
    resize_img_list = []
    for item in img_list:
        # The context manager releases the source file handle once the
        # resized copy has been produced.
        with Image.open(item) as im:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name. Converting to RGB first keeps
            # every frame at the same array shape even if the inputs mix
            # modes (RGBA, palette, greyscale).
            imResize = im.convert("RGB").resize(size, Image.LANCZOS)
            resize_img_list.append(np.array(imResize))
    if resize_img_list:
        # Debug aid only; guarded so an empty input does not raise IndexError.
        print(type(resize_img_list[0]))
    return resize_img_list
#def resize(): | |
# for item in dirs: | |
# if os.path.isfile(path+item): | |
# im = Image.open(path+item) | |
# f, e = os.path.splitext(path+item) | |
# imResize = im.resize((256,256), Image.ANTIALIAS) | |
# imResize.save(f + ' resized.jpg', 'JPEG', quality=90) | |
#resize_img_list = resize(img_list) | |
#image_files = [os.path.join(image_folder,img) | |
# for img in resize_img_list | |
# if img.endswith(".jpg")] | |
#print(image_files) | |
#def images_to_video(fps, resize_img_list): | |
# clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps) | |
# return clip | |
#clip.write_videofile('/content/gdrive/My Drive/AI/my_vid_20apr.mp4') | |
#gradio.inputs.Audio(self, source="upload", type="numpy", label=None, optional=False) | |
def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a narrated slideshow from image frames and synthesized speech.

    The text is converted to speech via ``text2speech``, the result is read
    as FLAC and re-exported to ``audio.mp3``, the frames are written to
    ``my_vid_tmp.mp4`` at a frame rate chosen so the slideshow spans the
    narration, and the MP3 is attached as the video's audio track. Both
    files are created in the current working directory.

    Args:
        entities_num: Number of frames to spread over the narration.
        resize_img_list: Non-empty list of equally sized image arrays.
        text_input: Text to be spoken as the narration.

    Returns:
        The merged moviepy clip (video frames plus the narration audio).

    Raises:
        ValueError: If there are no frames, ``entities_num`` is not
            positive, or the synthesized audio has no measurable duration.
    """
    print("** inside merge aud vid **")
    print(type(resize_img_list))
    if not resize_img_list or entities_num <= 0:
        # Fail with a clear message instead of an IndexError /
        # ZeroDivisionError further down.
        raise ValueError("merge_audio_video needs at least one image frame")
    print(type(resize_img_list[0]))

    # Convert text to speech using the model loaded from the HF hub.
    speech = text2speech(text_input)
    print('type of speech : ', type(speech))
    print(speech)
    wav_audio = AudioSegment.from_file(speech, "flac")
    # Convert flac to mp3 audio format.
    print('flac audio read', type(wav_audio))
    wav_audio.export("audio.mp3", format="mp3")
    print('flac audio converted to mp3 audio')
    print('now getting duration of this mp3 audio')

    # Keep the duration as a float: truncating to int made any narration
    # shorter than one second divide by zero, and made the video length
    # drift away from the audio length for everything else.
    audio_length = MP3("audio.mp3").info.length
    if audio_length <= 0:
        raise ValueError("synthesized audio has zero duration")

    # Frame rate such that all frames together last as long as the audio.
    fps = round(entities_num / audio_length, 5)
    if fps <= 0:
        raise ValueError("audio is too long for the number of frames")
    print('fps is: ', fps)

    # String the list of images into a video and write it to disk.
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created from images')

    # Load the video and audio files back and combine them.
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('loading video-clip audio')
    audioclip = AudioFileClip('audio.mp3')
    print('loading mp3-format audio')
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged')

    # Report the duration and frame rate of the merged clip.
    print('Getting size and frame count of merged video file')
    print('duration is:', mergedclip.duration)
    print('frame count :', mergedclip.fps)
    return mergedclip
# Text-to-speech interface, loaded once when the module is imported.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")


def text2speech(text):
    """Run ``text`` through the module-level ``fastspeech`` interface.

    Prints the interface and its result for debugging, then returns
    whatever the interface call produced.
    """
    print('inside testtospeech')
    for detail in (type(fastspeech), fastspeech):
        print(detail)
    synthesized = fastspeech(text)
    for detail in (type(synthesized), synthesized):
        print(detail)
    return synthesized
def engine(text_input):
    """Turn ``text_input`` into a narrated slideshow video.

    Named entities are extracted from the text, one image is generated per
    entity, the images are resized to a common size, and the result is
    merged with synthesized speech by ``merge_audio_video``.

    Args:
        text_input: The text to illustrate and read aloud.

    Returns:
        The path of the written video file, ``'mergedvideo.mp4'``.

    Raises:
        ValueError: If no named entity is found, since there would be
            nothing to build frames from.
    """
    print(" ** Inside Enngine **")

    # Extract entities from text; items containing None are dropped.
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = [tupl for tupl in ner(text_input) if None not in tupl]
    entities_num = len(entities)
    if not entities:
        # Without entities there are no images, and the later steps would
        # fail with an unhelpful IndexError / ZeroDivisionError.
        raise ValueError("No named entities found in the input text")

    # Load the image-generation interface once rather than once per entity.
    image_generator = gr.Interface.load("spaces/multimodalart/latentdiffusion")

    # Generate one image per entity.
    # Arguments are presumably (prompt, steps, width, height, images, scale).
    img_list = [
        image_generator(ent[0], '50', '256', '256', '1', 10)[0]
        for ent in entities
    ]
    print('img_list size:', len(img_list))

    # Resize all images produced to the same size.
    resize_img_list = resize(img_list)
    print('back from resize')

    # Merge the frames with the synthesized narration.
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('Back in engine')
    print(' merged clip type :', type(mergedclip))
    print('Writing the merged video clip to a file')
    # write_videofile is the current name of the deprecated to_videofile.
    mergedclip.write_videofile('mergedvideo.mp4')
    print('mergedvideo.mp4 created')
    return 'mergedvideo.mp4'
#image = gr.outputs.Image(type="pil", label="output image") | |
# Gradio front end: one text box in, the merged video out. The interface is
# launched immediately, so ``app`` holds whatever ``launch`` returns.
app = gr.Interface(
    engine,
    gr.inputs.Textbox(lines=5, label="Input Text"),
    # 'playable_video' is the name of a Gradio output shortcut, not a video
    # format; the format that shortcut stands for is 'mp4'.
    gr.outputs.Video(type='mp4', label='Final Merged video'),
    examples=['Apple'],
    description="Takes a text as input and reads it out to you.",
    # A longer sample input, kept for reference:
    # examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones."]
).launch(enable_queue=True, debug=True)
#get_audio = gr.Button("generate audio") | |
#get_audio.click(text2speech, inputs=text, outputs=speech) | |
#def greet(name): | |
# return "Hello " + name + "!!" | |
#iface = gr.Interface(fn=greet, inputs="text", outputs="text") | |
#iface.launch() |