import gradio as gr
import numpy as np
import moviepy.video.io.ImageSequenceClip
from PIL import Image
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
def resize(img_list):
    # Resize every generated image to 256x256 and convert it to a numpy array,
    # since moviepy's ImageSequenceClip expects file paths or numpy frames.
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256, 256), Image.LANCZOS)  # LANCZOS is the non-deprecated equivalent of ANTIALIAS
        resize_img_list.append(np.array(imResize))
    return resize_img_list
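# Example (hypothetical file names): resize(["entity_0.png", "entity_1.png"])
# would return a list of 256x256 numpy arrays ready for ImageSequenceClip.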
def images_to_video(fps, resize_img_list):
    # Build a silent video clip from the list of numpy frames at the given fps
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    return clip
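# Example (commented out, hypothetical frames/path): the silent clip could also be
# written straight to disk before adding audio:
#   images_to_video(1.0, frames).write_videofile("my_video.mp4")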
def merge_audio_video(speech, clip):
    # Convert the FLAC speech output to MP3 so moviepy/ffmpeg can read it reliably
    wav_audio = AudioSegment.from_file(speech, "flac")
    wav_audio.export("audio.mp3", format="mp3")
    # Load the exported audio and attach it to the image-sequence clip
    audioclip = AudioFileClip("audio.mp3")
    videoclip = clip.set_audio(audioclip)
    # Write the merged result to disk and return its path for the Gradio Video output
    videoclip.write_videofile("mergedvideo.mp4")
    return "mergedvideo.mp4"
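# The returned file path is what the Gradio Video output component serves to the user.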
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
def text2speech(text):
return fastspeech(text)
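# text2speech returns a path to the synthesized audio file, which merge_audio_video
# reads back as FLAC and re-encodes to MP3.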
def engine(text_input):
    # Extract named entities from the input text
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # Generate an image for each entity using multimodalart's latent-diffusion Space
    img_list = []
    for ent in entities:
        img = gr.Interface.load("spaces/multimodalart/latentdiffusion")(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)

    # Resize all generated images to the same size
    resize_img_list = resize(img_list)

    # Convert the input text to speech with Facebook's FastSpeech2 model from the HF Hub
    speech = text2speech(text_input)

    # Pick frames per second so the entity images spread across the narration
    # (audio length assumed to be roughly 19 seconds)
    fps = entities_num / 19

    # Turn the image sequence into a video clip
    clip = images_to_video(fps, resize_img_list)

    # Merge the generated video with the speech audio
    merged_file = merge_audio_video(speech, clip)
    return merged_file
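# Quick local sanity check (hypothetical input, kept commented out so only the UI triggers it):
#   print(engine("George visited the Empire State Building."))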
app = gr.Interface(engine,
                   gr.inputs.Textbox(lines=5, label="Input Text"),
                   gr.outputs.Video(type=None, label="Output Video"),
                   examples=['Apple'],
                   description="Takes text as input, extracts the named entities, generates an image for each one, converts the text to speech, and merges everything into a short narrated video."
                   ).launch(debug=True)  # enable_queue=True can be passed to launch() to queue long-running requests