Spaces:
Runtime error
Runtime error
File size: 8,277 Bytes
1ebdc35 8d3fb0e 43bfed9 5a52517 7838680 fa8bc0c 8d3fb0e aa64400 8d3fb0e 43bfed9 8d3fb0e 82ea97e 8d3fb0e 7838680 8d3fb0e bbff8a4 aa64400 43bfed9 92e8e59 bbff8a4 d2a9eb1 8d3fb0e bbff8a4 5a52517 8d3fb0e 5a52517 92e8e59 7838680 92e8e59 8d3fb0e 92e8e59 82ea97e 92e8e59 8d3fb0e 92e8e59 8d3fb0e 82ea97e 5a52517 8d3fb0e fa8bc0c a4f9546 fa8bc0c 82ea97e 8d3fb0e 1ebdc35 92e8e59 7473aec d2a9eb1 1ebdc35 530da97 8d3fb0e 66c69b6 1ebdc35 8d3fb0e 0e26b77 d5c7b7a 92e8e59 8d3fb0e d5c7b7a bbff8a4 5a52517 92e8e59 5a52517 d5c7b7a 92e8e59 8d3fb0e d5c7b7a 82ea97e 8d3fb0e d5c7b7a bbff8a4 4f896b1 defe9d7 9ba3c73 defe9d7 8d3fb0e 1ebdc35 856b53c 66c69b6 856b53c 321912e dc6aef0 1ebdc35 8d3fb0e 1f2fe63 1ebdc35 df1b15b f99bb9b e23531d 8d3fb0e 1ebdc35 c0915d4 df1b15b 04086e6 1ebdc35 c0e7ea5 1ebdc35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
import gradio as gr
import moviepy.video.io.ImageSequenceClip
#image_folder= '/content/gdrive/My Drive/AI/sample_imgs/'
from PIL import Image
#import os, sys
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
import numpy as np
import mutagen
from mutagen.mp3 import MP3
import cv2
#path = "/content/gdrive/My Drive/AI/sample_imgs/"
#dirs = os.listdir( path )
def resize(img_list):
    """Load each image and scale it to 256x256 pixels.

    Args:
        img_list: Iterable of image sources accepted by ``PIL.Image.open``
            (presumably file paths returned by the image-generation Space;
            confirm against the caller).

    Returns:
        A list of NumPy arrays, one per input image, in input order. An
        empty input yields an empty list.
    """
    print("** inside resize **")
    print(img_list)
    resize_img_list = []
    for item in img_list:
        # Use a context manager so the underlying file handle is released
        # as soon as the pixel data has been copied into the array.
        with Image.open(item) as im:
            # LANCZOS is the filter the old ANTIALIAS alias pointed to;
            # ANTIALIAS itself no longer exists in current Pillow releases.
            resized = im.resize((256, 256), Image.LANCZOS)
            resize_img_list.append(np.array(resized))
    # Guard the debug print: indexing an empty list would raise IndexError.
    if resize_img_list:
        print(type(resize_img_list[0]))
    return resize_img_list
#def resize():
# for item in dirs:
# if os.path.isfile(path+item):
# im = Image.open(path+item)
# f, e = os.path.splitext(path+item)
# imResize = im.resize((256,256), Image.ANTIALIAS)
# imResize.save(f + ' resized.jpg', 'JPEG', quality=90)
#resize_img_list = resize(img_list)
#image_files = [os.path.join(image_folder,img)
# for img in resize_img_list
# if img.endswith(".jpg")]
#print(image_files)
#def images_to_video(fps, resize_img_list):
# clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
# return clip
#clip.write_videofile('/content/gdrive/My Drive/AI/my_vid_20apr.mp4')
#gradio.inputs.Audio(self, source="upload", type="numpy", label=None, optional=False)
def merge_audio_video(entities_num, resize_img_list, text_input):
    """Build a slideshow video from images and attach synthesized narration.

    The text is converted to speech, the speech is exported as ``audio.mp3``,
    the images are written to ``my_vid_tmp.mp4`` at a frame rate chosen so
    that the slideshow spans the narration, and the audio track is then set
    on the re-loaded video.

    Args:
        entities_num: Number of entities found in the text; used as the
            frame count when computing the frame rate.
        resize_img_list: Non-empty list of image arrays (the caller passes
            256x256 arrays).
        text_input: Text to be narrated.

    Returns:
        The moviepy clip with the audio attached. It is not written to disk
        here; the underlying video/audio readers stay open for the caller.

    Raises:
        ValueError: If there are no images/entities, or the generated audio
            has no measurable duration.
    """
    print("** inside merge aud vid **")
    # Fail early with a clear message instead of an IndexError below or a
    # zero frame rate further down.
    if not resize_img_list or entities_num <= 0:
        raise ValueError("at least one image/entity is required to build a video")
    print(type(resize_img_list))
    print(type(resize_img_list[0]))

    # Convert text to speech using the model loaded from the HF hub.
    speech = text2speech(text_input)
    print('type of speech : ', type(speech))
    print(speech)
    # NOTE(review): assumes `speech` is a path (or file object) to FLAC audio.
    flac_audio = AudioSegment.from_file(speech, "flac")
    print('flac audio read', type(flac_audio))

    # Convert FLAC to MP3.
    flac_audio.export("audio.mp3", format="mp3")
    print('flac audio converted to mp3 audio')
    print('now getting duration of this mp3 audio')

    # Keep the fractional duration: truncating to int turns any narration
    # shorter than one second into 0 and causes a division by zero, and it
    # also makes the slideshow length drift from the audio length.
    audio_length = MP3("audio.mp3").info.length
    if audio_length <= 0:
        raise ValueError("generated audio has zero duration")

    # One image per entity, spread evenly across the narration.
    fps = entities_num / audio_length
    print('fps is: ', fps)

    # String the images into a video and write it to disk.
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created from images')

    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('loading video-clip audio')
    audioclip = AudioFileClip('audio.mp3')
    print('loading mp3-format audio')

    # Attach the narration to the slideshow.
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged')

    print('Getting size and frame count of merged video file')
    duration = mergedclip.duration
    merged_fps = mergedclip.fps  # this is the frame rate, not a frame count
    print('duration is:', duration)
    print('frame count :', merged_fps)
    return mergedclip
# Text-to-speech interface, loaded once when the module is imported;
# text2speech() calls it with the text to synthesize.
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
def text2speech(text):
    """Synthesize speech for ``text`` with the module-level ``fastspeech`` model.

    Prints debugging information about the model and its output, then
    returns whatever the model call produced.
    """
    print('inside testtospeech')
    print(type(fastspeech))
    print(fastspeech)
    synthesized = fastspeech(text)
    # Log the result's type first, then the result itself.
    for shown in (type(synthesized), synthesized):
        print(shown)
    return synthesized
def engine(text_input):
    """Turn text into a narrated slideshow video and return its file path.

    Steps: extract named entities from the text, generate one image per
    entity, resize the images, merge them with synthesized narration, and
    write the result to ``mergedvideo.mp4``.

    Args:
        text_input: The text entered by the user.

    Returns:
        The string ``'mergedvideo.mp4'``, the path of the written video.

    Raises:
        ValueError: If no entities are found in the text, since there would
            be no images to build a video from.
    """
    print(" ** Inside Enngine **")

    # Extract entities from the text.
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    # Keep only tuples with no None member (presumably the non-entity spans
    # carry a None label; verify against the model's output format).
    entities = [tupl for tupl in ner(text_input) if None not in tupl]
    entities_num = len(entities)
    if not entities:
        raise ValueError("no named entities were found in the input text")

    # Load the image-generation Space once; it does not depend on the entity,
    # so reloading it on every loop iteration is wasted work.
    image_generator = gr.Interface.load("spaces/multimodalart/latentdiffusion")
    # Arguments after the prompt: steps, width, height, images, scale.
    # The first element of the result is used as the image for the entity.
    img_list = [
        image_generator(ent[0], '50', '256', '256', '1', 10)[0]
        for ent in entities
    ]
    print('img_list size:', len(img_list))

    # Resize all generated images to the same size.
    resize_img_list = resize(img_list)
    print('back from resize')

    # Merge the images with the narration for the text.
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('Back in engine')
    print(' merged clip type :', type(mergedclip))
    print('Writing the merged video clip to a file')
    # write_videofile is the current name; to_videofile is a deprecated alias.
    mergedclip.write_videofile('mergedvideo.mp4')
    print('mergedvideo.mp4 created')
    return 'mergedvideo.mp4'
#image = gr.outputs.Image(type="pil", label="output image")
# Web UI: one multi-line text box in, one playable video out.
# A longer sample input that exercises many entity types:
# "On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones."
demo = gr.Interface(
    fn=engine,
    inputs=gr.inputs.Textbox(lines=5, label="Input Text"),
    outputs=gr.outputs.Video(type='playable_video', label='Final Merged video'),
    examples=['Apple'],
    description="Takes a text as input and reads it out to you.",
)
# `app` receives whatever launch() returns, as in the chained form.
app = demo.launch(enable_queue=True, debug=True)
#get_audio = gr.Button("generate audio")
#get_audio.click(text2speech, inputs=text, outputs=speech)
#def greet(name):
# return "Hello " + name + "!!"
#iface = gr.Interface(fn=greet, inputs="text", outputs="text")
#iface.launch() |