import gradio as gr
import moviepy.video.io.ImageSequenceClip
import numpy as np
from PIL import Image
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
from mutagen.mp3 import MP3
def resize(img_list):
    """Resize every image in img_list to 256x256 and return them as numpy arrays."""
    print("** inside resize **")
    print('Entity-Images generated by multimodal interface are:', img_list)
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        # LANCZOS is the high-quality downsampling filter (formerly exposed as
        # Image.ANTIALIAS, a constant that newer Pillow releases have removed)
        im_resized = im.resize((256, 256), Image.LANCZOS)
        resize_img_list.append(np.array(im_resized))
    print('Type of elements in the image list:', type(resize_img_list[0]))
    return resize_img_list
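# Note (an assumption about shapes, not stated in the original code): moviepy's
# ImageSequenceClip expects a list of H x W x 3 uint8 arrays, which is what
# np.array(PIL.Image) produces for RGB images, e.g.
#   resize(['a.png', 'b.png'])  ->  [array of shape (256, 256, 3), ...]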
def merge_audio_video(entities_num, resize_img_list, text_input):
    print("** inside merge_audio_video **")
    print('Type of image list variable: ', type(resize_img_list))
    print('Type of elements in the image list: ', type(resize_img_list[0]))

    # Convert text to speech using Facebook's FastSpeech2 model from the HF hub
    speech = text2speech(text_input)
    print('Back in merge_audio_video')
    print('Type of speech variable: ', type(speech))
    print('Type of audio file: ', speech)

    # Convert the FLAC speech output to MP3
    wav_audio = AudioSegment.from_file(speech, "flac")
    print('Converting flac format to mp3 using AudioSegment object:', type(wav_audio))
    wav_audio.export("audio.mp3", format="mp3")
    print('flac audio converted to mp3 audio')
    print('now getting duration of this mp3 audio')

    # Get the audio clip's duration in whole seconds
    audio_length = int(MP3("audio.mp3").info.length)
    print('Audio length is:', audio_length)
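    # Hedged guard (an addition, not part of the original app): int() truncation
    # makes audio_length zero for sub-second clips, which would divide by zero
    # in the FPS calculation below, so floor it at one second.
    if audio_length == 0:
        audio_length = 1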
    # Derive frames-per-second from the number of entity images and the audio
    # length, so the image sequence spans the full narration
    fps = entities_num / audio_length
    fps = float(format(fps, '.5f'))
    print('Based on number of entities/images and audio length, FPS is set as:', fps)

    # String the list of images into a video and write it to disk
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created successfully from images')

    # Load the video file
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('loading video clip')

    # Load the audio file
    audioclip = AudioFileClip('audio.mp3')
    print('loading mp3-format audio')

    # Add the audio track to the video clip
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged successfully')

    # Report duration and frame rate of the merged video file
    print('Getting duration and frame rate of merged video file')
    duration = mergedclip.duration
    fps_out = mergedclip.fps
    print('duration is:', duration)
    print('fps:', fps_out)
    return mergedclip
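# Note (hedged): moviepy clips keep ffmpeg reader processes open; in a
# long-running Space the caller could release them after writing the result,
# e.g. by calling mergedclip.close() once engine() has written mergedvideo.mp4.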
# Load the FastSpeech2 text-to-speech model as a callable Gradio interface
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
def text2speech(text):
    print('** inside text2speech **')
    print('Loading the model through:', type(fastspeech))
    print(fastspeech)
    speech = fastspeech(text)
    print('Type of variable in which file is stored:', type(speech))
    print('Type of audio file generated:', speech)
    return speech
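# Usage note (hedged, inferred from how speech is consumed above): calling the
# loaded interface returns a path to a FLAC file on disk, e.g.
#   path = text2speech("Hello world")   # -> something like '/tmp/tmpxyz.flac'
# which is why merge_audio_video() reads it with AudioSegment.from_file(path, "flac").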
def engine(text_input):
    print(" ** Inside Engine **")

    # Extract entities from the input text using a Flair NER model
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # Generate an image for each entity identified above using multimodalart's Space
    img_list = []
    for ent in entities:
        img = gr.Interface.load("spaces/multimodalart/latentdiffusion")(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)
    print('img_list size:', len(img_list))

    # Resize all generated images to the same size
    resize_img_list = resize(img_list)
    print('back from resize into engine')

    # Merge the video and audio created above
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('\n Back in engine')
    print('Merged clip type:', type(mergedclip))
    print('Writing the merged video clip to a video file')
    mergedclip.write_videofile('mergedvideo.mp4')
    print('mergedvideo.mp4 created')
    print('################################ Single Run Completed ##############################')
    return 'mergedvideo.mp4'
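# Hedged sketch (a hypothetical helper, not wired into engine() above): the app
# depends on the external multimodalart Space, so a thin retry wrapper like this
# could smooth over transient failures when that Space is busy or unavailable.
def generate_image_with_retry(prompt, retries=2):
    last_err = None
    for _ in range(retries + 1):
        try:
            return gr.Interface.load("spaces/multimodalart/latentdiffusion")(prompt, '50', '256', '256', '1', 10)[0]
        except Exception as err:  # the remote Space may be down or queued
            last_err = err
    raise last_err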
app = gr.Interface(
    engine,
    gr.inputs.Textbox(lines=5, label="Input Text"),
    gr.outputs.Video(type=None, label='Final Merged video'),
    description="<div>First, the demo generates speech from the input text using facebook's fastspeech2-en-ljspeech from the <a href='https://huggingface.co/facebook/fastspeech2-en-ljspeech' target='_blank'>HF hub</a>.<br>Then it extracts the entities in the input text using a Flair NER model from the <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>HF Hub</a>.<br>Next, it generates an image for every entity separately using the <a href='https://huggingface.co/spaces/multimodalart/latentdiffusion' target='_blank'>Multimodalart Space</a>,<br>creates a video by stringing all the entity images together,<br>and lastly fuses the AI-generated audio and video into a coherent movie for you to watch.<br><br>A fun little app that lets you turn your text into video (well, in some ways at least :) ). The more entities in your text, the longer the build takes and the more fun the result is to watch.<br>Please expect a build time of around 10-20 seconds per entity. For instance, the third and largest example has 13 entities, as per the NER model used here.</div>",
    examples=["On April 17th Sunday George celebrated Easter. He is staying at the Empire State Building with his parents.", "George is a citizen of Canada and speaks English and French fluently. His role model is the former president Obama.", "On April 17th Sunday George celebrated Easter. He is staying at the Empire State Building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollars from his mother to visit Disney World and to buy a new iPhone mobile. George likes watching Game of Thrones.", "April is the month of Easter weekend. Visit places like the Statue of Liberty with friends. Take at least 200 dollars in cash with you. Use an Android phone to find places in New York City."],
    title="Generate Audio & Video from Text",
    article="<br><div>For best results, enter text containing the entity types listed on the model card for <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>flair/ner-english-ontonotes-large</a>. Helpful entity types include date values, event names, building names, languages, locations, money values, organization names, famous people's names, products, and so on.<br>Also note that this Space loads the awesome Multimodalart Space as a Gradio interface, so if the latter Space is down, this one goes down too.</div><br><h4 style='font-size: 110%;margin-top:1em'>Who owns the videos produced by this demo?</h4><div><i>(Borrowing this from the multimodalart Space)</i> Definitely not me! Probably you do. I say probably because the copyright discussion about AI-generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case, it is either yours or in the public domain.</div>"
).launch(enable_queue=True, debug=True)