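# Text-to-Video Gradio demo.
# Pipeline: extract entities from the input text (Flair NER) -> generate one image per
# entity (multimodalart latent-diffusion Space) -> synthesize speech for the text
# (facebook/fastspeech2-en-ljspeech) -> string the images into a video and attach the audio.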
import gradio as gr
import moviepy.video.io.ImageSequenceClip
from PIL import Image
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
import numpy as np
import mutagen
from mutagen.mp3 import MP3
import cv2


def resize(img_list):
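    # Resize every generated image to 256x256 and return them as numpy arrays,
    # which is the form ImageSequenceClip expects.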
    print("** inside resize **")
    print(img_list)
    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256,256), Image.LANCZOS)  #LANCZOS replaces ANTIALIAS, which was removed in newer Pillow versions
        resize_img_list.append(np.array(imResize))
    print(type(resize_img_list[0]))
    return resize_img_list
                

def merge_audio_video(entities_num, resize_img_list, text_input):
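    # Synthesize speech for the full text, build a slideshow video from the entity images
    # at an fps chosen so the images span the audio duration, then attach the audio track.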
    print("** inside merge aud vid **")
    print(type(resize_img_list))
    print(type(resize_img_list[0]))
        
    
    #Convert text to speech using facebook's latest model from HF hub   
    speech = text2speech(text_input)
    print('type of speech : ',type(speech))
    print(speech)
    wav_audio = AudioSegment.from_file(speech, "flac")  #("/content/gdrive/My Drive/AI/audio1.flac", "flac")
    #convert flac to mp3 audio format 
    print('flac audio read', type(wav_audio))
    wav_audio.export("audio.mp3", format="mp3")  #("/content/gdrive/My Drive/AI/audio1.mp3", format="mp3")
    print('flac audio converted to mp3 audio' )
    print('now getting duration of this mp3 audio' )
    #getting audio clip's duration
    audio_length = int(MP3("audio.mp3").info.length)

    #Calculate the desired frame per second based on given audio length and entities identified
    fps= entities_num / audio_length  #length of audio file   
    fps = float(format(fps, '.5f'))
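    # i.e. each entity image stays on screen for roughly audio_length / entities_num seconds,
    # so the slideshow spans the whole narration (moviepy accepts fractional fps values).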
    print('fps is: ',fps)
    
    #String a list of images into a video and write to memory
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created from images') 
        
    # loading video file
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4') #("/content/gdrive/My Drive/AI/my_video1.mp4")
    print('loading video-clip audio')
       
    # loading audio file
    audioclip = AudioFileClip('audio.mp3') #.subclip(0, 15)
    print('loading mp3-format audio')  
    # adding audio to the video clip
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged')  
    
    #Getting duration and fps of merged video file
    print('Getting duration and fps of merged video file')
    duration = mergedclip.duration
    fps_out = mergedclip.fps
    print('duration is:',duration)
    print('fps is:',fps_out)
    
    return mergedclip
    

fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
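# The loaded interface is a callable: pass it text and it returns the path to a FLAC audio
# file (read back with pydub as FLAC in merge_audio_video above).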

def text2speech(text):
    print('inside text2speech')
    print(type(fastspeech))
    print(fastspeech)
    speech = fastspeech(text)
    print(type(speech))
    print(speech)
    return speech
    
def engine(text_input):
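    # Full pipeline: extract entities, generate one image per entity, resize the images,
    # then merge them with the synthesized narration into a single video file.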
    print(" ** Inside Enngine **")
    #Extract entities from text
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = ner(text_input)
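    # The NER interface returns (text, label) pairs; keep only pairs with a non-None label,
    # i.e. the actual entities.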
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)
    
    #Generate images using multimodelart's space for each entity identified above
    img_list = []
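    # The positional args below appear to map to the Space's prompt, steps, width, height,
    # image-count and diversity inputs; the first element of the result is the generated image.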
    for ent in entities:
        img = gr.Interface.load("spaces/multimodalart/latentdiffusion")(ent[0],'50','256','256','1',10)[0] 
        img_list.append(img)
    
    print('img_list size:',len(img_list))
    #Resizing all images produced to same size
    resize_img_list = resize(img_list)
    print('back from resize')
     
 
    #Merge video and audio created above
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('Back in engine')
    print(' merged clip type :',type(mergedclip))
    print('Writing the merged video clip to a file')
    mergedclip.write_videofile('mergedvideo.mp4')  #write_videofile is the current moviepy name (to_videofile is a deprecated alias)
    print('mergedvideo.mp4 created')    
    
    return 'mergedvideo.mp4' 
    
app = gr.Interface(engine, 
                   gr.inputs.Textbox(lines=5, label="Input Text"),
                   gr.outputs.Video(type=None, label='Final Merged video'),
                   description="Firstly, generates speech from the input text using facebook's fastspeech2-en-ljspeech from the hub.<br>Then, takes the input text and extracts the entities in it using the Flair NER model from the <a href='https://huggingface.co/flair/ner-english-ontonotes-large' target='_blank'>HF Hub</a>. <br>Then, generates images using the <a href='https://huggingface.co/spaces/multimodalart/latentdiffusion' target='_blank'>Multimodalart Space</a> for every entity separately.<br>Creates a video by stringing all the entity images together. <br>Lastly, fuses the AI-generated audio and video together to create a coherent movie for you to watch. <br><br>A fun little app that lets you turn your text into video (well, in some ways at least :) ). The more entities in your text, the longer the output takes to build, and the more fun it is.",
                   examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile.  George likes watching Game of Thrones.", "April is the month of Easter weekend. Visit places like Statue of Liberty with friends. Take at least 200 dollars in cash with you. Use Android phone to find places in Newyork City."],
                   title="Generate Video from Text",
                   article="<div>For best results, make sure to enter a text that has entities listed on model card for flair/ner-english-ontonotes-large. Some examples of type of entities that will be helpful are - Date values, event names, building names, languages, locations, money value, organization names, famous people names, products and so on.</div><br><h4 style='font-size: 110%;margin-top:1em'>Who owns the videos produced by this demo?</h4><div>(Borrowing this from multimodalart spaces)Definetly not me! Probably you do. I say probably because the Copyright discussion about AI generated art is ongoing. So <a href='https://www.theverge.com/2022/2/21/22944335/us-copyright-office-reject-ai-generated-art-recent-entrance-to-paradise' target='_blank'>it may be the case that everything produced here falls automatically into the public domain</a>. But in any case it is either yours or is in the public domain.</div>"

                   ).launch(enable_queue=True, debug=True)