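# Text-to-Video Gradio app: extracts named entities from the input text (flair NER),
# generates an image for each entity (multimodalart latent-diffusion space),
# synthesizes speech from the text (facebook/fastspeech2-en-ljspeech),
# then strings the images into a video clip and merges it with the audio.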
import gradio as gr
import moviepy.video.io.ImageSequenceClip
from PIL import Image
from pydub import AudioSegment
# Import everything needed to edit video clips
from moviepy.editor import *
import numpy as np
import mutagen
from mutagen.mp3 import MP3
import cv2
def resize(img_list):
    print("** inside resize **")
    print(img_list)

    resize_img_list = []
    for item in img_list:
        im = Image.open(item)
        imResize = im.resize((256, 256), Image.ANTIALIAS)
        resize_img_list.append(np.array(imResize))
    print(type(resize_img_list[0]))
    return resize_img_list
def merge_audio_video(entities_num, resize_img_list, text_input):
    print("** inside merge aud vid **")
    print(type(resize_img_list))
    print(type(resize_img_list[0]))

    # Convert text to speech using Facebook's FastSpeech2 model from the HF hub
    speech = text2speech(text_input)
    print('type of speech : ', type(speech))
    print(speech)

    # Read the flac audio and convert it to mp3 format
    wav_audio = AudioSegment.from_file(speech, "flac")
    print('flac audio read', type(wav_audio))
    wav_audio.export("audio.mp3", format="mp3")
    print('flac audio converted to mp3 audio')
    print('now getting duration of this mp3 audio')

    # Get the audio clip's duration
    audio_length = int(MP3("audio.mp3").info.length)

    # Calculate the frames per second from the audio length and the number of entities identified
    fps = entities_num / audio_length
    fps = float(format(fps, '.5f'))
    print('fps is: ', fps)

    # String the list of images into a video and write it to disk
    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(resize_img_list, fps=fps)
    clip.write_videofile('my_vid_tmp.mp4')
    print('video clip created from images')

    # Load the video file
    print('Starting video and audio merge')
    videoclip = VideoFileClip('my_vid_tmp.mp4')
    print('video clip loaded')

    # Load the audio file
    audioclip = AudioFileClip('audio.mp3')  # .subclip(0, 15)
    print('mp3 audio loaded')

    # Add the audio to the video clip
    mergedclip = videoclip.set_audio(audioclip)
    print('video and audio merged')

    # Get the duration and fps of the merged video file
    print('Getting duration and fps of merged video file')
    duration = mergedclip.duration
    frame_count = mergedclip.fps
    print('duration is:', duration)
    print('fps is:', frame_count)

    return mergedclip
fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")
def text2speech(text):
    print('inside text2speech')
    print(type(fastspeech))
    print(fastspeech)
    speech = fastspeech(text)
    print(type(speech))
    print(speech)
    return speech
def engine(text_input):
    print("** Inside Engine **")

    # Extract entities from the input text
    ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large")
    entities = ner(text_input)
    entities = [tupl for tupl in entities if None not in tupl]
    entities_num = len(entities)

    # Generate an image for each entity identified above using multimodalart's latent-diffusion space
    img_list = []
    for ent in entities:
        img = gr.Interface.load("spaces/multimodalart/latentdiffusion")(ent[0], '50', '256', '256', '1', 10)[0]
        img_list.append(img)
    print('img_list size:', len(img_list))

    # Resize all generated images to the same size
    resize_img_list = resize(img_list)
    print('back from resize')

    # Merge the video and audio created above
    mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
    print('Back in engine')
    print('merged clip type :', type(mergedclip))

    # Write the merged video clip to a file
    print('Writing the merged video clip to a file')
    mergedclip.to_videofile('mergedvideo.mp4')
    print('mergedvideo.mp4 created')

    return 'mergedvideo.mp4'
app = gr.Interface(engine,
                   gr.inputs.Textbox(lines=5, label="Input Text"),
                   gr.outputs.Video(type=None, label='Final Merged video'),
                   description="Takes text as input, extracts the entities in it, and generates an image for each entity using the multimodalart latent-diffusion space. It also generates speech from the input text using facebook's fastspeech2-en-ljspeech model from the hub, strings the entity images together into a video, and fuses the AI-generated audio and video into a coherent movie for you to watch. A fun little app that lets you turn your text into video (well, in some ways at least :) ).",
                   examples=["On April 17th Sunday George celebrated Easter. He is staying at Empire State building with his parents. He is a citizen of Canada and speaks English and French fluently. His role model is former president Obama. He got 1000 dollar from his mother to visit Disney World and to buy new iPhone mobile. George likes watching Game of Thrones.", "Apple"]
                   ).launch(enable_queue=True, debug=True)