import gradio as gr
import imageio_ffmpeg
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import math
import dlib
import tempfile
import requests
import os
import bz2
import io
import json
import cv2
from transformers import pipeline
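# dlib face detector plus the 68-point landmark predictor; the predictor's
# .dat file is fetched from dlib.net on first run if it is not present.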
detector = dlib.get_frontal_face_detector()
try:
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
except RuntimeError:
    print("Downloading shape_predictor_68_face_landmarks.dat...")
    landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    landmarks_compressed = requests.get(landmarks_url).content
    landmarks_data = bz2.decompress(landmarks_compressed)
    with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
        f.write(landmarks_data)
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
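# Hugging Face Inference API endpoints: FLUX.1-schnell generates the image,
# and FastChat-T5 turns the user's prompt into animation parameters.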
API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
HF_TOKEN = os.getenv("HF_TOKEN")
LLM_API_URL = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
# Load the BLIP captioner once at startup instead of on every request
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
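# POST the prompt to the text-to-image endpoint and decode the returned image bytes.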
def query_hf_image_generation(prompt):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"inputs": prompt}
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return Image.open(io.BytesIO(response.content))
    else:
        raise Exception(f"Image generation failed: {response.content}")
def query_llm(prompt, image_description):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    system_prompt = "You are an expert in image-to-video creation. Given the image described below and the user's prompt, reply with only the motion type, intensity, text overlay, text color, and text start and end times, in JSON format."
    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
    # Generation options belong under "parameters" in the Inference API payload
    payload = {"inputs": prompt_template, "parameters": {"max_new_tokens": 200}}
    response = requests.post(LLM_API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()[0]['generated_text']
    else:
        raise Exception(f"LLM query failed: {response.content}")
def extract_motion_params(llm_output):
    try:
        start_index = llm_output.find('{')
        end_index = llm_output.rfind('}') + 1
        json_string = llm_output[start_index:end_index]
        return json.loads(json_string)
    except (ValueError, AttributeError):
        return {
            "motion_type": "none",
            "intensity": 0.25,
            "text_overlay": "",
            "text_color": "white",
            "start_time": 0,
            "end_time": 5
        }
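# Return the 68 dlib landmarks of the first detected face as an array of
# (x, y) points, or None when no face is found.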
def detect_face_landmarks(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    rects = detector(gray, 1)
    if len(rects) > 0:
        shape = predictor(gray, rects[0])
        return np.array([(shape.part(i).x, shape.part(i).y) for i in range(68)])
    else:
        return None
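# Per-frame color grading. Frames are RGB uint8 arrays (from PIL), so
# channel 0 is red and channel 2 is blue in the per-channel tweaks below.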
def apply_color_grading(frame, color_preset, intensity):
    if color_preset == "sepia":
        sepia_matrix = np.array([[0.393, 0.769, 0.189],
                                 [0.349, 0.686, 0.168],
                                 [0.272, 0.534, 0.131]])
        frame_float = frame.astype(np.float32) / 255.0
        sepia_effect = cv2.transform(frame_float, sepia_matrix)
        blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "vintage":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 2] *= (1 - intensity * 0.6)  # fade blue
        frame_float[:, :, 0] *= (1 + intensity * 0.3)  # warm up red
        grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
        grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "black_and_white":
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
    elif color_preset == "cold":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 2] *= (1 + intensity * 0.7)  # boost blue
        frame_float[:, :, 0] *= (1 - intensity * 0.2)  # cut red
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "warm":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 0] *= (1 + intensity * 0.7)  # boost red
        frame_float[:, :, 2] *= (1 - intensity * 0.2)  # cut blue
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
elif color_preset == "neon": | |
frame_float = frame.astype(np.float32) / 255.0 | |
lab = cv2.cvtColor(frame_float, cv2.COLOR_RGB2LAB) | |
l, a, b = cv2.split(lab) | |
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8)) | |
l = clahe.apply(l) | |
lab = cv2.merge((l, a, b)) | |
frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB) | |
frame_float[:, :, 0] *= (1 - intensity * 0.4) | |
frame_float[:, :, 1] *= (1 + intensity * 0.8) | |
frame_float[:, :, 2] *= (1 - intensity * 0.4) | |
return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8) | |
return frame | |
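# Darken the frame toward the corners with a smooth radial falloff.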
def apply_vignette(frame, intensity):
    height, width = frame.shape[:2]
    x = np.linspace(-1, 1, width)
    y = np.linspace(-1, 1, height)
    X, Y = np.meshgrid(x, y)
    radius = np.sqrt(X**2 + Y**2)
    vignette = np.clip(1 - intensity * radius**2, 0, 1)
    vignette = np.stack([vignette] * 3, axis=-1)
    frame_float = frame.astype(np.float32) / 255.0
    result = frame_float * vignette
    return (np.clip(result, 0, 1) * 255).astype(np.uint8)
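# Fake a bokeh overlay: random bright circles whose brightness pulses with t.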
def apply_bokeh(frame, intensity, t):
    frame_float = frame.astype(np.float32) / 255.0
    circles = []
    for _ in range(int(intensity * 30)):
        radius = np.random.randint(5, 30)
        x = np.random.randint(radius, frame.shape[1] - radius)
        y = np.random.randint(radius, frame.shape[0] - radius)
        color = frame_float[y, x]
        brightness = np.random.uniform(0.5, 1.0)
        circles.append((x, y, radius, color, brightness))
    bokeh_effect = np.zeros_like(frame_float)
    for x, y, radius, color, brightness in circles:
        y_grid, x_grid = np.ogrid[-y:frame.shape[0] - y, -x:frame.shape[1] - x]
        mask = x_grid * x_grid + y_grid * y_grid <= radius * radius
        bokeh_effect[mask] += np.array(color) * brightness * (0.5 + 0.5 * np.sin(t * 2 * math.pi))
    blended_frame = frame_float + intensity * bokeh_effect
    return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
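# Frame-by-frame renderer: applies a landmark-based or whole-image motion,
# then color grading, vignette, and the timed text overlay. t is normalized
# progress (0..1); start_time and end_time are in seconds.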
def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
    frames = []
    width, height = image.size
    landmarks = detect_face_landmarks(image)
    for i in range(int(duration * fps)):
        t = i / (duration * fps)
        frame = image.copy()
        if landmarks is not None:
            if motion_type == "head_nod":
                top_head = landmarks[27]
                bottom_head = landmarks[8]
                angle = math.sin(t * 2 * math.pi) * intensity * 8
                center_x = (top_head[0] + bottom_head[0]) // 2
                center_y = (top_head[1] + bottom_head[1]) // 2
                M = cv2.getRotationMatrix2D((float(center_x), float(center_y)), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)
            elif motion_type == "head_shake":
                top_head = landmarks[27]
                angle = math.sin(t * 3 * math.pi) * intensity * 6
                M = cv2.getRotationMatrix2D((float(top_head[0]), float(top_head[1])), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)
            elif motion_type == "eye_blink":
                blink_progress = abs(math.sin(t * 2 * math.pi))
                if blink_progress > 0.9:
                    # Draw closed-lid lines across the eye corners (landmarks 36-39 and 42-45)
                    draw = ImageDraw.Draw(frame)
                    draw.line([tuple(landmarks[36]), tuple(landmarks[39])], fill=text_color, width=2)
                    draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
            elif motion_type == "smile":
                mouth_left = landmarks[48]
                mouth_right = landmarks[54]
                smile_progress = intensity * t
                draw = ImageDraw.Draw(frame)
                curve_points = [
                    tuple(mouth_left),
                    (mouth_left[0] + (mouth_right[0] - mouth_left[0]) // 4, mouth_left[1] + int(20 * smile_progress)),
                    (mouth_left[0] + 3 * (mouth_right[0] - mouth_left[0]) // 4, mouth_right[1] + int(20 * smile_progress)),
                    tuple(mouth_right)
                ]
                draw.line(curve_points, fill=text_color, width=4)
if motion_type == "zoom": | |
scale = 1 + intensity * t | |
new_size = (int(width * scale), int(height * scale)) | |
resized_image = image.resize(new_size, Image.Resampling.LANCZOS) | |
x_offset = (new_size[0] - width) // 2 | |
y_offset = (new_size[1] - height) // 2 | |
frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height)) | |
elif motion_type == "pan": | |
x_offset = int(intensity * t * (width - width)) | |
y_offset = int(intensity * t * (height - height)) | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(image, (-x_offset, -y_offset)) | |
elif motion_type == "rotate": | |
angle = intensity * t * 360 | |
rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC) | |
x_offset = (rotated_image.width - width) // 2 | |
y_offset = (rotated_image.height - height) // 2 | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(rotated_image, (-x_offset, -y_offset)) | |
elif motion_type == "move_right": | |
x_offset = int(intensity * t * width) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (x_offset, 0)) | |
elif motion_type == "move_left": | |
x_offset = -int(intensity * t * width) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (x_offset, 0)) | |
elif motion_type == "move_up": | |
y_offset = -int(intensity * t * height) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (0, y_offset)) | |
elif motion_type == "move_down": | |
y_offset = int(intensity * t * height) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (0, y_offset)) | |
elif motion_type == "shake": | |
shake_intensity = intensity * 10 | |
x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5)) | |
y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3)) | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(image, (x_offset, y_offset)) | |
elif motion_type == "fade_in": | |
alpha = t | |
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha) | |
elif motion_type == "fade_out": | |
alpha = 1 - t | |
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha) | |
elif motion_type == "rain": | |
draw = ImageDraw.Draw(frame) | |
for _ in range(int(intensity * 5)): | |
x = np.random.randint(0, width) | |
y = np.random.randint(0, height) | |
length = np.random.randint(5, 15) | |
speed = intensity * 3 | |
y_end = y + length + i * speed | |
draw.line([(x, y), (x, y_end)], fill="lightblue", width=1) | |
elif motion_type == "bokeh": | |
frame_np = np.array(frame) | |
frame_np = apply_bokeh(frame_np, intensity, t) | |
frame = Image.fromarray(frame_np) | |
        frame_np = np.array(frame)
        if color_preset and color_preset != "none":
            frame_np = apply_color_grading(frame_np, color_preset, intensity)
        if vignette_intensity > 0:
            frame_np = apply_vignette(frame_np, vignette_intensity)
        frame = Image.fromarray(frame_np)
        # start_time/end_time are given in seconds, so compare elapsed seconds
        current_time = i / fps
        if text_overlay and start_time <= current_time <= end_time:
            draw = ImageDraw.Draw(frame)
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except IOError:
                font = ImageFont.load_default()
            # textbbox replaces draw.textsize, which was removed in Pillow 10
            bbox = draw.textbbox((0, 0), text_overlay, font=font)
            text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
            x = (width - text_width) // 2
            y = (height - text_height) // 2
            draw.text((x, y), text_overlay, font=font, fill=text_color)
        frames.append(np.array(frame))
    return frames
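# Encode the frame list to an H.264 MP4 with imageio-ffmpeg's generator-based writer.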
def create_video_from_frames(frames, duration=5, fps=30):
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        output_filename = tmpfile.name
    # imageio_ffmpeg expects size as (width, height); numpy arrays are (height, width)
    height, width = frames[0].shape[:2]
    writer = imageio_ffmpeg.write_frames(output_filename, (width, height), pix_fmt_out="yuv420p",
                                         fps=fps, codec="libx264", output_params=["-preset", "veryslow"])
    writer.send(None)  # prime the generator before sending frames
    for frame in frames:
        writer.send(frame)
    writer.close()
    return output_filename
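# End-to-end pipeline: generate an image, caption it, ask the LLM for motion
# parameters, animate the image, and encode the result as a video.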
def generate_and_animate(prompt):
    try:
        image = query_hf_image_generation(prompt)
        image_description = captioner(image)[0]['generated_text']
        llm_response = query_llm(prompt, image_description)
        motion_params = extract_motion_params(llm_response)
        frames = apply_advanced_motion(
            image,
            motion_params.get("motion_type", "none"),
            motion_params.get("intensity", 0.25),
            duration=5,
            fps=30,
            text_overlay=motion_params.get("text_overlay", ""),
            text_color=motion_params.get("text_color", "white"),
            font_size=50,
            start_time=motion_params.get("start_time", 0),
            end_time=motion_params.get("end_time", 5),
            color_preset=motion_params.get("color_preset", None),
            vignette_intensity=motion_params.get("vignette_intensity", 0)
        )
        video_file = create_video_from_frames(frames)
        return video_file, image
    except Exception as e:
        # Surface errors in the UI instead of returning a string as a video path
        raise gr.Error(str(e))
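# Values the LLM is expected to choose from; the UI itself only takes a prompt.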
motion_types = [
    "zoom", "pan", "rotate", "move_right", "move_left", "move_up", "move_down",
    "shake", "fade_in", "fade_out", "head_nod", "head_shake", "eye_blink", "smile", "rain", "bokeh", "none"
]
text_colors = ["white", "black", "red", "green", "blue", "yellow"]
color_presets = ["sepia", "vintage", "black_and_white", "cold", "warm", "neon", "none"]
iface = gr.Interface(
    fn=generate_and_animate,
    inputs=[
        gr.Textbox(label="Prompt"),
    ],
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Image(label="Generated Image")
    ],
    title="AI Video Generator",
    description="Enter a prompt to generate an image and animate it. Uses FLUX.1 schnell, an LLM, and advanced video processing techniques."
)
if __name__ == "__main__":
    iface.launch(share=True, debug=True)