import gradio as gr
import imageio_ffmpeg
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import math
import dlib
import tempfile
import requests
import os
import bz2
import io
import json
import cv2
from transformers import pipeline
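# dlib face detector plus the 68-point landmark predictor; the predictor's
# .dat file is fetched from dlib.net on first run if it is not present.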
detector = dlib.get_frontal_face_detector()
try:
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
except RuntimeError:
    print("Downloading shape_predictor_68_face_landmarks.dat...")
    landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    landmarks_compressed = requests.get(landmarks_url).content
    landmarks_data = bz2.decompress(landmarks_compressed)
    with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
        f.write(landmarks_data)
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
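# Hugging Face Inference API endpoints: FLUX.1-schnell generates the image,
# and FastChat-T5 turns the user's prompt into animation parameters.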
API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
HF_TOKEN = os.getenv("HF_TOKEN")
LLM_API_URL = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
# Load the BLIP captioner once at startup instead of on every request
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
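# POST the prompt to the text-to-image endpoint and decode the returned image bytes.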
def query_hf_image_generation(prompt):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"inputs": prompt}
    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return Image.open(io.BytesIO(response.content))
    else:
        raise Exception(f"Image generation failed: {response.content}")
def query_llm(prompt, image_description):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    system_prompt = "You are an expert in image-to-video creation. Given the image described below and the user's prompt, reply with only the motion type, intensity, text overlay, text color, and text start and end times, in JSON format."
    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
    # Generation options belong under "parameters" in the Inference API payload
    payload = {"inputs": prompt_template, "parameters": {"max_new_tokens": 200}}
    response = requests.post(LLM_API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()[0]['generated_text']
    else:
        raise Exception(f"LLM query failed: {response.content}")
def extract_motion_params(llm_output):
    try:
        start_index = llm_output.find('{')
        end_index = llm_output.rfind('}') + 1
        json_string = llm_output[start_index:end_index]
        return json.loads(json_string)
    except (ValueError, AttributeError):
        return {
            "motion_type": "none",
            "intensity": 0.25,
            "text_overlay": "",
            "text_color": "white",
            "start_time": 0,
            "end_time": 5
        }
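# Return the 68 dlib landmarks of the first detected face as an array of
# (x, y) points, or None when no face is found.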
def detect_face_landmarks(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    rects = detector(gray, 1)
    if len(rects) > 0:
        shape = predictor(gray, rects[0])
        return np.array([(shape.part(i).x, shape.part(i).y) for i in range(68)])
    else:
        return None
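# Per-frame color grading. Frames are RGB uint8 arrays (from PIL), so
# channel 0 is red and channel 2 is blue in the per-channel tweaks below.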
def apply_color_grading(frame, color_preset, intensity):
    if color_preset == "sepia":
        sepia_matrix = np.array([[0.393, 0.769, 0.189],
                                 [0.349, 0.686, 0.168],
                                 [0.272, 0.534, 0.131]])
        frame_float = frame.astype(np.float32) / 255.0
        sepia_effect = cv2.transform(frame_float, sepia_matrix)
        blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "vintage":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 2] *= (1 - intensity * 0.6)  # fade blue
        frame_float[:, :, 0] *= (1 + intensity * 0.3)  # warm up red
        grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
        grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "black_and_white":
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
    elif color_preset == "cold":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 2] *= (1 + intensity * 0.7)  # boost blue
        frame_float[:, :, 0] *= (1 - intensity * 0.2)  # cut red
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "warm":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 0] *= (1 + intensity * 0.7)  # boost red
        frame_float[:, :, 2] *= (1 - intensity * 0.2)  # cut blue
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
elif color_preset == "neon": | |
frame_float = frame.astype(np.float32) / 255.0 | |
lab = cv2.cvtColor(frame_float, cv2.COLOR_RGB2LAB) | |
l, a, b = cv2.split(lab) | |
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8)) | |
l = clahe.apply(l) | |
lab = cv2.merge((l, a, b)) | |
frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB) | |
frame_float[:, :, 0] *= (1 - intensity * 0.4) | |
frame_float[:, :, 1] *= (1 + intensity * 0.8) | |
frame_float[:, :, 2] *= (1 - intensity * 0.4) | |
return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8) | |
return frame | |
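# Darken the frame toward the corners with a smooth radial falloff.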
def apply_vignette(frame, intensity):
    height, width = frame.shape[:2]
    x = np.linspace(-1, 1, width)
    y = np.linspace(-1, 1, height)
    X, Y = np.meshgrid(x, y)
    radius = np.sqrt(X**2 + Y**2)
    vignette = np.clip(1 - intensity * radius**2, 0, 1)
    vignette = np.stack([vignette] * 3, axis=-1)
    frame_float = frame.astype(np.float32) / 255.0
    result = frame_float * vignette
    return (np.clip(result, 0, 1) * 255).astype(np.uint8)
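# Fake a bokeh overlay: random bright circles whose brightness pulses with t.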
def apply_bokeh(frame, intensity, t):
    frame_float = frame.astype(np.float32) / 255.0
    circles = []
    for _ in range(int(intensity * 30)):
        radius = np.random.randint(5, 30)
        x = np.random.randint(radius, frame.shape[1] - radius)
        y = np.random.randint(radius, frame.shape[0] - radius)
        color = frame_float[y, x]
        brightness = np.random.uniform(0.5, 1.0)
        circles.append((x, y, radius, color, brightness))
    bokeh_effect = np.zeros_like(frame_float)
    for x, y, radius, color, brightness in circles:
        y_grid, x_grid = np.ogrid[-y:frame.shape[0] - y, -x:frame.shape[1] - x]
        mask = x_grid * x_grid + y_grid * y_grid <= radius * radius
        bokeh_effect[mask] += np.array(color) * brightness * (0.5 + 0.5 * np.sin(t * 2 * math.pi))
    blended_frame = frame_float + intensity * bokeh_effect
    return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
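# Frame-by-frame renderer: applies a landmark-based or whole-image motion,
# then color grading, vignette, and the timed text overlay. t is normalized
# progress (0..1); start_time and end_time are in seconds.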
def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
    frames = []
    width, height = image.size
    landmarks = detect_face_landmarks(image)
    for i in range(int(duration * fps)):
        t = i / (duration * fps)
        frame = image.copy()
        if landmarks is not None:
            if motion_type == "head_nod":
                top_head = landmarks[27]
                bottom_head = landmarks[8]
                angle = math.sin(t * 2 * math.pi) * intensity * 8
                center_x = (top_head[0] + bottom_head[0]) // 2
                center_y = (top_head[1] + bottom_head[1]) // 2
                M = cv2.getRotationMatrix2D((float(center_x), float(center_y)), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)
            elif motion_type == "head_shake":
                top_head = landmarks[27]
                angle = math.sin(t * 3 * math.pi) * intensity * 6
                M = cv2.getRotationMatrix2D((float(top_head[0]), float(top_head[1])), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)
            elif motion_type == "eye_blink":
                blink_progress = abs(math.sin(t * 2 * math.pi))
                if blink_progress > 0.9:
                    # Draw closed-lid lines across the eye corners (landmarks 36-39 and 42-45)
                    draw = ImageDraw.Draw(frame)
                    draw.line([tuple(landmarks[36]), tuple(landmarks[39])], fill=text_color, width=2)
                    draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
            elif motion_type == "smile":
                mouth_left = landmarks[48]
                mouth_right = landmarks[54]
                smile_progress = intensity * t
                draw = ImageDraw.Draw(frame)
                curve_points = [
                    tuple(mouth_left),
                    (mouth_left[0] + (mouth_right[0] - mouth_left[0]) // 4, mouth_left[1] + int(20 * smile_progress)),
                    (mouth_left[0] + 3 * (mouth_right[0] - mouth_left[0]) // 4, mouth_right[1] + int(20 * smile_progress)),
                    tuple(mouth_right)
                ]
                draw.line(curve_points, fill=text_color, width=4)
if motion_type == "zoom": | |
scale = 1 + intensity * t | |
new_size = (int(width * scale), int(height * scale)) | |
resized_image = image.resize(new_size, Image.Resampling.LANCZOS) | |
x_offset = (new_size[0] - width) // 2 | |
y_offset = (new_size[1] - height) // 2 | |
frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height)) | |
elif motion_type == "pan": | |
x_offset = int(intensity * t * (width - width)) | |
y_offset = int(intensity * t * (height - height)) | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(image, (-x_offset, -y_offset)) | |
elif motion_type == "rotate": | |
angle = intensity * t * 360 | |
rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC) | |
x_offset = (rotated_image.width - width) // 2 | |
y_offset = (rotated_image.height - height) // 2 | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(rotated_image, (-x_offset, -y_offset)) | |
elif motion_type == "move_right": | |
x_offset = int(intensity * t * width) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (x_offset, 0)) | |
elif motion_type == "move_left": | |
x_offset = -int(intensity * t * width) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (x_offset, 0)) | |
elif motion_type == "move_up": | |
y_offset = -int(intensity * t * height) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (0, y_offset)) | |
elif motion_type == "move_down": | |
y_offset = int(intensity * t * height) | |
frame = Image.new("RGB", (width, height), "black") | |
frame.paste(image, (0, y_offset)) | |
elif motion_type == "shake": | |
shake_intensity = intensity * 10 | |
x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5)) | |
y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3)) | |
frame = Image.new("RGB", (width, height)) | |
frame.paste(image, (x_offset, y_offset)) | |
elif motion_type == "fade_in": | |
alpha = t | |
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha) | |
elif motion_type == "fade_out": | |
alpha = 1 - t | |
frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha) | |
elif motion_type == "rain": | |
draw = ImageDraw.Draw(frame) | |
for _ in range(int(intensity * 5)): | |
x = np.random.randint(0, width) | |
y = np.random.randint(0, height) | |
length = np.random.randint(5, 15) | |
speed = intensity * 3 | |
y_end = y + length + i * speed | |
draw.line([(x, y), (x, y_end)], fill="lightblue", width=1) | |
elif motion_type == "bokeh": | |
frame_np = np.array(frame) | |
frame_np = apply_bokeh(frame_np, intensity, t) | |
frame = Image.fromarray(frame_np) | |
        frame_np = np.array(frame)
        if color_preset and color_preset != "none":
            frame_np = apply_color_grading(frame_np, color_preset, intensity)
        if vignette_intensity > 0:
            frame_np = apply_vignette(frame_np, vignette_intensity)
        frame = Image.fromarray(frame_np)
        # start_time/end_time are given in seconds, so compare elapsed seconds
        current_time = i / fps
        if text_overlay and start_time <= current_time <= end_time:
            draw = ImageDraw.Draw(frame)
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except IOError:
                font = ImageFont.load_default()
            # textbbox replaces draw.textsize, which was removed in Pillow 10
            bbox = draw.textbbox((0, 0), text_overlay, font=font)
            text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
            x = (width - text_width) // 2
            y = (height - text_height) // 2
            draw.text((x, y), text_overlay, font=font, fill=text_color)
        frames.append(np.array(frame))
    return frames
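# Encode the frame list to an H.264 MP4 with imageio-ffmpeg's generator-based writer.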
def create_video_from_frames(frames, duration=5, fps=30):
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        output_filename = tmpfile.name
    # imageio_ffmpeg expects size as (width, height); numpy arrays are (height, width)
    height, width = frames[0].shape[:2]
    writer = imageio_ffmpeg.write_frames(output_filename, (width, height), pix_fmt_out="yuv420p",
                                         fps=fps, codec="libx264", output_params=["-preset", "veryslow"])
    writer.send(None)  # prime the generator before sending frames
    for frame in frames:
        writer.send(frame)
    writer.close()
    return output_filename
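# End-to-end pipeline: generate an image, caption it, ask the LLM for motion
# parameters, animate the image, and encode the result as a video.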
def generate_and_animate(prompt):
    try:
        image = query_hf_image_generation(prompt)
        image_description = captioner(image)[0]['generated_text']
        llm_response = query_llm(prompt, image_description)
        motion_params = extract_motion_params(llm_response)
        frames = apply_advanced_motion(
            image,
            motion_params.get("motion_type", "none"),
            motion_params.get("intensity", 0.25),
            duration=5,
            fps=30,
            text_overlay=motion_params.get("text_overlay", ""),
            text_color=motion_params.get("text_color", "white"),
            font_size=50,
            start_time=motion_params.get("start_time", 0),
            end_time=motion_params.get("end_time", 5),
            color_preset=motion_params.get("color_preset", None),
            vignette_intensity=motion_params.get("vignette_intensity", 0)
        )
        video_file = create_video_from_frames(frames)
        return video_file, image
    except Exception as e:
        # Surface errors in the UI instead of returning a string as a video path
        raise gr.Error(str(e))
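# Values the LLM is expected to choose from; the UI itself only takes a prompt.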
motion_types = [
    "zoom", "pan", "rotate", "move_right", "move_left", "move_up", "move_down",
    "shake", "fade_in", "fade_out", "head_nod", "head_shake", "eye_blink", "smile", "rain", "bokeh", "none"
]
text_colors = ["white", "black", "red", "green", "blue", "yellow"]
color_presets = ["sepia", "vintage", "black_and_white", "cold", "warm", "neon", "none"]
iface = gr.Interface(
    fn=generate_and_animate,
    inputs=[
        gr.Textbox(label="Prompt"),
    ],
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Image(label="Generated Image")
    ],
    title="AI Video Generator",
    description="Enter a prompt to generate an image and animate it. Uses FLUX.1 schnell, an LLM, and advanced video processing techniques."
)
if __name__ == "__main__":
    iface.launch(share=True, debug=True)