# Spaces: Lap1official / Advanced_Video
# Build error
# File size: 16,253 Bytes
# fb105a4

import gradio as gr
import imageio_ffmpeg
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import math
import dlib
import tempfile
import requests
import os
from transformers import pipeline
import cv2
import io

# Face detector and 68-point landmark predictor shared by detect_face_landmarks().
detector = dlib.get_frontal_face_detector()
try:
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
except RuntimeError:
    # A RuntimeError here is treated as "model file not available locally":
    # download the compressed model once and store it in the working directory.
    print("Downloading shape_predictor_68_face_landmarks.dat...")
    landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    # Bound the download so a stalled connection cannot hang start-up forever,
    # and fail loudly on an HTTP error instead of feeding an error page to bz2.
    landmarks_response = requests.get(landmarks_url, timeout=300)
    landmarks_response.raise_for_status()
    landmarks_compressed = landmarks_response.content
    import bz2
    landmarks_data = bz2.decompress(landmarks_compressed)
    with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
        f.write(landmarks_data)
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

# Hosted inference endpoint used for text-to-image generation.
# The repository id is spelled "FLUX.1-schnell" on the Hub; the previous
# "flux-1-schnell" spelling does not name that repository.
API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Hosted inference endpoint used to turn the prompt into motion parameters.
LLM_API_URL = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"

def query_hf_image_generation(prompt):
    """Generate an image for ``prompt`` by POSTing to ``API_URL``.

    Args:
        prompt: Text sent as the ``inputs`` field of the request payload.

    Returns:
        A PIL image opened from the response body.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Only send an Authorization header when a token is configured; otherwise
    # the literal string "Bearer None" would be transmitted.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    payload = {"inputs": prompt}
    # Bounded wait so a stalled request cannot block the UI callback forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    if response.status_code != 200:
        raise Exception(f"Image generation failed: {response.content}")
    return Image.open(io.BytesIO(response.content))

def query_llm(prompt, image_description):
    """Ask the model behind ``LLM_API_URL`` for animation parameters.

    Args:
        prompt: The user's original prompt.
        image_description: Caption describing the generated image.

    Returns:
        The ``generated_text`` field of the first element of the JSON response.

    Raises:
        Exception: If the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Only send an Authorization header when a token is configured.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
    system_prompt = "You are an expert in image to video creation, and give only the motion type, intensity, text overlay, text color, text start and end times for the image described below based on user's prompt. Give the response in a JSON format."
    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
    # Generation options belong under "parameters"; as a top-level key next to
    # "inputs" the token limit is not part of the documented payload shape.
    payload = {"inputs": prompt_template, "parameters": {"max_new_tokens": 200}}
    # Bounded wait so a stalled request cannot block the UI callback forever.
    response = requests.post(LLM_API_URL, headers=headers, json=payload, timeout=120)
    if response.status_code != 200:
        raise Exception(f"LLM query failed: {response.content}")
    return response.json()[0]['generated_text']

def extract_motion_params(llm_output):
    """Extract the animation parameters embedded as JSON in ``llm_output``.

    The text between the first ``{`` and the last ``}`` is parsed as JSON.
    Parsed keys override the defaults, so the returned dict always contains
    ``motion_type``, ``intensity``, ``text_overlay``, ``text_color``,
    ``start_time`` and ``end_time`` even when the model omits some of them.

    Args:
        llm_output: Raw text returned by the language model.

    Returns:
        A dict of parameters; the defaults alone when no JSON object can be
        extracted (including when ``llm_output`` is not a string).
    """
    import json

    params = {
        "motion_type": "none",
        "intensity": 0.25,
        "text_overlay": "",
        "text_color": "white",
        "start_time": 0,
        "end_time": 5,
    }
    try:
        start_index = llm_output.find('{')
        end_index = llm_output.rfind('}') + 1
        if start_index == -1 or end_index <= start_index:
            return params
        parsed = json.loads(llm_output[start_index:end_index])
    except (AttributeError, TypeError, ValueError):
        # Non-string input or malformed JSON: fall back to the defaults.
        return params

    if isinstance(parsed, dict):
        params.update(parsed)
    return params

def detect_face_landmarks(image):
    """Locate the 68 facial landmarks of the first face detected in ``image``.

    Args:
        image: Image convertible with ``np.array`` and interpreted as RGB.

    Returns:
        An array with one ``(x, y)`` row per landmark, or ``None`` when the
        detector finds no face.
    """
    grayscale = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    faces = detector(grayscale, 1)
    if len(faces) == 0:
        return None

    # Only the first detected face is used.
    face_shape = predictor(grayscale, faces[0])
    points = []
    for index in range(68):
        part = face_shape.part(index)
        points.append((part.x, part.y))
    return np.array(points)
    
def apply_color_grading(frame, color_preset, intensity):
    """Apply a named color preset to an RGB frame.

    The frame is treated as RGB throughout (channel 0 = red, 2 = blue), which
    is what the ``COLOR_RGB2*`` conversions below already assume.

    Args:
        frame: ``uint8`` image array with RGB channel order.
        color_preset: One of ``"sepia"``, ``"vintage"``, ``"black_and_white"``,
            ``"cold"``, ``"warm"``, ``"neon"``; any other value leaves the
            frame untouched.
        intensity: Strength of the effect (ignored by ``"black_and_white"``).

    Returns:
        A new ``uint8`` array for recognised presets, otherwise ``frame``
        itself.
    """
    if color_preset == "sepia":
        sepia_matrix = np.array([[0.393, 0.769, 0.189],
                                 [0.349, 0.686, 0.168],
                                 [0.272, 0.534, 0.131]])
        frame_float = frame.astype(np.float32) / 255.0
        sepia_effect = cv2.transform(frame_float, sepia_matrix)
        blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "vintage":
        frame_float = frame.astype(np.float32) / 255.0
        # Fade blue and lift red; the indices follow RGB order (the previous
        # code used BGR indices on RGB data, which tinted the frame blue).
        frame_float[:, :, 2] *= (1 - intensity * 0.6)
        frame_float[:, :, 0] *= (1 + intensity * 0.3)
        grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
        grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "black_and_white":
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
    elif color_preset == "cold":
        frame_float = frame.astype(np.float32) / 255.0
        # Cold = more blue (channel 2), slightly less red (channel 0).
        frame_float[:, :, 2] *= (1 + intensity * 0.7)
        frame_float[:, :, 0] *= (1 - intensity * 0.2)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "warm":
        frame_float = frame.astype(np.float32) / 255.0
        # Warm = more red (channel 0), slightly less blue (channel 2).
        frame_float[:, :, 0] *= (1 + intensity * 0.7)
        frame_float[:, :, 2] *= (1 - intensity * 0.2)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "neon":
        # CLAHE operates on 8-bit (or 16-bit) single-channel data, so the
        # contrast boost is done on the uint8 LAB lightness channel before
        # switching to float for the channel scaling.
        lab = cv2.cvtColor(frame, cv2.COLOR_RGB2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        l = clahe.apply(l)
        lab = cv2.merge((l, a, b))
        frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB).astype(np.float32) / 255.0
        frame_float[:, :, 0] *= (1 - intensity * 0.4)
        frame_float[:, :, 1] *= (1 + intensity * 0.8)
        frame_float[:, :, 2] *= (1 - intensity * 0.4)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)

    return frame
    
def apply_vignette(frame, intensity):
    """Darken ``frame`` toward its borders.

    Pixel coordinates are mapped to ``[-1, 1]`` on both axes and each pixel is
    multiplied by ``1 - intensity * r**2`` (clipped to ``[0, 1]``), where ``r``
    is its distance from the centre in that coordinate system.

    Args:
        frame: Image array whose last axis holds 3 channels.
        intensity: Strength of the darkening.

    Returns:
        A new ``uint8`` array with the same shape as ``frame``.
    """
    rows, cols = frame.shape[0], frame.shape[1]
    grid_x, grid_y = np.meshgrid(np.linspace(-1, 1, cols), np.linspace(-1, 1, rows))
    distance = np.sqrt(grid_x**2 + grid_y**2)

    falloff = np.clip(1 - intensity * distance**2, 0, 1)
    # Replicate the single-channel mask across the three colour channels.
    falloff_rgb = np.repeat(falloff[:, :, np.newaxis], 3, axis=-1)

    normalized = frame.astype(np.float32) / 255.0
    darkened = normalized * falloff_rgb
    return (np.clip(darkened, 0, 1) * 255).astype(np.uint8)

def apply_bokeh(frame, intensity, t):
    """Overlay randomly placed, pulsing light discs on ``frame``.

    ``int(intensity * 30)`` discs are drawn. Each disc takes the colour of the
    pixel at its centre, a random brightness in ``[0.5, 1.0)``, and is scaled
    by ``0.5 + 0.5 * sin(2*pi*t)``. Positions are drawn from NumPy's global
    random state on every call.

    Args:
        frame: ``uint8`` image array of shape ``(height, width, channels)``.
        intensity: Controls both the number of discs and the blend strength.
        t: Animation phase; one full pulse per unit of ``t``.

    Returns:
        A new ``uint8`` array with the same shape as ``frame``.
    """
    height, width = frame.shape[0], frame.shape[1]
    frame_float = frame.astype(np.float32) / 255.0

    # Largest radius for which a disc centre still fits inside the frame;
    # without this clamp small frames make randint() fail with low >= high.
    max_radius = (min(height, width) - 1) // 2
    circle_count = int(intensity * 30) if max_radius >= 0 else 0

    pulse = 0.5 + 0.5 * np.sin(t * 2 * math.pi)
    rows, cols = np.ogrid[:height, :width]
    bokeh_effect = np.zeros_like(frame_float)
    for _ in range(circle_count):
        radius = min(np.random.randint(5, 30), max_radius)
        x = np.random.randint(radius, width - radius)
        y = np.random.randint(radius, height - radius)
        color = frame_float[y, x]
        brightness = np.random.uniform(0.5, 1.0)
        mask = (cols - x) ** 2 + (rows - y) ** 2 <= radius * radius
        bokeh_effect[mask] += color * brightness * pulse

    blended_frame = frame_float + intensity * bokeh_effect
    return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)

def _measure_text(draw, text, font):
    """Return ``(left, top, width, height)`` of ``text`` drawn at the origin."""
    try:
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    except (AttributeError, ValueError):
        # Fallback for Pillow builds where textbbox is missing or rejects this
        # font; those builds are expected to still provide textsize.
        text_width, text_height = draw.textsize(text, font=font)
        return 0, 0, text_width, text_height
    return left, top, right - left, bottom - top


def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
    """Render ``image`` into a sequence of animated frames.

    Args:
        image: Source PIL image; converted to RGB when it has another mode.
        motion_type: Name of the animation. Face-based types (``head_nod``,
            ``head_shake``, ``eye_blink``, ``smile``) only take effect when a
            face is detected; unknown names yield static frames.
        intensity: Strength of the motion; also passed to the color grading.
        duration: Clip length in seconds.
        fps: Frames per second; ``int(duration * fps)`` frames are produced.
        text_overlay: Text drawn centred on the frame; empty for none.
        text_color: Fill colour for the text and for the lines drawn by
            ``eye_blink`` and ``smile``.
        font_size: Size used when ``arial.ttf`` can be loaded.
        start_time: First second (inclusive) at which the text is shown.
        end_time: Last second (inclusive) at which the text is shown.
        color_preset: Preset name for ``apply_color_grading``; falsy to skip.
        vignette_intensity: Vignette strength; values ``<= 0`` skip it.

    Returns:
        A list of ``uint8`` RGB arrays, one per frame.
    """
    # Work on plain RGB so blending, the 3x3 colour matrices and the 3-channel
    # vignette mask all see the channel layout they expect.
    if image.mode != "RGB":
        image = image.convert("RGB")

    frames = []
    width, height = image.size
    landmarks = detect_face_landmarks(image)
    image_np = np.array(image)

    # Load the font once instead of once per frame.
    font = None
    if text_overlay:
        try:
            font = ImageFont.truetype("arial.ttf", font_size)
        except IOError:
            font = ImageFont.load_default()

    # "pan" slides a viewport across an enlarged copy of the image. The
    # previous offsets were multiplied by (width - width), i.e. always zero,
    # so the frame never moved.
    pan_source = None
    if motion_type == "pan":
        pan_size = (
            max(width, int(width * (1 + intensity))),
            max(height, int(height * (1 + intensity))),
        )
        pan_source = image.resize(pan_size, Image.Resampling.LANCZOS)

    for i in range(int(duration * fps)):
        t = i / (duration * fps)  # normalized progress in [0, 1)
        elapsed = i / fps  # position of this frame in seconds
        frame = image.copy()

        if landmarks is not None:
            if motion_type == "head_nod":
                top_head = landmarks[27]
                bottom_head = landmarks[8]
                angle = math.sin(t * 2 * math.pi) * intensity * 8
                # Plain floats: OpenCV may refuse numpy integer scalars here.
                center = (
                    float((top_head[0] + bottom_head[0]) // 2),
                    float((top_head[1] + bottom_head[1]) // 2),
                )
                M = cv2.getRotationMatrix2D(center, angle, 1)
                rotated_image = cv2.warpAffine(image_np, M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)

            elif motion_type == "head_shake":
                pivot = landmarks[27]
                angle = math.sin(t * 3 * math.pi) * intensity * 6
                center = (float(pivot[0]), float(pivot[1]))
                M = cv2.getRotationMatrix2D(center, angle, 1)
                rotated_image = cv2.warpAffine(image_np, M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)

            elif motion_type == "eye_blink":
                blink_progress = abs(math.sin(t * 2 * math.pi))
                if blink_progress > 0.9:
                    # Draw a line across each eye between its corner landmarks.
                    draw = ImageDraw.Draw(frame)
                    for first, second in ((36, 39), (42, 45)):
                        start_point = (int(landmarks[first][0]), int(landmarks[first][1]))
                        end_point = (int(landmarks[second][0]), int(landmarks[second][1]))
                        draw.line([start_point, end_point], fill=text_color, width=2)

            elif motion_type == "smile":
                left_x, left_y = int(landmarks[48][0]), int(landmarks[48][1])
                right_x, right_y = int(landmarks[54][0]), int(landmarks[54][1])
                smile_progress = intensity * t
                lift = int(20 * smile_progress)
                span = right_x - left_x

                draw = ImageDraw.Draw(frame)
                curve_points = [
                    (left_x, left_y),
                    (left_x + span // 4, left_y + lift),
                    (left_x + 3 * span // 4, right_y + lift),
                    (right_x, right_y),
                ]
                draw.line(curve_points, fill=text_color, width=4)

        if motion_type == "zoom":
            scale = 1 + intensity * t
            new_size = (int(width * scale), int(height * scale))
            resized_image = image.resize(new_size, Image.Resampling.LANCZOS)
            x_offset = (new_size[0] - width) // 2
            y_offset = (new_size[1] - height) // 2
            frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height))

        elif motion_type == "pan":
            x_offset = int(t * (pan_source.width - width))
            y_offset = int(t * (pan_source.height - height))
            frame = pan_source.crop((x_offset, y_offset, x_offset + width, y_offset + height))

        elif motion_type == "rotate":
            angle = intensity * t * 360
            rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC)
            x_offset = (rotated_image.width - width) // 2
            y_offset = (rotated_image.height - height) // 2
            frame = Image.new("RGB", (width, height))
            frame.paste(rotated_image, (-x_offset, -y_offset))

        elif motion_type == "move_right":
            x_offset = int(intensity * t * width)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (x_offset, 0))

        elif motion_type == "move_left":
            x_offset = -int(intensity * t * width)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (x_offset, 0))

        elif motion_type == "move_up":
            y_offset = -int(intensity * t * height)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (0, y_offset))

        elif motion_type == "move_down":
            y_offset = int(intensity * t * height)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (0, y_offset))

        elif motion_type == "shake":
            shake_intensity = intensity * 10
            x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5))
            y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3))
            frame = Image.new("RGB", (width, height))
            frame.paste(image, (x_offset, y_offset))

        elif motion_type == "fade_in":
            alpha = t
            frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)

        elif motion_type == "fade_out":
            alpha = 1 - t
            frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)

        elif motion_type == "rain":
            draw = ImageDraw.Draw(frame)
            for _ in range(int(intensity * 5)):
                x = np.random.randint(0, width)
                y = np.random.randint(0, height)
                length = np.random.randint(5, 15)
                speed = intensity * 3
                y_end = y + length + i * speed
                draw.line([(x, y), (x, y_end)], fill="lightblue", width=1)

        elif motion_type == "bokeh":
            frame_np = np.array(frame)
            frame_np = apply_bokeh(frame_np, intensity, t)
            frame = Image.fromarray(frame_np)

        frame_np = np.array(frame)

        if color_preset:
            frame_np = apply_color_grading(frame_np, color_preset, intensity)
        if vignette_intensity > 0:
            frame_np = apply_vignette(frame_np, vignette_intensity)

        frame = Image.fromarray(frame_np)

        # start_time/end_time are seconds, so compare them with the elapsed
        # time of this frame rather than with the normalized progress t.
        if font is not None and start_time <= elapsed <= end_time:
            draw = ImageDraw.Draw(frame)
            left, top, text_width, text_height = _measure_text(draw, text_overlay, font)
            # Subtract the bbox origin so the inked area itself is centred.
            x = (width - text_width) // 2 - left
            y = (height - text_height) // 2 - top
            draw.text((x, y), text_overlay, font=font, fill=text_color)

        frames.append(np.array(frame))

    return frames

def create_video_from_frames(frames, duration=5, fps=30):
    """Encode ``frames`` as an H.264 MP4 in a temporary file.

    Args:
        frames: Non-empty sequence of equally sized ``(height, width, 3)``
            image arrays.
        duration: Unused; kept so existing callers keep working.
        fps: Frame rate of the resulting video.

    Returns:
        Path of the MP4 file. The file is not deleted automatically.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("Cannot create a video from an empty frame list")

    # Array shape is (rows, cols, ...) while the writer expects (width, height).
    height, width = frames[0].shape[:2]

    # Reserve a path, then close the handle before ffmpeg writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        output_filename = tmpfile.name

    writer = imageio_ffmpeg.write_frames(
        output_filename,
        (width, height),
        pix_fmt_out='yuv420p',
        fps=fps,
        codec='libx264',
        # write_frames has no "preset" argument; encoder options are passed
        # through to ffmpeg via output_params.
        output_params=["-preset", "veryslow"],
    )
    try:
        writer.send(None)  # prime the generator before sending frames
        for frame in frames:
            writer.send(frame)
    finally:
        # Always close so the ffmpeg process is finalized, even on error.
        writer.close()
    return output_filename

# Captioning pipeline, created on first use and then reused across requests.
_captioner = None


def _get_captioner():
    """Return the shared image-captioning pipeline, building it on first use."""
    global _captioner
    if _captioner is None:
        _captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    return _captioner


def generate_and_animate(prompt):
    """Generate an image for ``prompt`` and animate it into a short video.

    The image is generated, captioned, and the caption plus prompt are sent to
    the language model whose JSON answer selects the motion and text overlay.

    Args:
        prompt: User prompt from the interface.

    Returns:
        A ``(video_path, image)`` tuple for the Video and Image outputs.

    Raises:
        gr.Error: If any step fails, so the message is shown in the UI rather
            than being passed to the Video output as if it were a file path.
    """
    try:
        image = query_hf_image_generation(prompt)
        image_description = _get_captioner()(image)[0]['generated_text']
        llm_response = query_llm(prompt, image_description)
        motion_params = extract_motion_params(llm_response)
        # .get with defaults: the model may omit any of the requested keys.
        frames = apply_advanced_motion(
            image,
            motion_params.get("motion_type", "none"),
            motion_params.get("intensity", 0.25),
            duration=5,
            fps=30,
            text_overlay=motion_params.get("text_overlay", ""),
            text_color=motion_params.get("text_color", "white"),
            font_size=50,
            start_time=motion_params.get("start_time", 0),
            end_time=motion_params.get("end_time", 5),
            color_preset=motion_params.get("color_preset", None),
            vignette_intensity=motion_params.get("vignette_intensity", 0)
        )
        video_file = create_video_from_frames(frames)
    except Exception as e:
        raise gr.Error(str(e)) from e
    # Returning the image directly avoids the version-specific
    # gr.Image.update helper.
    return video_file, image

# Motion names understood by apply_advanced_motion ("none" keeps the image static).
motion_types = [
    # whole-frame transforms
    "zoom",
    "pan",
    "rotate",
    "move_right",
    "move_left",
    "move_up",
    "move_down",
    "shake",
    "fade_in",
    "fade_out",
    # face-landmark based
    "head_nod",
    "head_shake",
    "eye_blink",
    "smile",
    # overlays
    "rain",
    "bokeh",
    "none",
]

# Colour names offered for the text overlay.
text_colors = [
    "white",
    "black",
    "red",
    "green",
    "blue",
    "yellow",
]

# Preset names understood by apply_color_grading ("none" leaves the frame as is).
color_presets = [
    "sepia",
    "vintage",
    "black_and_white",
    "cold",
    "warm",
    "neon",
    "none",
]

# Gradio UI: one prompt textbox in, the rendered video and the source image out.
iface = gr.Interface(
    title="AI Video Generator",
    description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques.",
    fn=generate_and_animate,
    inputs=[gr.Textbox(label="Prompt")],
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Image(label="Generated Image"),
    ],
)

# Start the server only when executed as a script.
if __name__ == "__main__":
    launch_options = {"share": True, "debug": True}
    iface.launch(**launch_options)