import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import math
import dlib
import tempfile
import requests
import os
from transformers import pipeline
import cv2
import io
import json
import time
import imageio_ffmpeg

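# dlib frontal face detector plus the 68-point landmark predictor; the
# predictor weights are fetched from dlib.net on first run if missing.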
detector = dlib.get_frontal_face_detector()
try:
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
except RuntimeError:
    print("Downloading shape_predictor_68_face_landmarks.dat...")
    landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    landmarks_compressed = requests.get(landmarks_url).content
    import bz2
    landmarks_data = bz2.decompress(landmarks_compressed)
    with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
        f.write(landmarks_data)
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

IMAGE_GEN_API = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
HF_TOKEN = os.getenv("HF_TOKEN")

LLM_API = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"

# Load the BLIP captioning pipeline once at startup rather than per request,
# so the model is not re-downloaded and re-initialized on every call.
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

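# Text-to-image via the HF Inference API, with retries since the hosted
# model can return transient errors (e.g. 503 while it spins up).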
def query_hf_image_generation(prompt):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"inputs": prompt}
    for _ in range(3):
        try:
            response = requests.post(IMAGE_GEN_API, headers=headers, json=payload)
            response.raise_for_status()
            image_bytes = response.content
            image = Image.open(io.BytesIO(image_bytes))
            return image
        except requests.exceptions.RequestException as e:
            print(f"Image generation attempt failed: {e}")
            time.sleep(2)
    raise Exception("Image generation failed after multiple attempts")

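# Query the hosted LLM with a system + user prompt. NOTE: this Zephyr-style
# chat template may not match fastchat-t5's native prompt format; the
# endpoint will still treat it as plain input text.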
def query_llm(prompt, system_prompt):
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"
    payload = {"inputs": prompt_template, "max_new_tokens": 200}
    for _ in range(3):
        try:
            response = requests.post(LLM_API, headers=headers, json=payload)
            response.raise_for_status()
            return response.json()[0]['generated_text']
        except requests.exceptions.RequestException as e:
            print(f"LLM query attempt failed: {e}")
            time.sleep(2)
    raise Exception("LLM query failed after multiple attempts")

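# Ask the LLM to split a script into image-sized segments, one per line.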
def segment_script(script):
    system_prompt = "You are a helpful assistant. Given a script, divide it into segments suitable for generating images, ensuring each segment is less than 500 characters."
    llm_response = query_llm(script, system_prompt)
    segments = llm_response.split('\n')
    segments = [seg.strip() for seg in segments if seg.strip()]
    return segments

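# Turn each script segment into a concise text-to-image prompt via the LLM.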
def generate_image_prompts(script_segments):
    image_prompts = []
    for segment in script_segments:
        system_prompt = "You are a helpful assistant. Create a concise image prompt based on the following script segment:"
        prompt = f"Script Segment: {segment}"
        image_prompt = query_llm(prompt, system_prompt)
        image_prompts.append(image_prompt)
    return image_prompts

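# Extract the first {...} span from the LLM output and parse it as JSON,
# falling back to conservative defaults on malformed output.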
def extract_motion_params(llm_output):
    try:
        start_index = llm_output.find('{')
        end_index = llm_output.rfind('}') + 1
        json_string = llm_output[start_index:end_index]
        params = json.loads(json_string)
        return params
    except Exception:
        return {
            "motion_type": "none",
            "intensity": 0.25,
            "text_overlay": "",
            "text_color": "white",
            "start_time": 0,
            "end_time": 5
        }

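# Detect the first face in the image and return its 68 landmark points as an
# array of (x, y) pairs, or None when no face is found.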
def detect_face_landmarks(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    rects = detector(gray, 1)
    if len(rects) > 0:
        shape = predictor(gray, rects[0])
        shape = np.array([(shape.part(i).x, shape.part(i).y) for i in range(68)])
        return shape
    else:
        return None

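# Blend one of several color presets into the frame; intensity in [0, 1]
# controls how strongly the preset is applied.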
def apply_color_grading(frame, color_preset, intensity):
    if color_preset == "sepia":
        sepia_matrix = np.array([[0.393, 0.769, 0.189],
                                [0.349, 0.686, 0.168],
                                [0.272, 0.534, 0.131]])
        frame_float = frame.astype(np.float32) / 255.0
        sepia_effect = cv2.transform(frame_float, sepia_matrix)
        blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "vintage":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 0] *= (1 - intensity * 0.6)
        frame_float[:, :, 2] *= (1 + intensity * 0.3)
        grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
        grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
        blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
        return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "black_and_white":
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
    elif color_preset == "cold":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 0] *= (1 + intensity * 0.7)
        frame_float[:, :, 2] *= (1 - intensity * 0.2)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "warm":
        frame_float = frame.astype(np.float32) / 255.0
        frame_float[:, :, 2] *= (1 + intensity * 0.7)
        frame_float[:, :, 0] *= (1 - intensity * 0.2)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
    elif color_preset == "neon":
        frame_float = frame.astype(np.float32) / 255.0
        lab = cv2.cvtColor(frame_float, cv2.COLOR_RGB2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
        l = clahe.apply(l)
        lab = cv2.merge((l, a, b))
        frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
        frame_float[:, :, 0] *= (1 - intensity * 0.4) 
        frame_float[:, :, 1] *= (1 + intensity * 0.8) 
        frame_float[:, :, 2] *= (1 - intensity * 0.4)
        return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)

    return frame

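# Darken the frame toward the corners with a smooth radial falloff.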
def apply_vignette(frame, intensity):
    width, height = frame.shape[1], frame.shape[0]
    x = np.linspace(-1, 1, width)
    y = np.linspace(-1, 1, height)
    X, Y = np.meshgrid(x, y)
    radius = np.sqrt(X**2 + Y**2)
    vignette = 1 - intensity * radius**2
    vignette = np.clip(vignette, 0, 1)
    vignette = np.stack([vignette] * 3, axis=-1)
    frame_float = frame.astype(np.float32) / 255.0
    result = frame_float * vignette
    return (np.clip(result, 0, 1) * 255).astype(np.uint8)

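# Overlay randomly placed, pulsing bokeh circles sampled from the frame's
# own colors; t (0-1) drives the pulse phase.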
def apply_bokeh(frame, intensity, t):
    frame_float = frame.astype(np.float32) / 255.0
    circles = []
    for _ in range(int(intensity * 30)):
        radius = np.random.randint(5, 30)
        x = np.random.randint(radius, frame.shape[1] - radius)
        y = np.random.randint(radius, frame.shape[0] - radius)
        color = frame_float[y, x]
        brightness = np.random.uniform(0.5, 1.0)
        circles.append((x, y, radius, color, brightness))
    
    bokeh_effect = np.zeros_like(frame_float)
    for x, y, radius, color, brightness in circles:
        y_grid, x_grid = np.ogrid[-y:frame.shape[0]-y, -x:frame.shape[1]-x]
        mask = x_grid*x_grid + y_grid*y_grid <= radius*radius
        bokeh_effect[mask] += np.array(color) * brightness * (0.5 + 0.5 * np.sin(t * 2 * math.pi))

    blended_frame = frame_float + intensity * bokeh_effect
    return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)

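# Render duration*fps frames from one still image: landmark-based facial
# motions when a face is detected, whole-frame motions otherwise, plus
# optional color grading, vignette, and a timed text overlay.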
def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
    frames = []
    width, height = image.size
    landmarks = detect_face_landmarks(image)

    for i in range(int(duration * fps)):
        t = i / (duration * fps)
        frame = image.copy()

        if landmarks is not None:
            if motion_type == "head_nod":
                top_head = landmarks[27]
                bottom_head = landmarks[8]
                angle = math.sin(t * 2 * math.pi) * intensity * 8
                center_x = (top_head[0] + bottom_head[0]) // 2
                center_y = (top_head[1] + bottom_head[1]) // 2
                M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)

            elif motion_type == "head_shake":
                top_head = landmarks[27]
                left_head = landmarks[0]
                right_head = landmarks[16]
                angle = math.sin(t * 3 * math.pi) * intensity * 6
                center_x = top_head[0]
                center_y = top_head[1]
                M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
                rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
                frame = Image.fromarray(rotated_image)

            elif motion_type == "eye_blink":
                left_eye_top = landmarks[37]
                left_eye_bottom = landmarks[41]
                right_eye_top = landmarks[43]
                right_eye_bottom = landmarks[47]
                blink_progress = abs(math.sin(t * 2 * math.pi))
                if blink_progress > 0.9:
                    draw = ImageDraw.Draw(frame)
                    draw.line([tuple(landmarks[36]), tuple(landmarks[39])], fill=text_color, width=2)
                    draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
                else:
                    frame = image.copy()

            elif motion_type == "smile":
                mouth_left = landmarks[48]
                mouth_right = landmarks[54]
                mouth_top = landmarks[51]
                mouth_bottom = landmarks[57]
                smile_progress = intensity * t
                
                draw = ImageDraw.Draw(frame)
                curve_points = [
                    tuple(mouth_left),
                    (mouth_left[0] + (mouth_right[0] - mouth_left[0]) // 4, mouth_left[1] + int(20 * smile_progress)),
                    (mouth_left[0] + 3 * (mouth_right[0] - mouth_left[0]) // 4, mouth_right[1] + int(20 * smile_progress)),
                    tuple(mouth_right)
                ]
                draw.line(curve_points, fill=text_color, width=4)

        if motion_type == "zoom":
            scale = 1 + intensity * t
            new_size = (int(width * scale), int(height * scale))
            resized_image = image.resize(new_size, Image.Resampling.LANCZOS)
            x_offset = (new_size[0] - width) // 2
            y_offset = (new_size[1] - height) // 2
            frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height))

        elif motion_type == "pan":
            x_offset = int(intensity * t * (width - width))
            y_offset = int(intensity * t * (height - height))
            frame = Image.new("RGB", (width, height))
            frame.paste(image, (-x_offset, -y_offset))

        elif motion_type == "rotate":
            angle = intensity * t * 360
            rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC)
            x_offset = (rotated_image.width - width) // 2
            y_offset = (rotated_image.height - height) // 2
            frame = Image.new("RGB", (width, height))
            frame.paste(rotated_image, (-x_offset, -y_offset))

        elif motion_type == "move_right":
            x_offset = int(intensity * t * width)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (x_offset, 0))

        elif motion_type == "move_left":
            x_offset = -int(intensity * t * width)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (x_offset, 0))

        elif motion_type == "move_up":
            y_offset = -int(intensity * t * height)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (0, y_offset))

        elif motion_type == "move_down":
            y_offset = int(intensity * t * height)
            frame = Image.new("RGB", (width, height), "black")
            frame.paste(image, (0, y_offset))
        
        elif motion_type == "shake":
            shake_intensity = intensity * 10  
            x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5))  
            y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3))  
            frame = Image.new("RGB", (width, height))
            frame.paste(image, (x_offset, y_offset))
        
        elif motion_type == "fade_in":
            alpha = t 
            frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)

        elif motion_type == "fade_out":
            alpha = 1 - t
            frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)
        
        elif motion_type == "rain":
            draw = ImageDraw.Draw(frame)
            for _ in range(int(intensity * 5)):
                x = np.random.randint(0, width)
                y = np.random.randint(0, height)
                length = np.random.randint(5, 15)
                speed = intensity * 3
                y_end = y + length + i * speed
                draw.line([(x, y), (x, y_end)], fill="lightblue", width=1)
        
        elif motion_type == "bokeh":
            frame_np = np.array(frame)
            frame_np = apply_bokeh(frame_np, intensity, t)
            frame = Image.fromarray(frame_np)

        frame_np = np.array(frame)
        
        if color_preset:
            frame_np = apply_color_grading(frame_np, color_preset, intensity)
        if vignette_intensity > 0:
            frame_np = apply_vignette(frame_np, vignette_intensity)

        frame = Image.fromarray(frame_np)

        # start_time/end_time are expressed in seconds, so compare against
        # elapsed time rather than normalized progress t.
        elapsed = i / fps
        draw = ImageDraw.Draw(frame)
        if text_overlay and start_time <= elapsed <= end_time:
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except IOError:
                font = ImageFont.load_default()
            # Pillow 10 removed draw.textsize; textbbox is its replacement.
            left, top, right, bottom = draw.textbbox((0, 0), text_overlay, font=font)
            text_width, text_height = right - left, bottom - top
            x = (width - text_width) // 2
            y = (height - text_height) // 2
            draw.text((x, y), text_overlay, font=font, fill=text_color)

        frames.append(np.array(frame))

    return frames

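# Encode frames to an H.264 MP4 using imageio-ffmpeg's streaming writer.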
def create_video_from_frames(frames, output_filename=None, fps=30):
    if output_filename is None:
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
            output_filename = tmpfile.name
    height, width = frames[0].shape[:2]  # write_frames expects (width, height)
    writer = imageio_ffmpeg.write_frames(output_filename, (width, height), pix_fmt_out='yuv420p', fps=fps, codec='libx264', output_params=["-preset", "veryslow"])
    writer.send(None)  # seed the generator before sending frames
    for frame in frames:
        writer.send(frame)
    writer.close()
    return output_filename

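# Script-to-video pipeline: segment the script, generate an image per
# segment, have the LLM pick motion parameters, animate each image, and
# stitch all frames into a single MP4.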
def generate_video_from_script(script, duration_per_segment=5):
    script_segments = segment_script(script)
    image_prompts = generate_image_prompts(script_segments)
    all_frames = []

    for i, (segment, image_prompt) in enumerate(zip(script_segments, image_prompts)):
        print(f"Processing segment {i + 1} of {len(script_segments)}")
        print(f"  Segment: {segment}")
        print(f"  Image Prompt: {image_prompt}")
        
        image = query_hf_image_generation(image_prompt)
        image_description = captioner(image)[0]['generated_text']
        
        system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
        prompt = f"Image Description: {image_description}\nUser Prompt: {segment}"
        llm_response = query_llm(prompt, system_prompt)
        
        print(f"  LLM Response: {llm_response}")
        motion_params = extract_motion_params(llm_response)
        print(f"  Motion Parameters: {motion_params}")
        
        frames = apply_advanced_motion(
            image,
            motion_params["motion_type"],
            motion_params["intensity"],
            duration=duration_per_segment,
            fps=30,
            text_overlay=motion_params["text_overlay"],
            text_color=motion_params["text_color"],
            font_size=50,
            start_time=motion_params["start_time"],
            end_time=motion_params["end_time"],
            color_preset=motion_params.get("color_preset", None),
            vignette_intensity=motion_params.get("vignette_intensity", 0)
        )
        all_frames.extend(frames)

    return create_video_from_frames(all_frames)

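# Single-prompt flow for the first tab: generate an image, caption it, ask
# the LLM for motion parameters, and animate it into a short clip.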
def generate_and_animate(prompt):
    try:
        image = query_hf_image_generation(prompt)
        image_description = captioner(image)[0]['generated_text']
        system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
        llm_response = query_llm(f"Image Description: {image_description}\nUser Prompt: {prompt}", system_prompt)
        motion_params = extract_motion_params(llm_response)
        frames = apply_advanced_motion(
            image,
            motion_params["motion_type"],
            motion_params["intensity"],
            duration=5,
            fps=30,
            text_overlay=motion_params["text_overlay"],
            text_color=motion_params["text_color"],
            font_size=50,
            start_time=motion_params["start_time"],
            end_time=motion_params["end_time"],
            color_preset=motion_params.get("color_preset", None),
            vignette_intensity=motion_params.get("vignette_intensity", 0)
        )
        video_file = create_video_from_frames(frames)
        return video_file, image
    except Exception as e:
        # Surface the failure in the UI; gr.Video cannot render a bare string.
        raise gr.Error(str(e))

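# Reference lists of the values the LLM is expected to choose from when
# filling in motion parameters.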
motion_types = [
    "zoom", "pan", "rotate", "move_right", "move_left", "move_up", "move_down", 
    "shake", "fade_in", "fade_out", "head_nod", "head_shake", "eye_blink", "smile", "rain", "bokeh", "none"
]
text_colors = ["white", "black", "red", "green", "blue", "yellow"]
color_presets = ["sepia", "vintage", "black_and_white", "cold", "warm", "neon", "none"]

iface = gr.Interface(
    fn=generate_and_animate,
    inputs=[
        gr.Textbox(label="Prompt"),
    ],
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Image(label="Generated Image")
    ],
    title="AI Video Generator",
    description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques."
)

video_iface = gr.Interface(
    fn=generate_video_from_script,
    inputs=[
        gr.Textbox(label="Script (max 1 minute video)", lines=5),
        gr.Slider(label="Duration per Segment (seconds)", minimum=1, maximum=10, step=1, value=5)
    ],
    outputs=gr.Video(label="Generated Video from Script"),
    title="Story Visualizer",
    description="Enter a short story script, and this will generate a video visualizing it using multiple images and animations."
)
demo = gr.TabbedInterface([iface, video_iface], ["Generate and Animate", "Story to Video"])

if __name__ == "__main__":
    demo.launch(share=True, debug=True)