Reality123b committed
Commit fb105a4 · verified · 1 Parent(s): e9feed7

Create app.py

Files changed (1)
  1. app.py +373 -0
app.py ADDED
@@ -0,0 +1,373 @@
+ import gradio as gr
+ import imageio_ffmpeg
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+ import math
+ import dlib
+ import tempfile
+ import requests
+ import os
+ from transformers import pipeline
+ import cv2
+ import io
+
+ # Load dlib's face detector and 68-point landmark predictor, downloading and
+ # unpacking the model file on first run if it is not already present.
+ detector = dlib.get_frontal_face_detector()
+ try:
+     predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
+ except RuntimeError:
+     print("Downloading shape_predictor_68_face_landmarks.dat...")
+     landmarks_url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
+     landmarks_compressed = requests.get(landmarks_url).content
+     import bz2
+     landmarks_data = bz2.decompress(landmarks_compressed)
+     with open("shape_predictor_68_face_landmarks.dat", "wb") as f:
+         f.write(landmarks_data)
+     predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
+
+ API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ LLM_API_URL = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
+
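+ # NOTE: the two Inference API helpers below authenticate with HF_TOKEN, so the
+ # token must be available in the environment (for example as a Space secret).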
+ def query_hf_image_generation(prompt):
+     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+     payload = {"inputs": prompt}
+     response = requests.post(API_URL, headers=headers, json=payload)
+     if response.status_code == 200:
+         image_bytes = response.content
+         image = Image.open(io.BytesIO(image_bytes))
+         return image
+     else:
+         raise Exception(f"Image generation failed: {response.content}")
+
+ def query_llm(prompt, image_description):
+     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+     system_prompt = "You are an expert in image-to-video creation. Based on the user's prompt and the image described below, give only the motion type, intensity, text overlay, text color, and text start and end times. Respond in JSON format."
+     prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
+     # The Inference API expects generation options under "parameters".
+     payload = {"inputs": prompt_template, "parameters": {"max_new_tokens": 200}}
+     response = requests.post(LLM_API_URL, headers=headers, json=payload)
+     if response.status_code == 200:
+         return response.json()[0]['generated_text']
+     else:
+         raise Exception(f"LLM query failed: {response.content}")
+
+ def extract_motion_params(llm_output):
+     # Pull the first {...} block out of the LLM response; fall back to safe
+     # defaults, and keep them for any keys the model omits.
+     defaults = {
+         "motion_type": "none",
+         "intensity": 0.25,
+         "text_overlay": "",
+         "text_color": "white",
+         "start_time": 0,
+         "end_time": 5
+     }
+     try:
+         import json
+         start_index = llm_output.find('{')
+         end_index = llm_output.rfind('}') + 1
+         json_string = llm_output[start_index:end_index]
+         params = json.loads(json_string)
+         return {**defaults, **params}
+     except (ValueError, TypeError):
+         return defaults
+
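+ # Illustrative example (not produced by this code) of the JSON object the LLM is
+ # expected to return and extract_motion_params parses; color_preset and
+ # vignette_intensity are optional keys read by generate_and_animate below:
+ # {
+ #     "motion_type": "zoom",
+ #     "intensity": 0.5,
+ #     "text_overlay": "Hello world",
+ #     "text_color": "white",
+ #     "start_time": 0,
+ #     "end_time": 3,
+ #     "color_preset": "warm",
+ #     "vignette_intensity": 0.4
+ # }
+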
+ def detect_face_landmarks(image):
+     gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+     rects = detector(gray, 1)
+     if len(rects) > 0:
+         shape = predictor(gray, rects[0])
+         shape = np.array([(shape.part(i).x, shape.part(i).y) for i in range(68)])
+         return shape
+     else:
+         return None
+
+ def apply_color_grading(frame, color_preset, intensity):
+     # Frames are RGB uint8 arrays (channel 0 = red, 2 = blue).
+     if color_preset == "sepia":
+         sepia_matrix = np.array([[0.393, 0.769, 0.189],
+                                  [0.349, 0.686, 0.168],
+                                  [0.272, 0.534, 0.131]])
+         frame_float = frame.astype(np.float32) / 255.0
+         sepia_effect = cv2.transform(frame_float, sepia_matrix)
+         blended_frame = (1 - intensity) * frame_float + intensity * sepia_effect
+         return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
+     elif color_preset == "vintage":
+         frame_float = frame.astype(np.float32) / 255.0
+         frame_float[:, :, 0] *= (1 - intensity * 0.6)
+         frame_float[:, :, 2] *= (1 + intensity * 0.3)
+         grayscale = cv2.cvtColor(frame_float, cv2.COLOR_RGB2GRAY)
+         grayscale_rgb = cv2.cvtColor(grayscale, cv2.COLOR_GRAY2RGB)
+         blended_frame = (1 - intensity * 0.5) * frame_float + intensity * 0.5 * grayscale_rgb
+         return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
+     elif color_preset == "black_and_white":
+         gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+         return cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)
+     elif color_preset == "cold":
+         # Cold: boost the blue channel and pull back red.
+         frame_float = frame.astype(np.float32) / 255.0
+         frame_float[:, :, 2] *= (1 + intensity * 0.7)
+         frame_float[:, :, 0] *= (1 - intensity * 0.2)
+         return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
+     elif color_preset == "warm":
+         # Warm: boost the red channel and pull back blue.
+         frame_float = frame.astype(np.float32) / 255.0
+         frame_float[:, :, 0] *= (1 + intensity * 0.7)
+         frame_float[:, :, 2] *= (1 - intensity * 0.2)
+         return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
+     elif color_preset == "neon":
+         # CLAHE needs 8-bit input, so equalize the L channel in uint8 LAB space
+         # before converting back to float RGB.
+         lab = cv2.cvtColor(frame, cv2.COLOR_RGB2LAB)
+         l, a, b = cv2.split(lab)
+         clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+         l = clahe.apply(l)
+         lab = cv2.merge((l, a, b))
+         frame_float = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB).astype(np.float32) / 255.0
+         frame_float[:, :, 0] *= (1 - intensity * 0.4)
+         frame_float[:, :, 1] *= (1 + intensity * 0.8)
+         frame_float[:, :, 2] *= (1 - intensity * 0.4)
+         return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
+
+     return frame
+
+ def apply_vignette(frame, intensity):
+     width, height = frame.shape[1], frame.shape[0]
+     x = np.linspace(-1, 1, width)
+     y = np.linspace(-1, 1, height)
+     X, Y = np.meshgrid(x, y)
+     radius = np.sqrt(X**2 + Y**2)
+     vignette = 1 - intensity * radius**2
+     vignette = np.clip(vignette, 0, 1)
+     vignette = np.stack([vignette] * 3, axis=-1)
+     frame_float = frame.astype(np.float32) / 255.0
+     result = frame_float * vignette
+     return (np.clip(result, 0, 1) * 255).astype(np.uint8)
+
+ def apply_bokeh(frame, intensity, t):
+     frame_float = frame.astype(np.float32) / 255.0
+     circles = []
+     for _ in range(int(intensity * 30)):
+         radius = np.random.randint(5, 30)
+         x = np.random.randint(radius, frame.shape[1] - radius)
+         y = np.random.randint(radius, frame.shape[0] - radius)
+         color = frame_float[y, x]
+         brightness = np.random.uniform(0.5, 1.0)
+         circles.append((x, y, radius, color, brightness))
+
+     bokeh_effect = np.zeros_like(frame_float)
+     for x, y, radius, color, brightness in circles:
+         y_grid, x_grid = np.ogrid[-y:frame.shape[0]-y, -x:frame.shape[1]-x]
+         mask = x_grid*x_grid + y_grid*y_grid <= radius*radius
+         bokeh_effect[mask] += np.array(color) * brightness * (0.5 + 0.5 * np.sin(t * 2 * math.pi))
+
+     blended_frame = frame_float + intensity * bokeh_effect
+     return (np.clip(blended_frame, 0, 1) * 255).astype(np.uint8)
+
+ def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_overlay, text_color, font_size, start_time, end_time, color_preset, vignette_intensity):
+     # Render duration * fps frames, applying the selected motion plus optional
+     # color grading, vignette and timed text overlay.
+     frames = []
+     width, height = image.size
+     landmarks = detect_face_landmarks(image)
+
+     for i in range(int(duration * fps)):
+         t = i / (duration * fps)
+         frame = image.copy()
+
+         if landmarks is not None:
+             if motion_type == "head_nod":
+                 top_head = landmarks[27]
+                 bottom_head = landmarks[8]
+                 angle = math.sin(t * 2 * math.pi) * intensity * 8
+                 center_x = (top_head[0] + bottom_head[0]) // 2
+                 center_y = (top_head[1] + bottom_head[1]) // 2
+                 M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
+                 rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
+                 frame = Image.fromarray(rotated_image)
+
+             elif motion_type == "head_shake":
+                 top_head = landmarks[27]
+                 left_head = landmarks[0]
+                 right_head = landmarks[16]
+                 angle = math.sin(t * 3 * math.pi) * intensity * 6
+                 center_x = top_head[0]
+                 center_y = top_head[1]
+                 M = cv2.getRotationMatrix2D((center_x, center_y), angle, 1)
+                 rotated_image = cv2.warpAffine(np.array(image), M, (width, height), flags=cv2.INTER_LANCZOS4)
+                 frame = Image.fromarray(rotated_image)
+
+             elif motion_type == "eye_blink":
+                 left_eye_top = landmarks[37]
+                 left_eye_bottom = landmarks[41]
+                 right_eye_top = landmarks[43]
+                 right_eye_bottom = landmarks[47]
+                 blink_progress = abs(math.sin(t * 2 * math.pi))
+                 if blink_progress > 0.9:
+                     draw = ImageDraw.Draw(frame)
+                     draw.line([tuple(landmarks[36]), tuple(landmarks[39])], fill=text_color, width=2)
+                     draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
+                 else:
+                     frame = image.copy()
+
+             elif motion_type == "smile":
+                 mouth_left = landmarks[48]
+                 mouth_right = landmarks[54]
+                 mouth_top = landmarks[51]
+                 mouth_bottom = landmarks[57]
+                 smile_progress = intensity * t
+
+                 draw = ImageDraw.Draw(frame)
+                 curve_points = [
+                     tuple(mouth_left),
+                     (mouth_left[0] + (mouth_right[0] - mouth_left[0]) // 4, mouth_left[1] + int(20 * smile_progress)),
+                     (mouth_left[0] + 3 * (mouth_right[0] - mouth_left[0]) // 4, mouth_right[1] + int(20 * smile_progress)),
+                     tuple(mouth_right)
+                 ]
+                 draw.line(curve_points, fill=text_color, width=4)
+
+         if motion_type == "zoom":
+             scale = 1 + intensity * t
+             new_size = (int(width * scale), int(height * scale))
+             resized_image = image.resize(new_size, Image.Resampling.LANCZOS)
+             x_offset = (new_size[0] - width) // 2
+             y_offset = (new_size[1] - height) // 2
+             frame = resized_image.crop((x_offset, y_offset, x_offset + width, y_offset + height))
+
+         elif motion_type == "pan":
+             # Pan diagonally by up to a quarter of the frame size over the clip.
+             x_offset = int(intensity * t * width * 0.25)
+             y_offset = int(intensity * t * height * 0.25)
+             frame = Image.new("RGB", (width, height))
+             frame.paste(image, (-x_offset, -y_offset))
+
+         elif motion_type == "rotate":
+             angle = intensity * t * 360
+             rotated_image = image.rotate(angle, expand=True, resample=Image.Resampling.BICUBIC)
+             x_offset = (rotated_image.width - width) // 2
+             y_offset = (rotated_image.height - height) // 2
+             frame = Image.new("RGB", (width, height))
+             frame.paste(rotated_image, (-x_offset, -y_offset))
+
+         elif motion_type == "move_right":
+             x_offset = int(intensity * t * width)
+             frame = Image.new("RGB", (width, height), "black")
+             frame.paste(image, (x_offset, 0))
+
+         elif motion_type == "move_left":
+             x_offset = -int(intensity * t * width)
+             frame = Image.new("RGB", (width, height), "black")
+             frame.paste(image, (x_offset, 0))
+
+         elif motion_type == "move_up":
+             y_offset = -int(intensity * t * height)
+             frame = Image.new("RGB", (width, height), "black")
+             frame.paste(image, (0, y_offset))
+
+         elif motion_type == "move_down":
+             y_offset = int(intensity * t * height)
+             frame = Image.new("RGB", (width, height), "black")
+             frame.paste(image, (0, y_offset))
+
+         elif motion_type == "shake":
+             shake_intensity = intensity * 10
+             x_offset = int(shake_intensity * math.sin(t * 2 * math.pi * 5))
+             y_offset = int(shake_intensity * math.cos(t * 2 * math.pi * 3))
+             frame = Image.new("RGB", (width, height))
+             frame.paste(image, (x_offset, y_offset))
+
+         elif motion_type == "fade_in":
+             alpha = t
+             frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)
+
+         elif motion_type == "fade_out":
+             alpha = 1 - t
+             frame = Image.blend(Image.new("RGB", (width, height), "black"), image, alpha)
+
+         elif motion_type == "rain":
+             draw = ImageDraw.Draw(frame)
+             for _ in range(int(intensity * 5)):
+                 x = np.random.randint(0, width)
+                 y = np.random.randint(0, height)
+                 length = np.random.randint(5, 15)
+                 speed = intensity * 3
+                 y_end = y + length + i * speed
+                 draw.line([(x, y), (x, y_end)], fill="lightblue", width=1)
+
+         elif motion_type == "bokeh":
+             frame_np = np.array(frame)
+             frame_np = apply_bokeh(frame_np, intensity, t)
+             frame = Image.fromarray(frame_np)
+
+         frame_np = np.array(frame)
+
+         if color_preset:
+             frame_np = apply_color_grading(frame_np, color_preset, intensity)
+         if vignette_intensity > 0:
+             frame_np = apply_vignette(frame_np, vignette_intensity)
+
+         frame = Image.fromarray(frame_np)
+
+         draw = ImageDraw.Draw(frame)
+         # t is the 0-1 clip progress, so convert it to seconds before comparing
+         # against the text start/end times.
+         if text_overlay and start_time <= t * duration <= end_time:
+             try:
+                 font = ImageFont.truetype("arial.ttf", font_size)
+             except IOError:
+                 font = ImageFont.load_default()
+             # textbbox replaces the textsize API removed in Pillow 10.
+             bbox = draw.textbbox((0, 0), text_overlay, font=font)
+             text_width = bbox[2] - bbox[0]
+             text_height = bbox[3] - bbox[1]
+             x = (width - text_width) // 2
+             y = (height - text_height) // 2
+             draw.text((x, y), text_overlay, font=font, fill=text_color)
+
+         frames.append(np.array(frame))
+
+     return frames
+
+ def create_video_from_frames(frames, duration=5, fps=30):
+     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+         output_filename = tmpfile.name
+     # imageio_ffmpeg expects the size as (width, height); frames are (height, width, 3).
+     height, width = frames[0].shape[:2]
+     writer = imageio_ffmpeg.write_frames(output_filename, (width, height), pix_fmt_out='yuv420p',
+                                          fps=fps, codec='libx264', output_params=["-preset", "veryslow"])
+     writer.send(None)  # seed the generator
+     for frame in frames:
+         writer.send(frame)
+     writer.close()
+     return output_filename
+
+ def generate_and_animate(prompt):
+     try:
+         image = query_hf_image_generation(prompt)
+         # Caption the generated image so the LLM knows what it is animating.
+         image_description = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")(image)[0]['generated_text']
+         llm_response = query_llm(prompt, image_description)
+         motion_params = extract_motion_params(llm_response)
+         frames = apply_advanced_motion(
+             image,
+             motion_params["motion_type"],
+             motion_params["intensity"],
+             duration=5,
+             fps=30,
+             text_overlay=motion_params["text_overlay"],
+             text_color=motion_params["text_color"],
+             font_size=50,
+             start_time=motion_params["start_time"],
+             end_time=motion_params["end_time"],
+             color_preset=motion_params.get("color_preset", None),
+             vignette_intensity=motion_params.get("vignette_intensity", 0)
+         )
+         video_file = create_video_from_frames(frames)
+         return video_file, image
+     except Exception as e:
+         raise gr.Error(f"Video generation failed: {e}")
+
+ motion_types = [
+     "zoom", "pan", "rotate", "move_right", "move_left", "move_up", "move_down",
+     "shake", "fade_in", "fade_out", "head_nod", "head_shake", "eye_blink", "smile", "rain", "bokeh", "none"
+ ]
+ text_colors = ["white", "black", "red", "green", "blue", "yellow"]
+ color_presets = ["sepia", "vintage", "black_and_white", "cold", "warm", "neon", "none"]
+
+ iface = gr.Interface(
+     fn=generate_and_animate,
+     inputs=[
+         gr.Textbox(label="Prompt"),
+     ],
+     outputs=[
+         gr.Video(label="Generated Video"),
+         gr.Image(label="Generated Image")
+     ],
+     title="AI Video Generator",
+     description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques."
+ )
+
+ if __name__ == "__main__":
+     iface.launch(share=True, debug=True)