Build error
Update app.py
app.py CHANGED
@@ -10,6 +10,9 @@ import os
 from transformers import pipeline
 import cv2
 import io
+import json
+import re
+import time
 
 detector = dlib.get_frontal_face_detector()
 try:
@@ -24,36 +27,58 @@ except RuntimeError:
         f.write(landmarks_data)
     predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
 
-
+IMAGE_GEN_API = "https://api-inference.huggingface.co/models/black-forest-labs/flux-1-schnell"
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-
+LLM_API = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
 
 def query_hf_image_generation(prompt):
     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
     payload = {"inputs": prompt}
-
-
-
-
-
-
-
-
-
+    for _ in range(3):
+        try:
+            response = requests.post(IMAGE_GEN_API, headers=headers, json=payload)
+            response.raise_for_status()
+            image_bytes = response.content
+            image = Image.open(io.BytesIO(image_bytes))
+            return image
+        except requests.exceptions.RequestException as e:
+            print(f"Image generation attempt failed: {e}")
+            time.sleep(2)
+    raise Exception("Image generation failed after multiple attempts")
+
+def query_llm(prompt, system_prompt):
     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-
-    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
+    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"
    payload = {"inputs": prompt_template, "max_new_tokens": 200}
-
-
-
-
-
+    for _ in range(3):
+        try:
+            response = requests.post(LLM_API, headers=headers, json=payload)
+            response.raise_for_status()
+            return response.json()[0]['generated_text']
+        except requests.exceptions.RequestException as e:
+            print(f"LLM query attempt failed: {e}")
+            time.sleep(2)
+    raise Exception("LLM query failed after multiple attempts")
+
+def segment_script(script):
+    system_prompt = "You are a helpful assistant. Given a script, divide it into segments suitable for generating images, ensuring each segment is less than 500 characters."
+    llm_response = query_llm(script, system_prompt)
+    segments = llm_response.split('\n')
+    segments = [seg.strip() for seg in segments if seg.strip()]
+    return segments
+
+def generate_image_prompts(script_segments):
+    image_prompts = []
+    for segment in script_segments:
+        system_prompt = "You are a helpful assistant. Create a concise image prompt based on the following script segment:"
+        prompt = f"Script Segment: {segment}"
+        image_prompt = query_llm(prompt, system_prompt)
+        image_prompts.append(image_prompt)
+    return image_prompts
 
 def extract_motion_params(llm_output):
     try:
-        import json
         start_index = llm_output.find('{')
         end_index = llm_output.rfind('}') + 1
         json_string = llm_output[start_index:end_index]
@@ -78,7 +103,7 @@ def detect_face_landmarks(image):
         return shape
     else:
         return None
-
+
 def apply_color_grading(frame, color_preset, intensity):
     if color_preset == "sepia":
         sepia_matrix = np.array([[0.393, 0.769, 0.189],
@@ -123,7 +148,7 @@ apply_color_grading(frame, color_preset, intensity):
         return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)
 
     return frame
-
+
 def apply_vignette(frame, intensity):
     width, height = frame.shape[1], frame.shape[0]
     x = np.linspace(-1, 1, width)
@@ -201,7 +226,7 @@ apply_advanced_motion(image, motion_type, intensity, duration, fps, text_ove
                 draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
             else:
                 frame = image.copy()
-
+
         elif motion_type == "smile":
             mouth_left = landmarks[48]
             mouth_right = landmarks[54]
@@ -314,14 +339,53 @@ apply_advanced_motion(image, motion_type, intensity, duration, fps, text_ove
 
     return frames
 
-def create_video_from_frames(frames,
+def create_video_from_frames(frames, output_filename, fps=30):
+    writer = imageio_ffmpeg.write_frames(output_filename, frames[0].shape[:2], pix_fmt_out='yuv420p', fps=fps, codec='libx264', preset="veryslow")
+    writer.send(None)
+    for frame in frames:
+        writer.send(frame)
+    writer.close()
+
+def generate_video_from_script(script, duration_per_segment=5):
+    script_segments = segment_script(script)
+    image_prompts = generate_image_prompts(script_segments)
+    all_frames = []
+
+    for i, (segment, image_prompt) in enumerate(zip(script_segments, image_prompts)):
+        print(f"Processing segment {i + 1} of {len(script_segments)}")
+        print(f" Segment: {segment}")
+        print(f" Image Prompt: {image_prompt}")
+
+        image = query_hf_image_generation(image_prompt)
+        image_description = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")(image)[0]['generated_text']
+
+        system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
+        prompt = f"Image Description: {image_description}\nUser Prompt: {segment}"
+        llm_response = query_llm(prompt, system_prompt)
+
+        print(f" LLM Response: {llm_response}")
+        motion_params = extract_motion_params(llm_response)
+        print(f" Motion Parameters: {motion_params}")
+
+        frames = apply_advanced_motion(
+            image,
+            motion_params["motion_type"],
+            motion_params["intensity"],
+            duration=duration_per_segment,
+            fps=30,
+            text_overlay=motion_params["text_overlay"],
+            text_color=motion_params["text_color"],
+            font_size=50,
+            start_time=motion_params["start_time"],
+            end_time=motion_params["end_time"],
+            color_preset=motion_params.get("color_preset", None),
+            vignette_intensity=motion_params.get("vignette_intensity", 0)
+        )
+        all_frames.extend(frames)
+
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         output_filename = tmpfile.name
-
-    writer.send(None)
-    for frame in frames:
-        writer.send(frame)
-    writer.close()
+    create_video_from_frames(all_frames, output_filename)
     return output_filename
 
 def generate_and_animate(prompt):
@@ -369,5 +433,17 @@ iface = gr.Interface(
     description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques."
 )
 
+video_iface = gr.Interface(
+    fn=generate_video_from_script,
+    inputs=[
+        gr.Textbox(label="Script (max 1 minute video)", lines=5),
+        gr.Slider(label="Duration per Segment (seconds)", minimum=1, maximum=10, step=1, value=5)
+    ],
+    outputs=gr.Video(label="Generated Video from Script"),
+    title="Story Visualizer",
+    description="Enter a short story script, and this will generate a video visualizing it using multiple images and animations."
+)
+demo = gr.TabbedInterface([iface, video_iface], ["Generate and Animate", "Story to Video"])
+
 if __name__ == "__main__":
-
+    demo.launch(share=True, debug=True)
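For reference, a minimal sketch of how the script-to-video path added in this commit could be smoke-tested outside Gradio. It assumes the Space's code is importable as app, that its dependencies (gradio, dlib, transformers, imageio-ffmpeg, requests) are installed, and that an HF_TOKEN with Inference API access is exported; the sample script text and the story.mp4 output name are illustrative, not part of the commit.

# Hypothetical local smoke test for generate_video_from_script (added in this commit).
# Assumes this file sits next to app.py and HF_TOKEN is set in the environment.
import os
import shutil

from app import generate_video_from_script  # module name assumed from the Space layout

if __name__ == "__main__":
    script = (
        "A lighthouse keeper watches a storm roll in. "
        "By morning the sea is calm and gulls circle the tower."
    )
    # Segments the script with the LLM, generates one Flux image per segment,
    # animates each segment, and writes the concatenated frames to a temporary .mp4.
    tmp_path = generate_video_from_script(script, duration_per_segment=5)
    shutil.copy(tmp_path, "story.mp4")
    print("Wrote", os.path.abspath("story.mp4"))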