Reality123b committed
Commit 699acbf · verified · 1 Parent(s): 56cf9ae

Update app.py

Files changed (1)
  1. app.py +105 -29
app.py CHANGED
@@ -10,6 +10,9 @@ import os
 from transformers import pipeline
 import cv2
 import io
+import json
+import re
+import time

 detector = dlib.get_frontal_face_detector()
 try:
@@ -24,36 +27,58 @@ except RuntimeError:
         f.write(landmarks_data)
 predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

-API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/flux-1-schnell"
+IMAGE_GEN_API = "https://api-inference.huggingface.co/models/black-forest-labs/flux-1-schnell"
 HF_TOKEN = os.getenv("HF_TOKEN")

-LLM_API_URL = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"
+LLM_API = "https://api-inference.huggingface.co/models/lmsys/fastchat-t5-3b-v1.0"

 def query_hf_image_generation(prompt):
     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
     payload = {"inputs": prompt}
-    response = requests.post(API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        image_bytes = response.content
-        image = Image.open(io.BytesIO(image_bytes))
-        return image
-    else:
-        raise Exception(f"Image generation failed: {response.content}")
-
-def query_llm(prompt, image_description):
+    for _ in range(3):
+        try:
+            response = requests.post(IMAGE_GEN_API, headers=headers, json=payload)
+            response.raise_for_status()
+            image_bytes = response.content
+            image = Image.open(io.BytesIO(image_bytes))
+            return image
+        except requests.exceptions.RequestException as e:
+            print(f"Image generation attempt failed: {e}")
+            time.sleep(2)
+    raise Exception("Image generation failed after multiple attempts")
+
+def query_llm(prompt, system_prompt):
     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
-    system_prompt = "You are an expert in image to video creation, and give only the motion type, intensity, text overlay, text color, text start and end times for the image described below based on user's prompt. Give the response in a JSON format."
-    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\nImage Description: {image_description}\nUser Prompt: {prompt}</s>\n<|assistant|>\n"
+    prompt_template = f"<|system|>\n{system_prompt}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"
     payload = {"inputs": prompt_template, "max_new_tokens": 200}
-    response = requests.post(LLM_API_URL, headers=headers, json=payload)
-    if response.status_code == 200:
-        return response.json()[0]['generated_text']
-    else:
-        raise Exception(f"LLM query failed: {response.content}")
+    for _ in range(3):
+        try:
+            response = requests.post(LLM_API, headers=headers, json=payload)
+            response.raise_for_status()
+            return response.json()[0]['generated_text']
+        except requests.exceptions.RequestException as e:
+            print(f"LLM query attempt failed: {e}")
+            time.sleep(2)
+    raise Exception("LLM query failed after multiple attempts")
+
+def segment_script(script):
+    system_prompt = "You are a helpful assistant. Given a script, divide it into segments suitable for generating images, ensuring each segment is less than 500 characters."
+    llm_response = query_llm(script, system_prompt)
+    segments = llm_response.split('\n')
+    segments = [seg.strip() for seg in segments if seg.strip()]
+    return segments
+
+def generate_image_prompts(script_segments):
+    image_prompts = []
+    for segment in script_segments:
+        system_prompt = "You are a helpful assistant. Create a concise image prompt based on the following script segment:"
+        prompt = f"Script Segment: {segment}"
+        image_prompt = query_llm(prompt, system_prompt)
+        image_prompts.append(image_prompt)
+    return image_prompts

 def extract_motion_params(llm_output):
     try:
-        import json
         start_index = llm_output.find('{')
         end_index = llm_output.rfind('}') + 1
         json_string = llm_output[start_index:end_index]
@@ -78,7 +103,7 @@ def detect_face_landmarks(image):
         return shape
     else:
         return None
-
+
 def apply_color_grading(frame, color_preset, intensity):
     if color_preset == "sepia":
         sepia_matrix = np.array([[0.393, 0.769, 0.189],
@@ -123,7 +148,7 @@ def apply_color_grading(frame, color_preset, intensity):
         return (np.clip(frame_float, 0, 1) * 255).astype(np.uint8)

     return frame
-
+
 def apply_vignette(frame, intensity):
     width, height = frame.shape[1], frame.shape[0]
     x = np.linspace(-1, 1, width)
@@ -201,7 +226,7 @@ def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_ove
                 draw.line([tuple(landmarks[42]), tuple(landmarks[45])], fill=text_color, width=2)
             else:
                 frame = image.copy()
-
+
         elif motion_type == "smile":
             mouth_left = landmarks[48]
             mouth_right = landmarks[54]
@@ -314,14 +339,53 @@ def apply_advanced_motion(image, motion_type, intensity, duration, fps, text_ove

     return frames

-def create_video_from_frames(frames, duration=5, fps=30):
+def create_video_from_frames(frames, output_filename, fps=30):
+    writer = imageio_ffmpeg.write_frames(output_filename, frames[0].shape[:2], pix_fmt_out='yuv420p', fps=fps, codec='libx264', preset="veryslow")
+    writer.send(None)
+    for frame in frames:
+        writer.send(frame)
+    writer.close()
+
+def generate_video_from_script(script, duration_per_segment=5):
+    script_segments = segment_script(script)
+    image_prompts = generate_image_prompts(script_segments)
+    all_frames = []
+
+    for i, (segment, image_prompt) in enumerate(zip(script_segments, image_prompts)):
+        print(f"Processing segment {i + 1} of {len(script_segments)}")
+        print(f"  Segment: {segment}")
+        print(f"  Image Prompt: {image_prompt}")
+
+        image = query_hf_image_generation(image_prompt)
+        image_description = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")(image)[0]['generated_text']
+
+        system_prompt = "You are an expert in image to video creation. Provide the motion type, intensity, text overlay, text color, text start and end times, color preset, and vignette intensity for the following image description and user prompt. Give the response in a JSON format."
+        prompt = f"Image Description: {image_description}\nUser Prompt: {segment}"
+        llm_response = query_llm(prompt, system_prompt)
+
+        print(f"  LLM Response: {llm_response}")
+        motion_params = extract_motion_params(llm_response)
+        print(f"  Motion Parameters: {motion_params}")
+
+        frames = apply_advanced_motion(
+            image,
+            motion_params["motion_type"],
+            motion_params["intensity"],
+            duration=duration_per_segment,
+            fps=30,
+            text_overlay=motion_params["text_overlay"],
+            text_color=motion_params["text_color"],
+            font_size=50,
+            start_time=motion_params["start_time"],
+            end_time=motion_params["end_time"],
+            color_preset=motion_params.get("color_preset", None),
+            vignette_intensity=motion_params.get("vignette_intensity", 0)
+        )
+        all_frames.extend(frames)
+
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         output_filename = tmpfile.name
-    writer = imageio_ffmpeg.write_frames(output_filename, frames[0].shape[:2], pix_fmt_out='yuv420p', fps=fps, codec='libx264', preset="veryslow")
-    writer.send(None)
-    for frame in frames:
-        writer.send(frame)
-    writer.close()
+        create_video_from_frames(all_frames, output_filename)
     return output_filename

 def generate_and_animate(prompt):
@@ -369,5 +433,17 @@ iface = gr.Interface(
     description="Enter a prompt to generate an image and animate it. Uses Flux 1, an LLM, and advanced video processing techniques."
 )

+video_iface = gr.Interface(
+    fn=generate_video_from_script,
+    inputs=[
+        gr.Textbox(label="Script (max 1 minute video)", lines=5),
+        gr.Slider(label="Duration per Segment (seconds)", minimum=1, maximum=10, step=1, value=5)
+    ],
+    outputs=gr.Video(label="Generated Video from Script"),
+    title="Story Visualizer",
+    description="Enter a short story script, and this will generate a video visualizing it using multiple images and animations."
+)
+demo = gr.TabbedInterface([iface, video_iface], ["Generate and Animate", "Story to Video"])
+
 if __name__ == "__main__":
-    iface.launch(share=True, debug=True)
+    demo.launch(share=True, debug=True)
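
For reference, a minimal sketch of the motion-parameter round trip implied by the code above: the key names mirror what generate_video_from_script reads out of motion_params, and the brace slicing is the same strategy extract_motion_params uses. The sample LLM reply and the json.loads call are assumptions here, since the rest of extract_motion_params falls outside this diff.

import json

# Hypothetical LLM reply; the JSON keys match what generate_video_from_script expects.
llm_output = (
    'Here are the parameters:\n'
    '{"motion_type": "zoom", "intensity": 0.4, "text_overlay": "Hello",'
    ' "text_color": "white", "start_time": 0, "end_time": 3,'
    ' "color_preset": "sepia", "vignette_intensity": 0.5}'
)

# Same slicing as extract_motion_params: keep the text between the first '{'
# and the last '}' so any chatter around the JSON is ignored, then parse it.
start_index = llm_output.find('{')
end_index = llm_output.rfind('}') + 1
params = json.loads(llm_output[start_index:end_index])  # json.loads is an assumption

print(params["motion_type"], params.get("vignette_intensity", 0))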
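A second small sketch, this time of the imageio_ffmpeg.write_frames generator that create_video_from_frames wraps, mainly to make the size argument explicit: write_frames takes (width, height), while a NumPy frame's .shape is (height, width, channels). The file name and frame dimensions below are made up for illustration.

import numpy as np
import imageio_ffmpeg

# Dummy RGB frames with shape (height, width, 3).
frames = [np.zeros((720, 1280, 3), dtype=np.uint8) for _ in range(30)]
height, width = frames[0].shape[:2]

# write_frames is a generator-based writer: prime it with send(None), then send frames.
writer = imageio_ffmpeg.write_frames(
    "example.mp4",
    (width, height),  # size is (width, height), not frame.shape[:2]
    pix_fmt_out="yuv420p",
    fps=30,
    codec="libx264",
)
writer.send(None)
for frame in frames:
    writer.send(frame)
writer.close()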