openfree committed on
Commit
cb2582c
·
verified ·
1 Parent(s): a040b19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -82
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  from gradio_toggle import Toggle
3
  import torch
4
  from huggingface_hub import snapshot_download
 
5
 
6
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
7
  from xora.models.transformers.transformer3d import Transformer3DModel
@@ -20,11 +21,33 @@ import tempfile
20
  import os
21
  import gc
22
  from openai import OpenAI
 
23
 
24
  # Load Hugging Face token if needed
25
  hf_token = os.getenv("HF_TOKEN")
26
  openai_api_key = os.getenv("OPENAI_API_KEY")
27
  client = OpenAI(api_key=openai_api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
29
  system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
30
  with open(system_prompt_t2v_path, "r") as f:
@@ -47,7 +70,6 @@ scheduler_dir = Path(model_path) / "scheduler"
47
 
48
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
 
50
-
51
  def load_vae(vae_dir):
52
  vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
53
  vae_config_path = vae_dir / "config.json"
@@ -58,7 +80,6 @@ def load_vae(vae_dir):
58
  vae.load_state_dict(vae_state_dict)
59
  return vae.to(device=device, dtype=torch.bfloat16)
60
 
61
-
62
  def load_unet(unet_dir):
63
  unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
64
  unet_config_path = unet_dir / "config.json"
@@ -68,13 +89,11 @@ def load_unet(unet_dir):
68
  transformer.load_state_dict(unet_state_dict, strict=True)
69
  return transformer.to(device=device, dtype=torch.bfloat16)
70
 
71
-
72
  def load_scheduler(scheduler_dir):
73
  scheduler_config_path = scheduler_dir / "scheduler_config.json"
74
  scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
75
  return RectifiedFlowScheduler.from_config(scheduler_config)
76
 
77
-
78
  # Helper function for image processing
79
  def center_crop_and_resize(frame, target_height, target_width):
80
  h, w, _ = frame.shape
@@ -91,7 +110,6 @@ def center_crop_and_resize(frame, target_height, target_width):
91
  frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
92
  return frame_resized
93
 
94
-
95
  def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
96
  image = Image.open(image_path).convert("RGB")
97
  image_np = np.array(image)
@@ -100,7 +118,6 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
100
  frame_tensor = (frame_tensor / 127.5) - 1.0
101
  return frame_tensor.unsqueeze(0).unsqueeze(2)
102
 
103
-
104
  def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
105
  if not enhance_toggle:
106
  print("Enhance toggle is off, Prompt: ", prompt)
@@ -114,7 +131,7 @@ def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
114
 
115
  try:
116
  response = client.chat.completions.create(
117
- model="gpt-4o-mini",
118
  messages=messages,
119
  max_tokens=200,
120
  )
@@ -124,7 +141,6 @@ def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
124
  print(f"Error: {e}")
125
  return prompt
126
 
127
-
128
  # Preset options for resolution and frame configuration
129
  preset_options = [
130
  {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
@@ -156,8 +172,6 @@ preset_options = [
156
  {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
157
  ]
158
 
159
-
160
- # Function to toggle visibility of sliders based on preset selection
161
  def preset_changed(preset):
162
  if preset != "Custom":
163
  selected = next(item for item in preset_options if item["label"] == preset)
@@ -179,7 +193,6 @@ def preset_changed(preset):
179
  gr.update(visible=True),
180
  )
181
 
182
-
183
  # Load models
184
  vae = load_vae(vae_dir)
185
  unet = load_unet(unet_dir)
@@ -201,7 +214,6 @@ pipeline = XoraVideoPipeline(
201
  vae=vae,
202
  ).to(device)
203
 
204
-
205
  def generate_video_from_text(
206
  prompt="",
207
  enhance_prompt_toggle=False,
@@ -217,11 +229,16 @@ def generate_video_from_text(
217
  ):
218
  if len(prompt.strip()) < 50:
219
  raise gr.Error(
220
- "Prompt must be at least 50 characters long. Please provide more details for the best results.",
221
  duration=5,
222
  )
223
 
224
- prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
 
 
 
 
 
225
 
226
  sample = {
227
  "prompt": prompt,
@@ -257,7 +274,7 @@ def generate_video_from_text(
257
  ).images
258
  except Exception as e:
259
  raise gr.Error(
260
- f"An error occurred while generating the video. Please try again. Error: {e}",
261
  duration=5,
262
  )
263
  finally:
@@ -275,13 +292,13 @@ def generate_video_from_text(
275
  for frame in video_np[..., ::-1]:
276
  out.write(frame)
277
  out.release()
278
- # Explicitly delete tensors and clear cache
279
  del images
280
  del video_np
281
  torch.cuda.empty_cache()
282
  return output_path
283
 
284
 
 
285
  def generate_video_from_image(
286
  image_path,
287
  prompt="",
@@ -296,25 +313,29 @@ def generate_video_from_image(
296
  num_frames=121,
297
  progress=gr.Progress(),
298
  ):
299
-
300
  print("Height: ", height)
301
  print("Width: ", width)
302
  print("Num Frames: ", num_frames)
303
 
304
  if len(prompt.strip()) < 50:
305
  raise gr.Error(
306
- "Prompt must be at least 50 characters long. Please provide more details for the best results.",
307
  duration=5,
308
  )
309
 
310
  if not image_path:
311
- raise gr.Error("Please provide an input image.", duration=5)
 
 
 
 
312
 
313
  media_items = (
314
  load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
315
  )
316
 
317
- prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
 
318
 
319
  sample = {
320
  "prompt": prompt,
@@ -361,7 +382,7 @@ def generate_video_from_image(
361
  out.release()
362
  except Exception as e:
363
  raise gr.Error(
364
- f"An error occurred while generating the video. Please try again. Error: {e}",
365
  duration=5,
366
  )
367
 
@@ -371,7 +392,6 @@ def generate_video_from_image(
371
 
372
  return output_path
373
 
374
-
375
  def create_advanced_options():
376
  with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
377
  seed = gr.Slider(
@@ -418,8 +438,7 @@ def create_advanced_options():
418
  num_frames_slider,
419
  ]
420
 
421
-
422
- # Define the Gradio interface with tabs
423
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
424
  with gr.Row(elem_id="title-row"):
425
  gr.Markdown(
@@ -430,7 +449,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
430
  """
431
  )
432
  with gr.Row(elem_id="title-row"):
433
- gr.HTML( # add technical report link
434
  """
435
  <div style="display:flex;column-gap:4px;">
436
  <a href="https://github.com/Lightricks/LTX-Video">
@@ -456,62 +475,63 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
456
  ):
457
  gr.Markdown(
458
  """
459
- ๐Ÿ“ Prompt Engineering
460
 
461
- When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.
462
- For best results, build your prompts using this structure:
463
 
464
- - Start with main action in a single sentence
465
- - Add specific details about movements and gestures
466
- - Describe character/object appearances precisely
467
- - Include background and environment details
468
- - Specify camera angles and movements
469
- - Describe lighting and colors
470
- - Note any changes or sudden events
471
 
472
- See examples for more inspiration.
 
 
 
 
 
 
473
 
474
- ๐ŸŽฎ Parameter Guide
475
 
476
- - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
477
- - Seed: Save seed values to recreate specific styles or compositions you like
478
- - Guidance Scale: 3-3.5 are the recommended values
479
- - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
 
 
480
  """
481
  )
482
 
483
  with gr.Tabs():
484
  # Text to Video Tab
485
- with gr.TabItem("Text to Video"):
486
  with gr.Row():
487
  with gr.Column():
488
  txt2vid_prompt = gr.Textbox(
489
- label="Step 1: Enter Your Prompt",
490
- placeholder="Describe the video you want to generate (minimum 50 characters)...",
491
- value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
492
  lines=5,
493
  )
494
  txt2vid_enhance_toggle = Toggle(
495
- label="Enhance Prompt",
496
  value=False,
497
  interactive=True,
498
  )
499
 
500
  txt2vid_negative_prompt = gr.Textbox(
501
- label="Step 2: Enter Negative Prompt",
502
- placeholder="Describe what you don't want in the video...",
503
- value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
504
  lines=2,
505
  )
506
 
507
  txt2vid_preset = gr.Dropdown(
508
  choices=[p["label"] for p in preset_options],
509
  value="768x512, 97 frames",
510
- label="Step 3.1: Choose Resolution Preset",
511
  )
512
 
513
  txt2vid_frame_rate = gr.Slider(
514
- label="Step 3.2: Frame Rate",
515
  minimum=21,
516
  maximum=30,
517
  step=1,
@@ -520,72 +540,72 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
520
 
521
  txt2vid_advanced = create_advanced_options()
522
  txt2vid_generate = gr.Button(
523
- "Step 5: Generate Video",
524
  variant="primary",
525
  size="lg",
526
  )
527
 
528
  with gr.Column():
529
- txt2vid_output = gr.Video(label="Generated Output")
530
 
531
  with gr.Row():
532
  gr.Examples(
533
  examples=[
534
  [
535
- "A young woman in a traditional Mongolian dress is peeking through a sheer white curtain, her face showing a mix of curiosity and apprehension. The woman has long black hair styled in two braids, adorned with white beads, and her eyes are wide with a hint of surprise. Her dress is a vibrant blue with intricate gold embroidery, and she wears a matching headband with a similar design. The background is a simple white curtain, which creates a sense of mystery and intrigue.ith long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hairโ€™s face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage",
536
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
537
  "assets/t2v_2.mp4",
538
  ],
539
  [
540
- "A young man with blond hair wearing a yellow jacket stands in a forest and looks around. He has light skin and his hair is styled with a middle part. He looks to the left and then to the right, his gaze lingering in each direction. The camera angle is low, looking up at the man, and remains stationary throughout the video. The background is slightly out of focus, with green trees and the sun shining brightly behind the man. The lighting is natural and warm, with the sun creating a lens flare that moves across the manโ€™s face. The scene is captured in real-life footage.",
541
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
542
  "assets/t2v_1.mp4",
543
  ],
544
  [
545
- "A cyclist races along a winding mountain road. Clad in aerodynamic gear, he pedals intensely, sweat glistening on his brow. The camera alternates between close-ups of his determined expression and wide shots of the breathtaking landscape. Pine trees blur past, and the sky is a crisp blue. The scene is invigorating and competitive.",
546
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
547
  "assets/t2v_0.mp4",
548
  ],
549
  ],
550
  inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
551
- label="Example Text-to-Video Generations",
552
  )
553
 
554
  # Image to Video Tab
555
- with gr.TabItem("Image to Video"):
556
  with gr.Row():
557
  with gr.Column():
558
  img2vid_image = gr.Image(
559
  type="filepath",
560
- label="Step 1: Upload Input Image",
561
  elem_id="image_upload",
562
  )
563
  img2vid_prompt = gr.Textbox(
564
- label="Step 2: Enter Your Prompt",
565
- placeholder="Describe how you want to animate the image (minimum 50 characters)...",
566
- value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
567
  lines=5,
568
  )
569
  img2vid_enhance_toggle = Toggle(
570
- label="Enhance Prompt",
571
  value=False,
572
  interactive=True,
573
  )
574
  img2vid_negative_prompt = gr.Textbox(
575
- label="Step 3: Enter Negative Prompt",
576
- placeholder="Describe what you don't want in the video...",
577
- value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
578
  lines=2,
579
  )
580
 
581
  img2vid_preset = gr.Dropdown(
582
  choices=[p["label"] for p in preset_options],
583
  value="768x512, 97 frames",
584
- label="Step 3.1: Choose Resolution Preset",
585
  )
586
 
587
  img2vid_frame_rate = gr.Slider(
588
- label="Step 3.2: Frame Rate",
589
  minimum=21,
590
  maximum=30,
591
  step=1,
@@ -594,31 +614,31 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
594
 
595
  img2vid_advanced = create_advanced_options()
596
  img2vid_generate = gr.Button(
597
- "Step 6: Generate Video", variant="primary", size="lg"
598
  )
599
 
600
  with gr.Column():
601
- img2vid_output = gr.Video(label="Generated Output")
602
 
603
  with gr.Row():
604
  gr.Examples(
605
  examples=[
606
  [
607
  "assets/i2v_i2.png",
608
- "A woman stirs a pot of boiling water on a white electric burner. Her hands, with purple nail polish, hold a wooden spoon and move it in a circular motion within a white pot filled with bubbling water. The pot sits on a white electric burner with black buttons and a digital display. The burner is positioned on a white countertop with a red and white checkered cloth partially visible in the bottom right corner. The camera angle is a direct overhead shot, remaining stationary throughout the scene. The lighting is bright and even, illuminating the scene with a neutral white light. The scene is real-life footage.",
609
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
610
  "assets/i2v_2.mp4",
611
  ],
612
  [
613
  "assets/i2v_i0.png",
614
- "A woman in a long, flowing dress stands in a field, her back to the camera, gazing towards the horizon; her hair is long and light, cascading down her back; she stands beneath the sprawling branches of a large oak tree; to her left, a classic American car is parked on the dry grass; in the distance, a wrecked car lies on its side; the sky above is a dramatic canvas of bright white clouds against a darker sky; the entire image is in black and white, emphasizing the contrast of light and shadow. The woman is walking slowly towards the car.",
615
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
616
  "assets/i2v_0.mp4",
617
  ],
618
  [
619
  "assets/i2v_i1.png",
620
- "A pair of hands shapes a piece of clay on a pottery wheel, gradually forming a cone shape. The hands, belonging to a person out of frame, are covered in clay and gently press a ball of clay onto the center of a spinning pottery wheel. The hands move in a circular motion, gradually forming a cone shape at the top of the clay. The camera is positioned directly above the pottery wheel, providing a birdโ€™s-eye view of the clay being shaped. The lighting is bright and even, illuminating the clay and the hands working on it. The scene is captured in real-life footage.",
621
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
622
  "assets/i2v_1.mp4",
623
  ],
624
  ],
@@ -628,10 +648,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
628
  img2vid_negative_prompt,
629
  img2vid_output,
630
  ],
631
- label="Example Image-to-Video Generations",
632
  )
633
 
634
- # [Previous event handlers remain the same]
635
  txt2vid_preset.change(
636
  fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
637
  )
@@ -674,4 +694,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
674
  if __name__ == "__main__":
675
  iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
676
  share=True, show_api=False
677
- )
 
2
  from gradio_toggle import Toggle
3
  import torch
4
  from huggingface_hub import snapshot_download
5
+ from transformers import pipeline
6
 
7
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
8
  from xora.models.transformers.transformer3d import Transformer3DModel
 
21
  import os
22
  import gc
23
  from openai import OpenAI
24
+ import re
25
 
26
  # Load Hugging Face token if needed
27
  hf_token = os.getenv("HF_TOKEN")
28
  openai_api_key = os.getenv("OPENAI_API_KEY")
29
  client = OpenAI(api_key=openai_api_key)
30
+
31
+ # Initialize translation pipeline
32
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
33
+
34
+ # Korean text detection function
35
# Hangul compatibility jamo (consonants U+3131-U+314E, vowels U+314F-U+3163)
# plus precomposed Hangul syllables (U+AC00-U+D7A3). The ranges are spelled
# with \u escapes so the pattern stays correct even if this file is saved or
# served with the wrong text encoding. Compiled once at import time.
_KOREAN_PATTERN = re.compile("[\u3131-\u314e\u314f-\u3163\uac00-\ud7a3]")


def contains_korean(text):
    """Return True if *text* contains at least one Hangul character.

    Args:
        text: The string to inspect. ``None`` and the empty string are
            treated as containing no Korean.

    Returns:
        bool: True when any compatibility jamo or precomposed Hangul
        syllable occurs anywhere in *text*, otherwise False.
    """
    if not text:
        return False
    return bool(_KOREAN_PATTERN.search(text))
38
+
39
def translate_korean_prompt(prompt):
    """Return an English version of *prompt* when it contains Korean text.

    Prompts for which ``contains_korean`` is false are returned unchanged.
    Otherwise the whole prompt is passed to the module-level ``translator``
    pipeline, both the original and the translated text are printed, and
    the translation is returned.
    """
    # Guard clause: nothing to translate, hand the prompt back untouched.
    if not contains_korean(prompt):
        return prompt

    results = translator(prompt)
    english = results[0]['translation_text']
    print(f"Original Korean prompt: {prompt}")
    print(f"Translated English prompt: {english}")
    return english
49
+
50
+ # Load system prompts
51
  system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
52
  system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
53
  with open(system_prompt_t2v_path, "r") as f:
 
70
 
71
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
72
 
 
73
  def load_vae(vae_dir):
74
  vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
75
  vae_config_path = vae_dir / "config.json"
 
80
  vae.load_state_dict(vae_state_dict)
81
  return vae.to(device=device, dtype=torch.bfloat16)
82
 
 
83
  def load_unet(unet_dir):
84
  unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
85
  unet_config_path = unet_dir / "config.json"
 
89
  transformer.load_state_dict(unet_state_dict, strict=True)
90
  return transformer.to(device=device, dtype=torch.bfloat16)
91
 
 
92
  def load_scheduler(scheduler_dir):
93
  scheduler_config_path = scheduler_dir / "scheduler_config.json"
94
  scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
95
  return RectifiedFlowScheduler.from_config(scheduler_config)
96
 
 
97
  # Helper function for image processing
98
  def center_crop_and_resize(frame, target_height, target_width):
99
  h, w, _ = frame.shape
 
110
  frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
111
  return frame_resized
112
 
 
113
  def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
114
  image = Image.open(image_path).convert("RGB")
115
  image_np = np.array(image)
 
118
  frame_tensor = (frame_tensor / 127.5) - 1.0
119
  return frame_tensor.unsqueeze(0).unsqueeze(2)
120
 
 
121
  def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
122
  if not enhance_toggle:
123
  print("Enhance toggle is off, Prompt: ", prompt)
 
131
 
132
  try:
133
  response = client.chat.completions.create(
134
+ model="gpt-4-1106-preview",
135
  messages=messages,
136
  max_tokens=200,
137
  )
 
141
  print(f"Error: {e}")
142
  return prompt
143
 
 
144
  # Preset options for resolution and frame configuration
145
  preset_options = [
146
  {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
 
172
  {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
173
  ]
174
 
 
 
175
  def preset_changed(preset):
176
  if preset != "Custom":
177
  selected = next(item for item in preset_options if item["label"] == preset)
 
193
  gr.update(visible=True),
194
  )
195
 
 
196
  # Load models
197
  vae = load_vae(vae_dir)
198
  unet = load_unet(unet_dir)
 
214
  vae=vae,
215
  ).to(device)
216
 
 
217
  def generate_video_from_text(
218
  prompt="",
219
  enhance_prompt_toggle=False,
 
229
  ):
230
  if len(prompt.strip()) < 50:
231
  raise gr.Error(
232
+ "ํ”„๋กฌํ”„ํŠธ๋Š” ์ตœ์†Œ 50์ž ์ด์ƒ์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋” ์ž์„ธํ•œ ์„ค๋ช…์„ ์ œ๊ณตํ•ด์ฃผ์„ธ์š”.",
233
  duration=5,
234
  )
235
 
236
+ # Translate Korean prompts to English
237
+ prompt = translate_korean_prompt(prompt)
238
+ negative_prompt = translate_korean_prompt(negative_prompt)
239
+
240
+ if enhance_prompt_toggle:
241
+ prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
242
 
243
  sample = {
244
  "prompt": prompt,
 
274
  ).images
275
  except Exception as e:
276
  raise gr.Error(
277
+ f"๋น„๋””์˜ค ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”. ์˜ค๋ฅ˜: {e}",
278
  duration=5,
279
  )
280
  finally:
 
292
  for frame in video_np[..., ::-1]:
293
  out.write(frame)
294
  out.release()
 
295
  del images
296
  del video_np
297
  torch.cuda.empty_cache()
298
  return output_path
299
 
300
 
301
+
302
  def generate_video_from_image(
303
  image_path,
304
  prompt="",
 
313
  num_frames=121,
314
  progress=gr.Progress(),
315
  ):
 
316
  print("Height: ", height)
317
  print("Width: ", width)
318
  print("Num Frames: ", num_frames)
319
 
320
  if len(prompt.strip()) < 50:
321
  raise gr.Error(
322
+ "ํ”„๋กฌํ”„ํŠธ๋Š” ์ตœ์†Œ 50์ž ์ด์ƒ์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. ๋” ์ž์„ธํ•œ ์„ค๋ช…์„ ์ œ๊ณตํ•ด์ฃผ์„ธ์š”.",
323
  duration=5,
324
  )
325
 
326
  if not image_path:
327
+ raise gr.Error("์ž…๋ ฅ ์ด๋ฏธ์ง€๋ฅผ ์ œ๊ณตํ•ด์ฃผ์„ธ์š”.", duration=5)
328
+
329
+ # Translate Korean prompts to English
330
+ prompt = translate_korean_prompt(prompt)
331
+ negative_prompt = translate_korean_prompt(negative_prompt)
332
 
333
  media_items = (
334
  load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
335
  )
336
 
337
+ if enhance_prompt_toggle:
338
+ prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
339
 
340
  sample = {
341
  "prompt": prompt,
 
382
  out.release()
383
  except Exception as e:
384
  raise gr.Error(
385
+ f"๋น„๋””์˜ค ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ๋‹ค์‹œ ์‹œ๋„ํ•ด์ฃผ์„ธ์š”. ์˜ค๋ฅ˜: {e}",
386
  duration=5,
387
  )
388
 
 
392
 
393
  return output_path
394
 
 
395
  def create_advanced_options():
396
  with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
397
  seed = gr.Slider(
 
438
  num_frames_slider,
439
  ]
440
 
441
+ # Gradio Interface Definition
 
442
  with gr.Blocks(theme=gr.themes.Soft()) as iface:
443
  with gr.Row(elem_id="title-row"):
444
  gr.Markdown(
 
449
  """
450
  )
451
  with gr.Row(elem_id="title-row"):
452
+ gr.HTML(
453
  """
454
  <div style="display:flex;column-gap:4px;">
455
  <a href="https://github.com/Lightricks/LTX-Video">
 
475
  ):
476
  gr.Markdown(
477
  """
478
+ ๐Ÿ“ ํ”„๋กฌํ”„ํŠธ ์ž‘์„ฑ ํŒ
479
 
480
+ ํ”„๋กฌํ”„ํŠธ ์ž‘์„ฑ ์‹œ ๋™์ž‘๊ณผ ์žฅ๋ฉด์— ๋Œ€ํ•œ ์ƒ์„ธํ•˜๊ณ  ์‹œ๊ฐ„ ์ˆœ์„œ๋Œ€๋กœ ๋œ ์„ค๋ช…์— ์ง‘์ค‘ํ•˜์„ธ์š”. ๊ตฌ์ฒด์ ์ธ ์›€์ง์ž„, ์™ธ๋ชจ, ์นด๋ฉ”๋ผ ๊ฐ๋„, ํ™˜๊ฒฝ ์„ธ๋ถ€ ์‚ฌํ•ญ์„ ํฌํ•จํ•˜๋˜ ํ•˜๋‚˜์˜ ๋ฌธ๋‹จ์œผ๋กœ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ์ž‘์„ฑํ•˜์„ธ์š”. ๋™์ž‘์œผ๋กœ ๋ฐ”๋กœ ์‹œ์ž‘ํ•˜๊ณ , ์„ค๋ช…์€ ๋ฌธ์ž ๊ทธ๋Œ€๋กœ ์ •ํ™•ํ•˜๊ฒŒ ํ•ด์ฃผ์„ธ์š”. ์ดฌ์˜ ๊ฐ๋…์ด ์ดฌ์˜ ๋ชฉ๋ก์„ ์„ค๋ช…ํ•˜๋Š” ๊ฒƒ์ฒ˜๋Ÿผ ์ƒ๊ฐํ•˜์„ธ์š”. 200๋‹จ์–ด ์ด๋‚ด๋กœ ์ž‘์„ฑํ•˜์„ธ์š”.
 
481
 
482
+ ํ”„๋กฌํ”„ํŠธ๋Š” ๋‹ค์Œ ๊ตฌ์กฐ๋กœ ์ž‘์„ฑํ•˜๋ฉด ์ข‹์Šต๋‹ˆ๋‹ค:
 
 
 
 
 
 
483
 
484
+ - ์ฃผ์š” ๋™์ž‘์„ ํ•œ ๋ฌธ์žฅ์œผ๋กœ ์‹œ์ž‘
485
+ - ๊ตฌ์ฒด์ ์ธ ๋™์ž‘๊ณผ ์ œ์Šค์ฒ˜ ์ถ”๊ฐ€
486
+ - ์บ๋ฆญํ„ฐ/๊ฐ์ฒด์˜ ์™ธ๋ชจ๋ฅผ ์ •ํ™•ํžˆ ์„ค๋ช…
487
+ - ๏ฟฝ๏ฟฝ๊ฒฝ๊ณผ ํ™˜๊ฒฝ ์„ธ๋ถ€ ์‚ฌํ•ญ ํฌํ•จ
488
+ - ์นด๋ฉ”๋ผ ๊ฐ๋„์™€ ์›€์ง์ž„ ์ง€์ •
489
+ - ์กฐ๋ช…๊ณผ ์ƒ‰์ƒ ์„ค๋ช…
490
+ - ๋ณ€ํ™”๋‚˜ ๊ฐ‘์ž‘์Šค๋Ÿฌ์šด ์‚ฌ๊ฑด ๊ธฐ๋ก
491
 
492
+ ์˜ˆ์‹œ๋ฅผ ์ฐธ๊ณ ํ•˜์„ธ์š”.
493
 
494
+ ๐ŸŽฎ ๋งค๊ฐœ๋ณ€์ˆ˜ ๊ฐ€์ด๋“œ
495
+
496
+ - ํ•ด์ƒ๋„ ํ”„๋ฆฌ์…‹: ์ƒ์„ธํ•œ ์žฅ๋ฉด์€ ๋†’์€ ํ•ด์ƒ๋„, ๋‹จ์ˆœํ•œ ์žฅ๋ฉด์€ ๋‚ฎ์€ ํ•ด์ƒ๋„ ์„ ํƒ
497
+ - Seed: ํŠน์ • ์Šคํƒ€์ผ์ด๋‚˜ ๊ตฌ์„ฑ์„ ์žฌํ˜„ํ•˜๊ณ  ์‹ถ์„ ๋•Œ seed ๊ฐ’ ์ €์žฅ
498
+ - Guidance Scale: 3-3.5๊ฐ€ ๊ถŒ์žฅ๊ฐ’
499
+ - Inference Steps: ํ’ˆ์งˆ์„ ์œ„ํ•ด์„œ๋Š” 40+ ๋‹จ๊ณ„, ์†๋„๋ฅผ ์œ„ํ•ด์„œ๋Š” 20-30 ๋‹จ๊ณ„
500
  """
501
  )
502
 
503
  with gr.Tabs():
504
  # Text to Video Tab
505
+ with gr.TabItem("ํ…์ŠคํŠธ๋กœ ๋น„๋””์˜ค ๋งŒ๋“ค๊ธฐ"):
506
  with gr.Row():
507
  with gr.Column():
508
  txt2vid_prompt = gr.Textbox(
509
+ label="Step 1: ํ”„๋กฌํ”„ํŠธ ์ž…๋ ฅ",
510
+ placeholder="์ƒ์„ฑํ•˜๊ณ  ์‹ถ์€ ๋น„๋””์˜ค๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š” (์ตœ์†Œ 50์ž)...",
511
+ value="๊ฐˆ์ƒ‰ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ€์ง„ ์—ฌ์„ฑ์ด ๊ธˆ๋ฐœ์˜ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ€์ง„ ๋‹ค๋ฅธ ์—ฌ์„ฑ์„ ํ–ฅํ•ด ๋ฏธ์†Œ์ง“์Šต๋‹ˆ๋‹ค. ๊ฐˆ์ƒ‰ ๋จธ๋ฆฌ์˜ ์—ฌ์„ฑ์€ ๊ฒ€์€์ƒ‰ ์ž์ผ“์„ ์ž…๊ณ  ์žˆ์œผ๋ฉฐ ์˜ค๋ฅธ์ชฝ ๋บจ์— ์ž‘์€ ์ ์ด ์žˆ์Šต๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ ๊ฐ๋„๋Š” ๊ฐˆ์ƒ‰ ๋จธ๋ฆฌ ์—ฌ์„ฑ์˜ ์–ผ๊ตด์— ํด๋กœ์ฆˆ์—…๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ์กฐ๋ช…์€ ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ๋”ฐ๋œปํ•˜๋ฉฐ, ์„์–‘์—์„œ ์˜ค๋Š” ๋“ฏํ•œ ๋ถ€๋“œ๋Ÿฌ์šด ๋น›์ด ์žฅ๋ฉด์„ ๋น„์ถฅ๋‹ˆ๋‹ค. ์žฅ๋ฉด์€ ์‹ค์ œ ์˜์ƒ์ฒ˜๋Ÿผ ๋ณด์ž…๋‹ˆ๋‹ค.",
512
  lines=5,
513
  )
514
  txt2vid_enhance_toggle = Toggle(
515
+ label="ํ”„๋กฌํ”„ํŠธ ๊ฐœ์„ ",
516
  value=False,
517
  interactive=True,
518
  )
519
 
520
  txt2vid_negative_prompt = gr.Textbox(
521
+ label="Step 2: ๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋กฌํ”„ํŠธ ์ž…๋ ฅ",
522
+ placeholder="๋น„๋””์˜ค์—์„œ ์›ํ•˜์ง€ ์•Š๋Š” ์š”์†Œ๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”...",
523
+ value="๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
524
  lines=2,
525
  )
526
 
527
  txt2vid_preset = gr.Dropdown(
528
  choices=[p["label"] for p in preset_options],
529
  value="768x512, 97 frames",
530
+ label="Step 3.1: ํ•ด์ƒ๋„ ํ”„๋ฆฌ์…‹ ์„ ํƒ",
531
  )
532
 
533
  txt2vid_frame_rate = gr.Slider(
534
+ label="Step 3.2: ํ”„๋ ˆ์ž„ ๋ ˆ์ดํŠธ",
535
  minimum=21,
536
  maximum=30,
537
  step=1,
 
540
 
541
  txt2vid_advanced = create_advanced_options()
542
  txt2vid_generate = gr.Button(
543
+ "Step 5: ๋น„๋””์˜ค ์ƒ์„ฑ",
544
  variant="primary",
545
  size="lg",
546
  )
547
 
548
  with gr.Column():
549
+ txt2vid_output = gr.Video(label="์ƒ์„ฑ๋œ ๋น„๋””์˜ค")
550
 
551
  with gr.Row():
552
  gr.Examples(
553
  examples=[
554
  [
555
+ "์ „ํ†ต์ ์ธ ๋ชฝ๊ณจ ๋“œ๋ ˆ์Šค๋ฅผ ์ž…์€ ์ Š์€ ์—ฌ์„ฑ์ด ์–‡์€ ํฐ์ƒ‰ ์ปคํŠผ์„ ํ†ตํ•ด ํ˜ธ๊ธฐ์‹ฌ๊ณผ ๊ธด์žฅ์ด ์„ž์ธ ํ‘œ์ •์œผ๋กœ ๋“ค์—ฌ๋‹ค๋ณด๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ์—ฌ์„ฑ์€ ํฐ ๊ตฌ์Šฌ๋กœ ์žฅ์‹๋œ ๋‘ ๊ฐœ์˜ ๋•‹์€ ๋จธ๋ฆฌ๋กœ ์Šคํƒ€์ผ๋ง๋œ ๊ธด ๊ฒ€์€ ๋จธ๋ฆฌ๋ฅผ ํ•˜๊ณ  ์žˆ์œผ๋ฉฐ, ๋ˆˆ์€ ๋†€๋žŒ์„ ๋„๋ฉฐ ํฌ๊ฒŒ ๋– ์ ธ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋…€์˜ ๋“œ๋ ˆ์Šค๋Š” ํ™”๋ คํ•œ ๊ธˆ์ƒ‰ ์ž์ˆ˜๊ฐ€ ์ƒˆ๊ฒจ์ง„ ์„ ๋ช…ํ•œ ํŒŒ๋ž€์ƒ‰์ด๋ฉฐ, ๋น„์Šทํ•œ ๋””์ž์ธ์˜ ๋จธ๋ฆฌ๋ ๋ฅผ ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ๋ฐฐ๊ฒฝ์€ ์‹ ๋น„๋กœ์›€๊ณผ ํ˜ธ๊ธฐ์‹ฌ์„ ์ž์•„๋‚ด๋Š” ๋‹จ์ˆœํ•œ ํฐ์ƒ‰ ์ปคํŠผ์ž…๋‹ˆ๋‹ค.",
556
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
557
  "assets/t2v_2.mp4",
558
  ],
559
  [
560
+ "๋…ธ๋ž€์ƒ‰ ์žฌํ‚ท์„ ์ž…์€ ๊ธˆ๋ฐœ ๋จธ๋ฆฌ์˜ ์ Š์€ ๋‚จ์ž๊ฐ€ ์ˆฒ์— ์„œ์„œ ์ฃผ์œ„๋ฅผ ๋‘˜๋Ÿฌ๋ด…๋‹ˆ๋‹ค. ๊ทธ๋Š” ๋ฐ์€ ํ”ผ๋ถ€๋ฅผ ๊ฐ€์กŒ๊ณ  ๋จธ๋ฆฌ๋Š” ๊ฐ€์šด๋ฐ ๊ฐ€๋ฅด๋งˆ๋กœ ์Šคํƒ€์ผ๋ง๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋Š” ์™ผ์ชฝ์„ ๋ณด๊ณ  ๋‚œ ํ›„ ์˜ค๋ฅธ์ชฝ์„ ๋ณด๋ฉฐ, ๊ฐ ๋ฐฉํ–ฅ์„ ์ž ์‹œ ์‘์‹œํ•ฉ๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ๋Š” ๋‚ฎ์€ ๊ฐ๋„์—์„œ ๋‚จ์ž๋ฅผ ์˜ฌ๋ ค๋‹ค๋ณด๋ฉฐ ๊ณ ์ •๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ๋ฐฐ๊ฒฝ์€ ์•ฝ๊ฐ„ ํ๋ฆฟํ•˜๋ฉฐ, ๋…น์ƒ‰ ๋‚˜๋ฌด๋“ค๊ณผ ๋‚จ์ž์˜ ๋’ค์—์„œ ๋ฐ๊ฒŒ ๋น„์น˜๋Š” ํƒœ์–‘์ด ๋ณด์ž…๋‹ˆ๋‹ค. ์กฐ๋ช…์€ ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ๋”ฐ๋œปํ•˜๋ฉฐ, ํƒœ์–‘ ๋น›์ด ๋‚จ์ž์˜ ์–ผ๊ตด์„ ๊ฐ€๋กœ์ง€๋ฅด๋Š” ๋ Œ์ฆˆ ํ”Œ๋ ˆ์–ด๋ฅผ ๋งŒ๋“ญ๋‹ˆ๋‹ค. ์žฅ๋ฉด์€ ์‹ค์ œ ์˜์ƒ์ฒ˜๋Ÿผ ์ดฌ์˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
561
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
562
  "assets/t2v_1.mp4",
563
  ],
564
  [
565
+ "ํ•œ ์‚ฌ์ดํด๋ฆฌ์ŠคํŠธ๊ฐ€ ๊ตฝ์ด์ง„ ์‚ฐ๊ธธ์„ ๋”ฐ๋ผ ๋‹ฌ๋ฆฝ๋‹ˆ๋‹ค. ๊ณต๊ธฐ์—ญํ•™์ ์ธ ์žฅ๋น„๋ฅผ ์ž…์€ ๊ทธ๋Š” ๊ฐ•ํ•˜๊ฒŒ ํŽ˜๋‹ฌ์„ ๋ฐŸ๊ณ  ์žˆ์œผ๋ฉฐ, ์ด๋งˆ์—๋Š” ๋•€๋ฐฉ์šธ์ด ๋ฐ˜์ง์ž…๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ๋Š” ๊ทธ์˜ ๊ฒฐ์—ฐํ•œ ํ‘œ์ •๊ณผ ์ˆจ ๋ง‰ํžˆ๋Š” ํ’๊ฒฝ์„ ๋ฒˆ๊ฐˆ์•„๊ฐ€๋ฉฐ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค. ์†Œ๋‚˜๋ฌด๋“ค์ด ์Šค์ณ ์ง€๋‚˜๊ฐ€๊ณ , ํ•˜๋Š˜์€ ์„ ๋ช…ํ•œ ํŒŒ๋ž€์ƒ‰์ž…๋‹ˆ๋‹ค. ์ด ์žฅ๋ฉด์€ ํ™œ๊ธฐ์ฐจ๊ณ  ๊ฒฝ์Ÿ์ ์ธ ๋ถ„์œ„๊ธฐ๋ฅผ ์ž์•„๋ƒ…๋‹ˆ๋‹ค.",
566
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
567
  "assets/t2v_0.mp4",
568
  ],
569
  ],
570
  inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
571
+ label="ํ…์ŠคํŠธ-๋น„๋””์˜ค ์ƒ์„ฑ ์˜ˆ์‹œ",
572
  )
573
 
574
  # Image to Video Tab
575
+ with gr.TabItem("์ด๋ฏธ์ง€๋กœ ๋น„๋””์˜ค ๋งŒ๋“ค๊ธฐ"):
576
  with gr.Row():
577
  with gr.Column():
578
  img2vid_image = gr.Image(
579
  type="filepath",
580
+ label="Step 1: ์ž…๋ ฅ ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ",
581
  elem_id="image_upload",
582
  )
583
  img2vid_prompt = gr.Textbox(
584
+ label="Step 2: ํ”„๋กฌํ”„ํŠธ ์ž…๋ ฅ",
585
+ placeholder="์ด๋ฏธ์ง€๋ฅผ ์–ด๋–ป๊ฒŒ ์• ๋‹ˆ๋ฉ”์ด์…˜ํ™”ํ• ์ง€ ์„ค๋ช…ํ•˜์„ธ์š” (์ตœ์†Œ 50์ž)...",
586
+ value="๊ฐˆ์ƒ‰ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ€์ง„ ์—ฌ์„ฑ์ด ๊ธˆ๋ฐœ์˜ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ€์ง„ ๋‹ค๋ฅธ ์—ฌ์„ฑ์„ ํ–ฅํ•ด ๋ฏธ์†Œ์ง“์Šต๋‹ˆ๋‹ค. ๊ฐˆ์ƒ‰ ๋จธ๋ฆฌ์˜ ์—ฌ์„ฑ์€ ๊ฒ€์€์ƒ‰ ์ž์ผ“์„ ์ž…๊ณ  ์žˆ์œผ๋ฉฐ ์˜ค๋ฅธ์ชฝ ๋บจ์— ์ž‘์€ ์ ์ด ์žˆ์Šต๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ ๊ฐ๋„๋Š” ๊ฐˆ์ƒ‰ ๋จธ๋ฆฌ ์—ฌ์„ฑ์˜ ์–ผ๊ตด์— ํด๋กœ์ฆˆ์—…๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ์กฐ๋ช…์€ ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ๋”ฐ๋œปํ•˜๋ฉฐ, ์„์–‘์—์„œ ์˜ค๋Š” ๋“ฏํ•œ ๋ถ€๋“œ๋Ÿฌ์šด ๋น›์ด ์žฅ๋ฉด์„ ๋น„์ถฅ๋‹ˆ๋‹ค. ์žฅ๋ฉด์€ ์‹ค์ œ ์˜์ƒ์ฒ˜๋Ÿผ ๋ณด์ž…๋‹ˆ๋‹ค.",
587
  lines=5,
588
  )
589
  img2vid_enhance_toggle = Toggle(
590
+ label="ํ”„๋กฌํ”„ํŠธ ๊ฐœ์„ ",
591
  value=False,
592
  interactive=True,
593
  )
594
  img2vid_negative_prompt = gr.Textbox(
595
+ label="Step 3: ๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋กฌํ”„ํŠธ ์ž…๋ ฅ",
596
+ placeholder="๋น„๋””์˜ค์—์„œ ์›ํ•˜์ง€ ์•Š๋Š” ์š”์†Œ๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”...",
597
+ value="๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
598
  lines=2,
599
  )
600
 
601
  img2vid_preset = gr.Dropdown(
602
  choices=[p["label"] for p in preset_options],
603
  value="768x512, 97 frames",
604
+ label="Step 3.1: ํ•ด์ƒ๋„ ํ”„๋ฆฌ์…‹ ์„ ํƒ",
605
  )
606
 
607
  img2vid_frame_rate = gr.Slider(
608
+ label="Step 3.2: ํ”„๋ ˆ์ž„ ๋ ˆ์ดํŠธ",
609
  minimum=21,
610
  maximum=30,
611
  step=1,
 
614
 
615
  img2vid_advanced = create_advanced_options()
616
  img2vid_generate = gr.Button(
617
+ "Step 6: ๋น„๋””์˜ค ์ƒ์„ฑ", variant="primary", size="lg"
618
  )
619
 
620
  with gr.Column():
621
+ img2vid_output = gr.Video(label="์ƒ์„ฑ๋œ ๋น„๋””์˜ค")
622
 
623
  with gr.Row():
624
  gr.Examples(
625
  examples=[
626
  [
627
  "assets/i2v_i2.png",
628
+ "์—ฌ์„ฑ์ด ํฐ์ƒ‰ ์ „๊ธฐ ๋ฒ„๋„ˆ ์œ„์—์„œ ๋“๋Š” ๋ฌผ์ด ๋‹ด๊ธด ๋ƒ„๋น„๋ฅผ ์ “๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ๋ณด๋ผ์ƒ‰ ๋งค๋‹ˆํ์–ด๋ฅผ ๋ฐ”๋ฅธ ๊ทธ๋…€์˜ ์†์ด ํ•˜์–€ ๋ƒ„๋น„ ์•ˆ์—์„œ ๋‚˜๋ฌด ์ˆŸ๊ฐ€๋ฝ์„ ์›ํ˜•์œผ๋กœ ์›€์ง์ž…๋‹ˆ๋‹ค. ๋ƒ„๋น„๋Š” ๊ฒ€์€์ƒ‰ ๋ฒ„ํŠผ๊ณผ ๋””์ง€ํ„ธ ๋””์Šคํ”Œ๋ ˆ์ด๊ฐ€ ์žˆ๋Š” ํฐ์ƒ‰ ์ „๊ธฐ ๋ฒ„๋„ˆ ์œ„์— ๋†“์—ฌ ์žˆ์Šต๋‹ˆ๋‹ค. ๋ฒ„๋„ˆ๋Š” ์˜ค๋ฅธ์ชฝ ์•„๋ž˜ ๋ชจ์„œ๋ฆฌ์— ๋นจ๊ฐ„์ƒ‰๊ณผ ํฐ์ƒ‰ ์ฒดํฌ๋ฌด๋Šฌ ์ฒœ์ด ๋ถ€๋ถ„์ ์œผ๋กœ ๋ณด์ด๋Š” ํฐ์ƒ‰ ์กฐ๋ฆฌ๋Œ€ ์œ„์— ๋†“์—ฌ ์žˆ์Šต๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ ๊ฐ๋„๋Š” ์ •ํ™•ํžˆ ์œ„์—์„œ ๋‚ด๋ ค๋‹ค๋ณด๋Š” ๊ฐ๋„์ด๋ฉฐ ์žฅ๋ฉด ๋‚ด๋‚ด ๊ณ ์ •๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ์กฐ๋ช…์€ ๋ฐ๊ณ  ๊ณ ๋ฅธ ์ค‘์„ฑ์ ์ธ ํฐ์ƒ‰ ๋น›์œผ๋กœ ์žฅ๋ฉด์„ ๋น„์ถฅ๋‹ˆ๋‹ค. ์žฅ๋ฉด์€ ์‹ค์ œ ์˜์ƒ์ฒ˜๋Ÿผ ๋ณด์ž…๋‹ˆ๋‹ค.",
629
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
630
  "assets/i2v_2.mp4",
631
  ],
632
  [
633
  "assets/i2v_i0.png",
634
+ "๊ธด ํ๋ฅด๋Š” ๋“œ๋ ˆ์Šค๋ฅผ ์ž…์€ ์—ฌ์„ฑ์ด ๋“คํŒ์— ์„œ์„œ ๋“ฑ์„ ์นด๋ฉ”๋ผ๋ฅผ ํ–ฅํ•œ ์ฑ„ ์ง€ํ‰์„ ์„ ๋ฐ”๋ผ๋ณด๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ๊ทธ๋…€์˜ ๋จธ๋ฆฌ์นด๋ฝ์€ ๊ธธ๊ณ  ๋ฐ์œผ๋ฉฐ ๋“ฑ ์•„๋ž˜๋กœ ํ˜๋Ÿฌ๋‚ด๋ฆฝ๋‹ˆ๋‹ค. ๊ทธ๋…€๋Š” ํฐ ์ฐธ๋‚˜๋ฌด์˜ ๋„“๊ฒŒ ํผ์ง„ ๊ฐ€์ง€ ์•„๋ž˜์— ์„œ ์žˆ์Šต๋‹ˆ๋‹ค. ์™ผ์ชฝ์œผ๋กœ๋Š” ๋ง๋ผ๋ถ™์€ ์ž”๋”” ์œ„์— ํด๋ž˜์‹ํ•œ ๋ฏธ๊ตญ ์ž๋™์ฐจ๊ฐ€ ์ฃผ์ฐจ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ๋ฉ€๋ฆฌ์„œ๋Š” ํ•œ ๋Œ€์˜ ๋ถ€์„œ์ง„ ์ž๋™์ฐจ๊ฐ€ ์˜†์œผ๋กœ ๋ˆ„์›Œ ์žˆ์Šต๋‹ˆ๋‹ค. ์œ„์˜ ํ•˜๋Š˜์€ ์–ด๋‘์šด ํ•˜๋Š˜์„ ๋ฐฐ๊ฒฝ์œผ๋กœ ๋ฐ์€ ํฐ ๊ตฌ๋ฆ„์ด ๊ทน์ ์ธ ์บ”๋ฒ„์Šค๋ฅผ ์ด๋ฃจ๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ์ „์ฒด ์ด๋ฏธ์ง€๋Š” ํ‘๋ฐฑ์œผ๋กœ, ๋น›๊ณผ ๊ทธ๋ฆผ์ž์˜ ๋Œ€๋น„๋ฅผ ๊ฐ•์กฐํ•ฉ๋‹ˆ๋‹ค. ์—ฌ์„ฑ์ด ์ฒœ์ฒœํžˆ ์ž๋™์ฐจ๋ฅผ ํ–ฅํ•ด ๊ฑธ์–ด๊ฐ€๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.",
635
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
636
  "assets/i2v_0.mp4",
637
  ],
638
  [
639
  "assets/i2v_i1.png",
640
+ "ํ•œ ์Œ์˜ ์†์ด ๋„์ž๊ธฐ ๋ฌผ๋ ˆ ์œ„์—์„œ ์ ํ†  ์กฐ๊ฐ์„ ๋ชจ์–‘ ์žก์•„ ์ ์ฐจ์ ์œผ๋กœ ์›๋ฟ” ๋ชจ์–‘์„ ๋งŒ๋“ค์–ด๊ฐ€๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ํ”„๋ ˆ์ž„ ๋ฐ–์˜ ์‚ฌ๋žŒ์˜ ์†์ด ์ ํ† ๋กœ ๋ฎ์—ฌ ์žˆ์œผ๋ฉฐ, ํšŒ์ „ํ•˜๋Š” ๋„์ž๊ธฐ ๋ฌผ๋ ˆ ์ค‘์•™์— ์ ํ†  ๋ฉ์–ด๋ฆฌ๋ฅผ ๋ถ€๋“œ๋Ÿฝ๊ฒŒ ๋ˆ„๋ฅด๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค. ์†์€ ์›ํ˜•์œผ๋กœ ์›€์ง์ด๋ฉฐ, ์ ํ†  ์œ„์ชฝ์— ์ ์ฐจ์ ์œผ๋กœ ์›๋ฟ” ๋ชจ์–‘์„ ๋งŒ๋“ค์–ด๊ฐ‘๋‹ˆ๋‹ค. ์นด๋ฉ”๋ผ๋Š” ๋„์ž๊ธฐ ๋ฌผ๋ ˆ ๋ฐ”๋กœ ์œ„์— ์œ„์น˜ํ•˜์—ฌ ์ ํ† ๊ฐ€ ๋ชจ์–‘ ์žกํ˜€๊ฐ€๋Š” ๊ฒƒ์„ ์กฐ๊ฐ๋„๋กœ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค. ์กฐ๋ช…์€ ๋ฐ๊ณ  ๊ณ ๋ฅด๋ฉฐ, ์ ํ† ์™€ ๊ทธ๊ฒƒ์„ ๋‹ค๋ฃจ๋Š” ์†์„ ๋ฐ๊ฒŒ ๋น„์ถฅ๋‹ˆ๋‹ค. ์žฅ๋ฉด์€ ์‹ค์ œ ์˜์ƒ์ฒ˜๋Ÿผ ์ดฌ์˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค.",
641
+ "๋‚ฎ์€ ํ’ˆ์งˆ, ์ตœ์•…์˜ ํ’ˆ์งˆ, ๊ธฐํ˜•, ์™œ๊ณก๋œ, ์ผ๊ทธ๋Ÿฌ์ง„, ๋ชจ์…˜ ์Šค๋ฏธ์–ด, ๋ชจ์…˜ ์•„ํ‹ฐํŒฉํŠธ, ์œตํ•ฉ๋œ ์†๊ฐ€๋ฝ, ์ž˜๋ชป๋œ ํ•ด๋ถ€ํ•™, ์ด์ƒํ•œ ์†, ์ถ”ํ•œ",
642
  "assets/i2v_1.mp4",
643
  ],
644
  ],
 
648
  img2vid_negative_prompt,
649
  img2vid_output,
650
  ],
651
+ label="์ด๋ฏธ์ง€-๋น„๋””์˜ค ์ƒ์„ฑ ์˜ˆ์‹œ",
652
  )
653
 
654
+ # Event handlers
655
  txt2vid_preset.change(
656
  fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
657
  )
 
694
  if __name__ == "__main__":
695
  iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
696
  share=True, show_api=False
697
+ )