openfree committed
Commit 7a7e12d · verified · 1 Parent(s): fdc505a

Update app.py

Files changed (1)
  1. app.py +66 -69
app.py CHANGED
@@ -23,12 +23,9 @@ import gc
 from openai import OpenAI
 import re
 
-
-
 # Load system prompts
 system_prompt_t2v = """You are a prompt expert for video generation.
 Improve the given prompt to fit the following structure:
-
 1. Start with a single clear sentence describing the main action
 2. Describe specific actions and gestures in chronological order
 3. Describe the appearance of characters/objects in detail
@@ -36,14 +33,12 @@ system_prompt_t2v = """You are a prompt expert for video generation.
 5. Specify camera angles and movement
 6. Describe lighting and colors in detail
 7. Naturally incorporate changes or sudden events
-
 Write all descriptions as a single natural paragraph,
 as concrete and visual as a cinematographer describing a shot list.
 Keep it under 200 words, but make it as detailed as possible."""
 
 system_prompt_i2v = """You are a prompt expert for image-based video generation.
 Improve the given prompt to fit the following structure:
-
 1. Start with a single clear sentence describing the main action
 2. Describe specific actions and gestures in chronological order
 3. Describe the appearance of characters/objects in detail
@@ -51,12 +46,10 @@ system_prompt_i2v = """You are a prompt expert for image-based video generation.
 5. Specify camera angles and movement
 6. Describe lighting and colors in detail
 7. Naturally incorporate changes or sudden events
-
 Write all descriptions as a single natural paragraph,
 as concrete and visual as a cinematographer describing a shot list.
 Keep it under 200 words, but make it as detailed as possible."""
 
-
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
 openai_api_key = os.getenv("OPENAI_API_KEY")
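
The `client` used by the new enhancement helper below is constructed outside the lines shown in this diff; given the OPENAI_API_KEY read above, the setup is presumably something like this sketch (an assumption, not part of the commit):

    from openai import OpenAI

    # Assumed wiring: build the OpenAI client from the key read above.
    client = OpenAI(api_key=openai_api_key)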
@@ -81,7 +74,37 @@ def translate_korean_prompt(prompt):
         return translated
     return prompt
 
+def enhance_prompt(prompt, type="t2v"):
+    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": prompt},
+    ]
+
+    try:
+        response = client.chat.completions.create(
+            model="gpt-4-1106-preview",
+            messages=messages,
+            max_tokens=200,
+        )
+        enhanced_prompt = response.choices[0].message.content.strip()
+
+        print("\n=== Prompt Enhancement Result ===")
+        print("Original Prompt:")
+        print(prompt)
+        print("\nEnhanced Prompt:")
+        print(enhanced_prompt)
+        print("========================\n")
+
+        return enhanced_prompt
+    except Exception as e:
+        print(f"Error during prompt enhancement: {e}")
+        return prompt
 
+def update_prompt(prompt, enhance_toggle, type="t2v"):
+    if enhance_toggle:
+        return enhance_prompt(prompt, type)
+    return prompt
 
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
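
A quick sanity check of the new helpers (a sketch, assuming app.py is importable; note that importing app.py also runs the module-level model loading below):

    from app import update_prompt

    # Toggle off: the prompt passes through unchanged, with no API call.
    assert update_prompt("a cat running", enhance_toggle=False) == "a cat running"

    # Toggle on: enhance_prompt() rewrites the prompt via the OpenAI client
    # (requires OPENAI_API_KEY to be set).
    enhanced = update_prompt("a cat running", enhance_toggle=True, type="t2v")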
@@ -145,36 +168,26 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
     frame_tensor = (frame_tensor / 127.5) - 1.0
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 
-def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
-    if not enhance_toggle:
-        print("Enhance toggle is off, Original Prompt: ", prompt)
-        return prompt
-
-    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": prompt},
-    ]
-
-    try:
-        response = client.chat.completions.create(
-            model="gpt-4-1106-preview",
-            messages=messages,
-            max_tokens=200,
-        )
-        enhanced_prompt = response.choices[0].message.content.strip()
-
-        print("\n=== Prompt Enhancement Result ===")
-        print("Original Prompt:")
-        print(prompt)
-        print("\nEnhanced Prompt:")
-        print(enhanced_prompt)
-        print("========================\n")
-
-        return enhanced_prompt
-    except Exception as e:
-        print(f"Error during prompt enhancement: {e}")
-        return prompt
+# Load models
+vae = load_vae(vae_dir)
+unet = load_unet(unet_dir)
+scheduler = load_scheduler(scheduler_dir)
+patchifier = SymmetricPatchifier(patch_size=1)
+text_encoder = T5EncoderModel.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
+).to(device)
+tokenizer = T5Tokenizer.from_pretrained(
+    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
+)
+
+pipeline = XoraVideoPipeline(
+    transformer=unet,
+    patchifier=patchifier,
+    text_encoder=text_encoder,
+    tokenizer=tokenizer,
+    scheduler=scheduler,
+    vae=vae,
+).to(device)
 
 # Preset options for resolution and frame configuration
 preset_options = [
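
This hunk moves the model setup earlier in the file; it still runs at import time, so the Space pays the full load cost on startup. A lazy variant (a sketch reusing the loaders above, not part of this commit) would defer that cost to the first generation call:

    _pipeline = None

    def get_pipeline():
        # Build the pipeline on first use and cache it for subsequent calls.
        global _pipeline
        if _pipeline is None:
            _pipeline = XoraVideoPipeline(
                transformer=load_unet(unet_dir),
                patchifier=SymmetricPatchifier(patch_size=1),
                text_encoder=T5EncoderModel.from_pretrained(
                    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
                ),
                tokenizer=T5Tokenizer.from_pretrained(
                    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
                ),
                scheduler=load_scheduler(scheduler_dir),
                vae=load_vae(vae_dir),
            ).to(device)
        return _pipeline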
@@ -228,27 +241,6 @@ def preset_changed(preset):
         gr.update(visible=True),
     )
 
-# Load models
-vae = load_vae(vae_dir)
-unet = load_unet(unet_dir)
-scheduler = load_scheduler(scheduler_dir)
-patchifier = SymmetricPatchifier(patch_size=1)
-text_encoder = T5EncoderModel.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
-).to(device)
-tokenizer = T5Tokenizer.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
-)
-
-pipeline = XoraVideoPipeline(
-    transformer=unet,
-    patchifier=patchifier,
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    scheduler=scheduler,
-    vae=vae,
-).to(device)
-
 def generate_video_from_text(
     prompt="",
     enhance_prompt_toggle=False,
@@ -271,9 +263,6 @@ def generate_video_from_text(
     # Translate Korean prompts to English
     prompt = translate_korean_prompt(prompt)
     negative_prompt = translate_korean_prompt(negative_prompt)
-
-    if enhance_prompt_toggle:
-        prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
 
     sample = {
         "prompt": prompt,
@@ -332,8 +321,6 @@ def generate_video_from_text(
     torch.cuda.empty_cache()
     return output_path
 
-
-
 def generate_video_from_image(
     image_path,
     prompt="",
@@ -369,9 +356,6 @@ def generate_video_from_image(
         load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
     )
 
-    if enhance_prompt_toggle:
-        prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
-
     sample = {
         "prompt": prompt,
         "prompt_attention_mask": None,
@@ -475,9 +459,6 @@ def create_advanced_options():
 
 # Gradio Interface Definition
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
-
-
-
     with gr.Tabs():
         # Text to Video Tab
         with gr.TabItem("Create Video from Text"):
@@ -634,6 +615,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
     )
 
+    txt2vid_enhance_toggle.change(
+        fn=update_prompt,
+        inputs=[txt2vid_prompt, txt2vid_enhance_toggle],
+        outputs=txt2vid_prompt,
+        kwargs={"type": "t2v"}
+    )
+
     txt2vid_generate.click(
         fn=generate_video_from_text,
         inputs=[
@@ -653,6 +641,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
     )
 
+    img2vid_enhance_toggle.change(
+        fn=update_prompt,
+        inputs=[img2vid_prompt, img2vid_enhance_toggle],
+        outputs=img2vid_prompt,
+        kwargs={"type": "i2v"}
+    )
+
     img2vid_generate.click(
         fn=generate_video_from_image,
         inputs=[
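
One caveat on the two .change() handlers above: Gradio's event listeners do not, as far as I know, accept a `kwargs` argument, so binding the mode with functools.partial would be the safer wiring (a hypothetical fix, not what this commit ships; the i2v handler would mirror it with type="i2v"):

    from functools import partial

    # partial binds type="t2v"; Gradio then calls the function with the
    # (prompt, enhance_toggle) values taken from `inputs`.
    txt2vid_enhance_toggle.change(
        fn=partial(update_prompt, type="t2v"),
        inputs=[txt2vid_prompt, txt2vid_enhance_toggle],
        outputs=txt2vid_prompt,
    )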
@@ -672,4 +667,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
 if __name__ == "__main__":
     iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
         share=True, show_api=False
-    )
+    )
+
+