Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -23,12 +23,9 @@ import gc
|
|
23 |
from openai import OpenAI
|
24 |
import re
|
25 |
|
26 |
-
|
27 |
-
|
28 |
# Load system prompts
|
29 |
system_prompt_t2v = """λΉμ μ λΉλμ€ μμ±μ μν ν둬ννΈ μ λ¬Έκ°μ
λλ€.
|
30 |
μ£Όμ΄μ§ ν둬ννΈλ₯Ό λ€μ ꡬ쑰μ λ§κ² κ°μ ν΄μ£ΌμΈμ:
|
31 |
-
|
32 |
1. μ£Όμ λμμ λͺ
νν ν λ¬Έμ₯μΌλ‘ μμ
|
33 |
2. ꡬ체μ μΈ λμκ³Ό μ μ€μ²λ₯Ό μκ° μμλλ‘ μ€λͺ
|
34 |
3. μΊλ¦ν°/κ°μ²΄μ μΈλͺ¨λ₯Ό μμΈν λ¬μ¬
|
@@ -36,14 +33,12 @@ system_prompt_t2v = """λΉμ μ λΉλμ€ μμ±μ μν ν둬ννΈ μ λ¬Έ
|
|
36 |
5. μΉ΄λ©λΌ κ°λμ μμ§μμ λͺ
μ
|
37 |
6. μ‘°λͺ
κ³Ό μμμ μμΈν μ€λͺ
|
38 |
7. λ³νλ κ°μμ€λ¬μ΄ μ¬κ±΄μ μμ°μ€λ½κ² ν¬ν¨
|
39 |
-
|
40 |
λͺ¨λ μ€λͺ
μ νλμ μμ°μ€λ¬μ΄ λ¬Έλ¨μΌλ‘ μμ±νκ³ ,
|
41 |
촬μ κ°λ
μ΄ μ΄¬μ λͺ©λ‘μ μ€λͺ
νλ κ²μ²λΌ ꡬ체μ μ΄κ³ μκ°μ μΌλ‘ μμ±νμΈμ.
|
42 |
200λ¨μ΄λ₯Ό λμ§ μλλ‘ νλ, μ΅λν μμΈνκ² μμ±νμΈμ."""
|
43 |
|
44 |
system_prompt_i2v = """λΉμ μ μ΄λ―Έμ§ κΈ°λ° λΉλμ€ μμ±μ μν ν둬ννΈ μ λ¬Έκ°μ
λλ€.
|
45 |
μ£Όμ΄μ§ ν둬ννΈλ₯Ό λ€μ ꡬ쑰μ λ§κ² κ°μ ν΄μ£ΌμΈμ:
|
46 |
-
|
47 |
1. μ£Όμ λμμ λͺ
νν ν λ¬Έμ₯μΌλ‘ μμ
|
48 |
2. ꡬ체μ μΈ λμκ³Ό μ μ€μ²λ₯Ό μκ° μμλλ‘ μ€λͺ
|
49 |
3. μΊλ¦ν°/κ°μ²΄μ μΈλͺ¨λ₯Ό μμΈν λ¬μ¬
|
@@ -51,12 +46,10 @@ system_prompt_i2v = """λΉμ μ μ΄λ―Έμ§ κΈ°λ° λΉλμ€ μμ±μ μν ν
|
|
51 |
5. μΉ΄λ©λΌ κ°λμ μμ§μμ λͺ
μ
|
52 |
6. μ‘°λͺ
κ³Ό μμμ μμΈν μ€λͺ
|
53 |
7. λ³νλ κ°μμ€λ¬μ΄ μ¬κ±΄μ μμ°μ€λ½κ² ν¬ν¨
|
54 |
-
|
55 |
λͺ¨λ μ€λͺ
μ νλμ μμ°μ€λ¬μ΄ λ¬Έλ¨μΌλ‘ μμ±νκ³ ,
|
56 |
촬μ κ°λ
μ΄ μ΄¬μ λͺ©λ‘μ μ€λͺ
νλ κ²μ²λΌ ꡬ체μ μ΄κ³ μκ°μ μΌλ‘ μμ±νμΈμ.
|
57 |
200λ¨μ΄λ₯Ό λμ§ μλλ‘ νλ, μ΅λν μμΈνκ² μμ±νμΈμ."""
|
58 |
|
59 |
-
|
60 |
# Load Hugging Face token if needed
|
61 |
hf_token = os.getenv("HF_TOKEN")
|
62 |
openai_api_key = os.getenv("OPENAI_API_KEY")
|
@@ -81,7 +74,37 @@ def translate_korean_prompt(prompt):
|
|
81 |
return translated
|
82 |
return prompt
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
|
|
|
|
|
|
|
|
85 |
|
86 |
# Set model download directory within Hugging Face Spaces
|
87 |
model_path = "asset"
|
@@ -145,36 +168,26 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
|
|
145 |
frame_tensor = (frame_tensor / 127.5) - 1.0
|
146 |
return frame_tensor.unsqueeze(0).unsqueeze(2)
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
print("\n=== ν둬ννΈ μ¦κ° κ²°κ³Ό ===")
|
168 |
-
print("Original Prompt:")
|
169 |
-
print(prompt)
|
170 |
-
print("\nEnhanced Prompt:")
|
171 |
-
print(enhanced_prompt)
|
172 |
-
print("========================\n")
|
173 |
-
|
174 |
-
return enhanced_prompt
|
175 |
-
except Exception as e:
|
176 |
-
print(f"Error during prompt enhancement: {e}")
|
177 |
-
return prompt
|
178 |
|
179 |
# Preset options for resolution and frame configuration
|
180 |
preset_options = [
|
@@ -228,27 +241,6 @@ def preset_changed(preset):
|
|
228 |
gr.update(visible=True),
|
229 |
)
|
230 |
|
231 |
-
# Load models
|
232 |
-
vae = load_vae(vae_dir)
|
233 |
-
unet = load_unet(unet_dir)
|
234 |
-
scheduler = load_scheduler(scheduler_dir)
|
235 |
-
patchifier = SymmetricPatchifier(patch_size=1)
|
236 |
-
text_encoder = T5EncoderModel.from_pretrained(
|
237 |
-
"PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
|
238 |
-
).to(device)
|
239 |
-
tokenizer = T5Tokenizer.from_pretrained(
|
240 |
-
"PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
|
241 |
-
)
|
242 |
-
|
243 |
-
pipeline = XoraVideoPipeline(
|
244 |
-
transformer=unet,
|
245 |
-
patchifier=patchifier,
|
246 |
-
text_encoder=text_encoder,
|
247 |
-
tokenizer=tokenizer,
|
248 |
-
scheduler=scheduler,
|
249 |
-
vae=vae,
|
250 |
-
).to(device)
|
251 |
-
|
252 |
def generate_video_from_text(
|
253 |
prompt="",
|
254 |
enhance_prompt_toggle=False,
|
@@ -271,9 +263,6 @@ def generate_video_from_text(
|
|
271 |
# Translate Korean prompts to English
|
272 |
prompt = translate_korean_prompt(prompt)
|
273 |
negative_prompt = translate_korean_prompt(negative_prompt)
|
274 |
-
|
275 |
-
if enhance_prompt_toggle:
|
276 |
-
prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
|
277 |
|
278 |
sample = {
|
279 |
"prompt": prompt,
|
@@ -332,8 +321,6 @@ def generate_video_from_text(
|
|
332 |
torch.cuda.empty_cache()
|
333 |
return output_path
|
334 |
|
335 |
-
|
336 |
-
|
337 |
def generate_video_from_image(
|
338 |
image_path,
|
339 |
prompt="",
|
@@ -369,9 +356,6 @@ def generate_video_from_image(
|
|
369 |
load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
|
370 |
)
|
371 |
|
372 |
-
if enhance_prompt_toggle:
|
373 |
-
prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
|
374 |
-
|
375 |
sample = {
|
376 |
"prompt": prompt,
|
377 |
"prompt_attention_mask": None,
|
@@ -475,9 +459,6 @@ def create_advanced_options():
|
|
475 |
|
476 |
# Gradio Interface Definition
|
477 |
with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
with gr.Tabs():
|
482 |
# Text to Video Tab
|
483 |
with gr.TabItem("ν
μ€νΈλ‘ λΉλμ€ λ§λ€κΈ°"):
|
@@ -634,6 +615,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
634 |
fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
|
635 |
)
|
636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
637 |
txt2vid_generate.click(
|
638 |
fn=generate_video_from_text,
|
639 |
inputs=[
|
@@ -653,6 +641,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
653 |
fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
|
654 |
)
|
655 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
656 |
img2vid_generate.click(
|
657 |
fn=generate_video_from_image,
|
658 |
inputs=[
|
@@ -672,4 +667,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
672 |
if __name__ == "__main__":
|
673 |
iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
|
674 |
share=True, show_api=False
|
675 |
-
)
|
|
|
|
|
|
23 |
from openai import OpenAI
|
24 |
import re
|
25 |
|
|
|
|
|
26 |
# Load system prompts
|
27 |
system_prompt_t2v = """λΉμ μ λΉλμ€ μμ±μ μν ν둬ννΈ μ λ¬Έκ°μ
λλ€.
|
28 |
μ£Όμ΄μ§ ν둬ννΈλ₯Ό λ€μ ꡬ쑰μ λ§κ² κ°μ ν΄μ£ΌμΈμ:
|
|
|
29 |
1. μ£Όμ λμμ λͺ
νν ν λ¬Έμ₯μΌλ‘ μμ
|
30 |
2. ꡬ체μ μΈ λμκ³Ό μ μ€μ²λ₯Ό μκ° μμλλ‘ μ€λͺ
|
31 |
3. μΊλ¦ν°/κ°μ²΄μ μΈλͺ¨λ₯Ό μμΈν λ¬μ¬
|
|
|
33 |
5. μΉ΄λ©λΌ κ°λμ μμ§μμ λͺ
μ
|
34 |
6. μ‘°λͺ
κ³Ό μμμ μμΈν μ€λͺ
|
35 |
7. λ³νλ κ°μμ€λ¬μ΄ μ¬κ±΄μ μμ°μ€λ½κ² ν¬ν¨
|
|
|
36 |
λͺ¨λ μ€λͺ
μ νλμ μμ°μ€λ¬μ΄ λ¬Έλ¨μΌλ‘ μμ±νκ³ ,
|
37 |
촬μ κ°λ
μ΄ μ΄¬μ λͺ©λ‘μ μ€λͺ
νλ κ²μ²λΌ ꡬ체μ μ΄κ³ μκ°μ μΌλ‘ μμ±νμΈμ.
|
38 |
200λ¨μ΄λ₯Ό λμ§ μλλ‘ νλ, μ΅λν μμΈνκ² μμ±νμΈμ."""
|
39 |
|
40 |
system_prompt_i2v = """λΉμ μ μ΄λ―Έμ§ κΈ°λ° λΉλμ€ μμ±μ μν ν둬ννΈ μ λ¬Έκ°μ
λλ€.
|
41 |
μ£Όμ΄μ§ ν둬ννΈλ₯Ό λ€μ ꡬ쑰μ λ§κ² κ°μ ν΄μ£ΌμΈμ:
|
|
|
42 |
1. μ£Όμ λμμ λͺ
νν ν λ¬Έμ₯μΌλ‘ μμ
|
43 |
2. ꡬ체μ μΈ λμκ³Ό μ μ€μ²λ₯Ό μκ° μμλλ‘ μ€λͺ
|
44 |
3. μΊλ¦ν°/κ°μ²΄μ μΈλͺ¨λ₯Ό μμΈν λ¬μ¬
|
|
|
46 |
5. μΉ΄λ©λΌ κ°λμ μμ§μμ λͺ
μ
|
47 |
6. μ‘°λͺ
κ³Ό μμμ μμΈν μ€λͺ
|
48 |
7. λ³νλ κ°μμ€λ¬μ΄ μ¬κ±΄μ μμ°μ€λ½κ² ν¬ν¨
|
|
|
49 |
λͺ¨λ μ€λͺ
μ νλμ μμ°μ€λ¬μ΄ λ¬Έλ¨μΌλ‘ μμ±νκ³ ,
|
50 |
촬μ κ°λ
μ΄ μ΄¬μ λͺ©λ‘μ μ€λͺ
νλ κ²μ²λΌ ꡬ체μ μ΄κ³ μκ°μ μΌλ‘ μμ±νμΈμ.
|
51 |
200λ¨μ΄λ₯Ό λμ§ μλλ‘ νλ, μ΅λν μμΈνκ² μμ±νμΈμ."""
|
52 |
|
|
|
53 |
# Load Hugging Face token if needed
|
54 |
hf_token = os.getenv("HF_TOKEN")
|
55 |
openai_api_key = os.getenv("OPENAI_API_KEY")
|
|
|
74 |
return translated
|
75 |
return prompt
|
76 |
|
77 |
+
def enhance_prompt(prompt, type="t2v"):
    """Rewrite ``prompt`` through the chat model using the matching system prompt.

    Args:
        prompt: User prompt text to enhance.
        type: ``"t2v"`` selects ``system_prompt_t2v``; any other value selects
            ``system_prompt_i2v``. The name shadows the builtin but is kept
            because callers pass it by keyword.

    Returns:
        The enhanced prompt. The original ``prompt`` is returned unchanged
        when it is empty or whitespace-only, when the API call fails, or when
        the model returns no text.
    """
    # Nothing to enhance: skip the API round-trip for blank input.
    if not prompt or not prompt.strip():
        return prompt

    system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=messages,
            max_tokens=200,
        )
        # ``content`` can be None; treat that the same as an empty reply.
        enhanced_prompt = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best-effort feature: on any failure keep the user's prompt.
        print(f"Error during prompt enhancement: {e}")
        return prompt

    # An empty reply must not overwrite the user's prompt with nothing.
    if not enhanced_prompt:
        return prompt

    print("\n=== ν둬ννΈ μ¦κ° κ²°κ³Ό ===")
    print("Original Prompt:")
    print(prompt)
    print("\nEnhanced Prompt:")
    print(enhanced_prompt)
    print("========================\n")

    return enhanced_prompt
|
103 |
|
104 |
+
def update_prompt(prompt, enhance_toggle, type="t2v"):
    """Return the prompt to display after the enhance toggle changes.

    Args:
        prompt: Current prompt text.
        enhance_toggle: Truthy to run the prompt through ``enhance_prompt``.
        type: Forwarded to ``enhance_prompt`` as its second argument.

    Returns:
        The result of ``enhance_prompt(prompt, type)`` when the toggle is
        truthy, otherwise ``prompt`` unchanged.
    """
    # Toggle off: hand the text back untouched.
    if not enhance_toggle:
        return prompt
    return enhance_prompt(prompt, type)
|
108 |
|
109 |
# Set model download directory within Hugging Face Spaces
|
110 |
model_path = "asset"
|
|
|
168 |
frame_tensor = (frame_tensor / 127.5) - 1.0
|
169 |
return frame_tensor.unsqueeze(0).unsqueeze(2)
|
170 |
|
171 |
+
# Load models
|
172 |
+
vae = load_vae(vae_dir)
|
173 |
+
unet = load_unet(unet_dir)
|
174 |
+
scheduler = load_scheduler(scheduler_dir)
|
175 |
+
patchifier = SymmetricPatchifier(patch_size=1)
|
176 |
+
text_encoder = T5EncoderModel.from_pretrained(
|
177 |
+
"PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
|
178 |
+
).to(device)
|
179 |
+
tokenizer = T5Tokenizer.from_pretrained(
|
180 |
+
"PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
|
181 |
+
)
|
182 |
|
183 |
+
pipeline = XoraVideoPipeline(
|
184 |
+
transformer=unet,
|
185 |
+
patchifier=patchifier,
|
186 |
+
text_encoder=text_encoder,
|
187 |
+
tokenizer=tokenizer,
|
188 |
+
scheduler=scheduler,
|
189 |
+
vae=vae,
|
190 |
+
).to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
# Preset options for resolution and frame configuration
|
193 |
preset_options = [
|
|
|
241 |
gr.update(visible=True),
|
242 |
)
|
243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
def generate_video_from_text(
|
245 |
prompt="",
|
246 |
enhance_prompt_toggle=False,
|
|
|
263 |
# Translate Korean prompts to English
|
264 |
prompt = translate_korean_prompt(prompt)
|
265 |
negative_prompt = translate_korean_prompt(negative_prompt)
|
|
|
|
|
|
|
266 |
|
267 |
sample = {
|
268 |
"prompt": prompt,
|
|
|
321 |
torch.cuda.empty_cache()
|
322 |
return output_path
|
323 |
|
|
|
|
|
324 |
def generate_video_from_image(
|
325 |
image_path,
|
326 |
prompt="",
|
|
|
356 |
load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
|
357 |
)
|
358 |
|
|
|
|
|
|
|
359 |
sample = {
|
360 |
"prompt": prompt,
|
361 |
"prompt_attention_mask": None,
|
|
|
459 |
|
460 |
# Gradio Interface Definition
|
461 |
with gr.Blocks(theme=gr.themes.Soft()) as iface:
|
|
|
|
|
|
|
462 |
with gr.Tabs():
|
463 |
# Text to Video Tab
|
464 |
with gr.TabItem("ν
μ€νΈλ‘ λΉλμ€ λ§λ€κΈ°"):
|
|
|
615 |
fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
|
616 |
)
|
617 |
|
618 |
+
# Gradio event listeners have no ``kwargs`` parameter, so the prompt type is
# bound inside the callback instead of being passed to ``.change()``.
txt2vid_enhance_toggle.change(
    fn=lambda prompt, enhance_toggle: update_prompt(
        prompt, enhance_toggle, type="t2v"
    ),
    inputs=[txt2vid_prompt, txt2vid_enhance_toggle],
    outputs=txt2vid_prompt,
)
|
624 |
+
|
625 |
txt2vid_generate.click(
|
626 |
fn=generate_video_from_text,
|
627 |
inputs=[
|
|
|
641 |
fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
|
642 |
)
|
643 |
|
644 |
+
# Gradio event listeners have no ``kwargs`` parameter, so the prompt type is
# bound inside the callback instead of being passed to ``.change()``.
img2vid_enhance_toggle.change(
    fn=lambda prompt, enhance_toggle: update_prompt(
        prompt, enhance_toggle, type="i2v"
    ),
    inputs=[img2vid_prompt, img2vid_enhance_toggle],
    outputs=img2vid_prompt,
)
|
650 |
+
|
651 |
img2vid_generate.click(
|
652 |
fn=generate_video_from_image,
|
653 |
inputs=[
|
|
|
667 |
if __name__ == "__main__":
|
668 |
iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
|
669 |
share=True, show_api=False
|
670 |
+
)
|
671 |
+
|
672 |
+
|