Update app.py
app.py CHANGED
@@ -321,6 +321,8 @@ class WanInferencePipeline(nn.Module):
             image = self.transform(image).unsqueeze(0).to(dtype=self.dtype)
 
             _, _, h, w = image.shape
+            select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
+            image = resize_pad(image, (h, w), select_size)
             image = image * 2.0 - 1.0
             image = image[:, :, None]
 
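Note: `match_size` and `resize_pad` are defined elsewhere in the repo and are not shown in this diff. For reference, a rough sketch of the contract the new lines appear to rely on (an assumption, not the actual implementation): `match_size` picks the candidate (height, width) bucket closest to the input's aspect ratio, and `resize_pad` letterboxes the image into that bucket.

```python
# Hypothetical sketch of the two helpers used above -- not the repo's code.
import torch.nn.functional as F

def match_size(candidate_sizes, h, w):
    # Choose the (H, W) bucket whose aspect ratio best matches the input.
    ratio = h / w
    return min(candidate_sizes, key=lambda s: abs(s[0] / s[1] - ratio))

def resize_pad(image, hw, select_size):
    # Scale to fit inside the target bucket, then zero-pad to its exact size.
    h, w = hw
    th, tw = select_size
    scale = min(th / h, tw / w)
    nh, nw = int(round(h * scale)), int(round(w * scale))
    image = F.interpolate(image, size=(nh, nw), mode="bilinear", align_corners=False)
    pad_h, pad_w = th - nh, tw - nw
    # F.pad order is (left, right, top, bottom) for the last two dims.
    return F.pad(image, (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
```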
@@ -328,7 +330,7 @@ class WanInferencePipeline(nn.Module):
             image = None
             select_size = [height, width]
             num = self.args.max_tokens * 16 * 16 * 4
-            den =
+            den = select_size[0] * select_size[1]
             L0 = num // den
             diff = (L0 - 1) % 4
             L = L0 - diff
@@ -392,22 +394,29 @@ class WanInferencePipeline(nn.Module):
             image = self.transform(image).unsqueeze(0).to(self.device, dtype=self.dtype)
 
             _, _, h, w = image.shape
+            select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
+            image = resize_pad(image, (h, w), select_size)
             image = image * 2.0 - 1.0
             image = image[:, :, None]
 
         else:
             image = None
-
-
+            select_size = [height, width]
+        # L = int(self.args.max_tokens * 16 * 16 * 4 / select_size[0] / select_size[1])
+        # L = L // 4 * 4 + 1 if L % 4 != 0 else L - 3 # video frames
+        # T = (L + 3) // 4 # latent frames
 
         # step 1: numerator and denominator as ints
         num = args.max_tokens * 16 * 16 * 4
-        den =
+        den = select_size[0] * select_size[1]
 
         # step 2: integer division
         L0 = num // den  # exact floor division, no float in sight
 
         # step 3: make it ≡ 1 mod 4
+        # if L0 % 4 == 1, keep L0;
+        # otherwise subtract the difference so that (L0 - diff) % 4 == 1,
+        # but ensure the result stays positive.
         diff = (L0 - 1) % 4
         L = L0 - diff
         if L < 1:
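The token-budget arithmetic in this hunk is easy to check in isolation. It inverts tokens ≈ (H/16) · (W/16) · (L/4) (16×16 spatial patches, 4 video frames per latent frame, per the commented-out `T = (L + 3) // 4` line) and then snaps the frame count down to L ≡ 1 (mod 4). A standalone check, using a made-up `max_tokens` value:

```python
# Worked example of the frame-count computation above; max_tokens = 32760
# is an illustrative value, not the Space's actual config.
max_tokens = 32760
select_size = [720, 400]           # height, width

num = max_tokens * 16 * 16 * 4     # token budget expressed in pixel-frames
den = select_size[0] * select_size[1]

L0 = num // den                    # integer division, no floats
diff = (L0 - 1) % 4
L = L0 - diff                      # largest L <= L0 with L % 4 == 1
assert L % 4 == 1
T = (L + 3) // 4                   # latent frames

print(L0, L, T)                    # -> 116 113 29
```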
@@ -606,7 +615,7 @@ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
 
     return int(duration_s)
 
-def preprocess_img(input_image_path, raw_image_path, orientation_state, session_id = None):
+def preprocess_img(input_image_path, raw_image_path, session_id = None):
 
     if session_id is None:
         session_id = uuid.uuid4().hex
@@ -622,7 +631,7 @@ def preprocess_img(input_image_path, raw_image_path, orientation_state, session_id = None):
     image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
 
     _, _, h, w = image.shape
-    select_size = match_size(
+    select_size = match_size(getattr(args, f'image_sizes_{args.max_hw}'), h, w)
     image = resize_pad(image, (h, w), select_size)
     image = image * 2.0 - 1.0
     image = image[:, :, None]
@@ -640,12 +649,13 @@ def preprocess_img(input_image_path, raw_image_path, orientation_state, session_id = None):
 
 def infer_example(image_path, audio_path, text, num_steps, raw_image_path, session_id = None, progress=gr.Progress(track_tqdm=True),):
 
-
-
+    current_image_size = args.image_sizes_720
+    args.image_sizes_720 = [[720, 400]]
 
-    image_path, _ = preprocess_img(image_path, image_path, [[720, 400]], session_id)
     result = infer(image_path, audio_path, text, num_steps, session_id, progress)
 
+    args.image_sizes_720 = current_image_size
+
     return result
 
 @spaces.GPU(duration=get_duration)
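`infer_example` now pins `args.image_sizes_720` to `[[720, 400]]` around the `infer` call and restores the old value afterwards. Because `args` is module-level state, an exception inside `infer` would leave the override in place; a `try`/`finally` (here wrapped in a context manager) makes the restore unconditional. A sketch of that variant, same behavior on the happy path:

```python
from contextlib import contextmanager

@contextmanager
def pinned_image_sizes(args, sizes):
    # Temporarily override args.image_sizes_720; restore even if infer raises.
    saved = args.image_sizes_720
    args.image_sizes_720 = sizes
    try:
        yield
    finally:
        args.image_sizes_720 = saved

# Usage inside infer_example:
#   with pinned_image_sizes(args, [[720, 400]]):
#       result = infer(image_path, audio_path, text, num_steps, session_id, progress)
```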
@@ -703,8 +713,7 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True)):
 
 def apply_image(request):
     print('image applied')
-
-    return request, request
+    return request, None
 
 def apply_audio(request):
     print('audio applied')
@@ -730,15 +739,13 @@ def orientation_changed(session_id, evt: gr.EventData):
     detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}
 
     if detail['value'] == "9:16":
-
+        args.image_sizes_720 = [[720, 400]]
     elif detail['value'] == "1:1":
-
+        args.image_sizes_720 = [[720, 720]]
     elif detail['value'] == "16:9":
-
-
-    print(f'{session_id} has {orientation_state} orientation')
+        args.image_sizes_720 = [[400, 720]]
 
-
+    print(f'{session_id} has {args.image_sizes_720} orientation')
 
 def clear_raw_image():
     return ''
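The three branches map the selected aspect ratio onto a single-entry size list. The same mapping as a lookup table (a sketch, not the Space's code), which also makes the no-op case explicit when the event payload carries an unexpected value:

```python
# Table-driven equivalent of the branches above (sketch).
ORIENTATION_SIZES = {
    "9:16": [[720, 400]],   # portrait
    "1:1":  [[720, 720]],   # square
    "16:9": [[400, 720]],   # landscape
}

def set_orientation(args, value):
    sizes = ORIENTATION_SIZES.get(value)
    if sizes is not None:            # unknown value: keep the current setting
        args.image_sizes_720 = sizes
    return args.image_sizes_720
```

Worth noting: `args` is process-global, so the override applies to every active session, not only the one named in the log line.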
@@ -812,7 +819,6 @@ css = """
 with gr.Blocks(css=css) as demo:
 
     session_state = gr.State()
-    orientation_state = gr.State([[720, 400]])
     demo.load(start_session, outputs=[session_state])
 
 
@@ -930,9 +936,7 @@ with gr.Blocks(css=css) as demo:
         ],
         label="Image Samples",
         inputs=[image_input],
-
-        fn=apply_image,
-        cache_examples=True
+        cache_examples=False
     )
 
     audio_examples = gr.Examples(
@@ -977,9 +981,9 @@ with gr.Blocks(css=css) as demo:
         inputs=[audio_input, limit_on, session_state],
         outputs=[audio_input],
     )
-    image_input.orientation(fn=orientation_changed, inputs=[session_state]
+    image_input.orientation(fn=orientation_changed, inputs=[session_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
     image_input.clear(fn=clear_raw_image, outputs=[raw_img_text])
-    image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text,
+    image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
     image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
     audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
     num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
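The orientation listener is now chained with `.then`, so `preprocess_img` re-runs against the freshly set size list only after `orientation_changed` completes. In Gradio, an event listener returns a dependency object whose `.then()` queues a follow-up step that runs once the first one finishes; a minimal standalone illustration of the pattern (toy functions, not the Space's UI):

```python
import gradio as gr

def set_mode(x):
    return x.upper()

def reprocess(x):
    return f"processed: {x}"

with gr.Blocks() as toy:
    box = gr.Textbox()
    out = gr.Textbox()
    btn = gr.Button("Go")
    # .then() runs reprocess after set_mode finishes, mirroring how
    # orientation_changed is chained into preprocess_img in this diff.
    btn.click(set_mode, inputs=[box], outputs=[out]).then(
        reprocess, inputs=[out], outputs=[out]
    )
```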