Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -299,6 +299,7 @@ class WanInferencePipeline(nn.Module):
     def get_times(self, prompt,
                   image_path=None,
                   audio_path=None,
+                  orientation_state = None,
                   seq_len=101, # not used while audio_path is not None
                   height=720,
                   width=720,
@@ -321,11 +322,11 @@ class WanInferencePipeline(nn.Module):
             image = self.transform(image).unsqueeze(0).to(dtype=self.dtype)
 
             _, _, h, w = image.shape
-            select_size = match_size(
+            select_size = match_size(orientation_state, h, w)
             image = resize_pad(image, (h, w), select_size)
             image = image * 2.0 - 1.0
             image = image[:, :, None]
-
+
         else:
             image = None
             select_size = [height, width]
@@ -373,6 +374,7 @@ class WanInferencePipeline(nn.Module):
     def forward(self, prompt,
                 image_path=None,
                 audio_path=None,
+                orientation_state = None,
                 seq_len=101, # not used while audio_path is not None
                 height=720,
                 width=720,
@@ -394,17 +396,15 @@ class WanInferencePipeline(nn.Module):
             image = self.transform(image).unsqueeze(0).to(self.device, dtype=self.dtype)
 
             _, _, h, w = image.shape
-            select_size = match_size(
+            select_size = match_size(orientation_state, h, w)
             image = resize_pad(image, (h, w), select_size)
             image = image * 2.0 - 1.0
             image = image[:, :, None]
 
         else:
             image = None
-
-
-            # L = L // 4 * 4 + 1 if L % 4 != 0 else L - 3 # video frames
-            # T = (L + 3) // 4 # latent frames
+            h = height
+            w = width
 
         # step 1: numerator and denominator as ints
         num = args.max_tokens * 16 * 16 * 4
@@ -414,9 +414,6 @@ class WanInferencePipeline(nn.Module):
         L0 = num // den # exact floor division, no float in sight
 
         # step 3: make it ≡ 1 mod 4
-        # if L0 % 4 == 1, keep L0;
-        # otherwise subtract the difference so that (L0 - diff) % 4 == 1,
-        # but ensure the result stays positive.
         diff = (L0 - 1) % 4
         L = L0 - diff
         if L < 1:
@@ -555,7 +552,7 @@ ADAPTIVE_PROMPT_TEMPLATES = [
     "A realistic video of a person speaking and sometimes looking directly to the camera and moving their eyes and pupils and head accordingly and turning and looking at the camera and looking away from the camera based on their movements with dynamic and rhythmic and extensive hand gestures that complement their speech. Their hands are clearly visible, independent, and unobstructed. Their facial expressions are expressive and full of emotion, enhancing the delivery. The camera remains steady, capturing sharp, clear movements and a focused, engaging presence."
 ]
 
-def slider_value_change(image_path, audio_path, text, num_steps, session_state, adaptive_text):
+def slider_value_change(image_path, audio_path, orientation_state, text, num_steps, session_state, adaptive_text):
 
     if adaptive_text:
 
@@ -569,20 +566,20 @@ def slider_value_change(image_path, audio_path, text, num_steps, session_state,
     else:
         text = ADAPTIVE_PROMPT_TEMPLATES[1]
 
-    return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
+    return update_generate_button(image_path, audio_path, orientation_state, text, num_steps, session_state), text
 
 
-def update_generate_button(image_path, audio_path, text, num_steps, session_state):
+def update_generate_button(image_path, audio_path, orientation_state, text, num_steps, session_state):
 
     if image_path is None or audio_path is None:
         return gr.update(value="⌚ Zero GPU Required: --")
 
-    duration_s = get_duration(image_path, audio_path, text, num_steps, session_state, None)
+    duration_s = get_duration(image_path, audio_path, text, orientation_state, num_steps, session_state, None)
     duration_m = duration_s / 60
 
     return gr.update(value=f"⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)")
 
-def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
+def get_duration(image_path, audio_path, text, orientation_state, num_steps, session_id, progress):
 
     if image_path is None:
         gr.Info("Step1: Please Provide an Image or Choose from Image Samples")
@@ -601,6 +598,7 @@ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
         prompt=text,
         image_path=image_path,
         audio_path=audio_path,
+        orientation_state= orientation_state,
         seq_len=args.seq_len,
         num_steps=num_steps
     )
@@ -615,7 +613,7 @@ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
 
     return int(duration_s)
 
-def preprocess_img(input_image_path, raw_image_path, session_id = None):
+def preprocess_img(input_image_path, raw_image_path, orientation_state, session_id = None):
 
     if session_id is None:
         session_id = uuid.uuid4().hex
@@ -631,7 +629,7 @@ def preprocess_img(input_image_path, raw_image_path, session_id = None):
     image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
 
     _, _, h, w = image.shape
-    select_size = match_size(
+    select_size = match_size(orientation_state, h, w)
     image = resize_pad(image, (h, w), select_size)
     image = image * 2.0 - 1.0
     image = image[:, :, None]
@@ -649,17 +647,16 @@ def preprocess_img(input_image_path, raw_image_path, session_id = None):
 
 def infer_example(image_path, audio_path, text, num_steps, raw_image_path, session_id = None, progress=gr.Progress(track_tqdm=True),):
 
-
-
-
-    result = infer(image_path, audio_path, text, num_steps, session_id, progress)
+    if session_id is None:
+        session_id = uuid.uuid4().hex
 
-
+    image_path, _ = preprocess_img(image_path, image_path, [[720, 400]], session_id)
+    result = infer(image_path, audio_path, text, [[720, 400]], num_steps, session_id, progress)
 
     return result
 
 @spaces.GPU(duration=get_duration)
-def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
+def infer(image_path, audio_path, text, orientation_state, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
 
     if image_path is None:
 
@@ -694,6 +691,7 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
         prompt=text,
         image_path=image_path,
         audio_path=input_audio_path,
+        orientation_state=orientation_state,
         seq_len=args.seq_len,
         num_steps=num_steps
     )
@@ -713,7 +711,8 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
 
 def apply_image(request):
     print('image applied')
-
+
+    return request, request
 
 def apply_audio(request):
     print('audio applied')
@@ -739,13 +738,15 @@ def orientation_changed(session_id, evt: gr.EventData):
     detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}
 
     if detail['value'] == "9:16":
-
+        orientation_state = [[720, 400]]
     elif detail['value'] == "1:1":
-
+        orientation_state = [[720, 720]]
     elif detail['value'] == "16:9":
-
+        orientation_state = [[400, 720]]
+
+    print(f'{session_id} has {orientation_state} orientation')
 
-
+    return orientation_state
 
 def clear_raw_image():
     return ''
@@ -819,6 +820,7 @@ css = """
 with gr.Blocks(css=css) as demo:
 
     session_state = gr.State()
+    orientation_state = gr.State([[720, 400]])
     demo.load(start_session, outputs=[session_state])
 
 
@@ -936,7 +938,9 @@ with gr.Blocks(css=css) as demo:
         ],
         label="Image Samples",
         inputs=[image_input],
-
+        outputs=[image_input, raw_img_text],
+        fn=apply_image,
+        cache_examples=True
     )
 
     audio_examples = gr.Examples(
@@ -964,7 +968,7 @@ with gr.Blocks(css=css) as demo:
 
     infer_btn.click(
         fn=infer,
-        inputs=[image_input, audio_input, text_input, num_steps, session_state],
+        inputs=[image_input, audio_input, text_input, orientation_state, num_steps, session_state],
         outputs=[output_video]
     )
 
@@ -981,12 +985,12 @@ with gr.Blocks(css=css) as demo:
         inputs=[audio_input, limit_on, session_state],
         outputs=[audio_input],
     )
-    image_input.orientation(fn=orientation_changed, inputs=[session_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
+    image_input.orientation(fn=orientation_changed, inputs=[session_state], outputs=[orientation_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, orientation_state, session_state], outputs=[image_input, raw_img_text])
     image_input.clear(fn=clear_raw_image, outputs=[raw_img_text])
-    image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
-    image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
-    audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
-    num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
+    image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, orientation_state, session_state], outputs=[image_input, raw_img_text])
+    image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, orientation_state, text_input, num_steps, session_state], outputs=[time_required])
+    audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, orientation_state, text_input, num_steps, session_state], outputs=[time_required])
+    num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, orientation_state, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
     adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
     audio_input.upload(fn=apply_audio, inputs=[audio_input], outputs=[audio_input]
     ).then(
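
What this commit threads through the app: a new orientation_state value, a one-element list of [height, width] candidates held in gr.State([[720, 400]]), is produced by orientation_changed from the image_input.orientation event value ("9:16", "1:1", "16:9") and passed to match_size(orientation_state, h, w) wherever the input image is resized. match_size and resize_pad are defined elsewhere in the repo and are not part of this diff, so the matcher below is only an illustrative guess: it picks the candidate size whose aspect ratio is closest to the incoming image.

# Sketch only: ORIENTATION_SIZES mirrors orientation_changed above; match_size_sketch
# is a hypothetical stand-in for the real match_size, which this diff does not show.
ORIENTATION_SIZES = {
    "9:16": [[720, 400]],   # portrait; also the gr.State default
    "1:1":  [[720, 720]],
    "16:9": [[400, 720]],   # landscape
}

def match_size_sketch(candidates, h, w):
    """Return the [H, W] candidate whose aspect ratio is closest to h / w."""
    return min(candidates, key=lambda hw: abs(hw[0] / hw[1] - h / w))

# A 1280x720 portrait photo checked against the portrait and landscape candidates:
print(match_size_sketch(ORIENTATION_SIZES["9:16"] + ORIENTATION_SIZES["16:9"], 1280, 720))  # [720, 400]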
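Why get_duration changes in lock-step with infer: the Space reserves ZeroGPU time through @spaces.GPU(duration=get_duration), and, as I understand the ZeroGPU dynamic-duration feature, the duration callable is invoked with the same arguments as the decorated function. Inserting orientation_state into infer's signature therefore also means inserting it at the matching position in get_duration and in the infer_btn.click inputs list, or the arguments drift out of alignment. A minimal sketch of that coupling, with a placeholder duration value:

import gradio as gr
import spaces

def get_duration(image_path, audio_path, text, orientation_state, num_steps,
                 session_id, progress):
    # Must accept the same arguments, in the same order, as infer below.
    return 60  # placeholder number of seconds to reserve

@spaces.GPU(duration=get_duration)
def infer(image_path, audio_path, text, orientation_state, num_steps,
          session_id=None, progress=gr.Progress(track_tqdm=True)):
    ...  # GPU work happens here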
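On the frame-count arithmetic kept in the @@ -414,9 +414,6 @@ hunk: the token budget num = args.max_tokens * 16 * 16 * 4, divided by a denominator den defined in the step 2 lines this diff does not show, gives an integer L0, which is then snapped down to the nearest frame count that is ≡ 1 (mod 4). A self-contained sketch of just that rounding step, with the clamp the if L < 1: guard presumably applies:

def snap_to_1_mod_4(L0: int) -> int:
    """Largest L <= L0 with L % 4 == 1, clamped to at least 1."""
    diff = (L0 - 1) % 4   # distance down to the nearest value that is 1 mod 4
    L = L0 - diff
    return max(L, 1)      # assumed clamp, mirroring the `if L < 1:` guard

# 101 stays (101 % 4 == 1), 100 drops to 97, 3 drops to 1, and 0 clamps to 1:
assert snap_to_1_mod_4(101) == 101
assert snap_to_1_mod_4(100) == 97
assert snap_to_1_mod_4(3) == 1
assert snap_to_1_mod_4(0) == 1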
|