LivePortrait2

Running on Zero

yerang committed on Oct 17, 2024

Commit

a17e76c

verified ·

1 Parent(s): 0697aec

Update stf/stf-api-alternative/src/stf_alternative/inference.py

Files changed (1) hide show

stf/stf-api-alternative/src/stf_alternative/inference.py CHANGED Viewed

@@ -141,10 +141,10 @@ def process_audio_chunk(audio_processor, audio_encoder, audio_chunk, device):
     input_values = audio_processor(
         audio_data, sampling_rate=16000, return_tensors="pt"
-    ).to(device)["input_values"]
-    with torch.no_grad():
-        logits = audio_encoder(input_values=input_values)
     return logits.last_hidden_state[0]
@@ -188,33 +188,35 @@ def to_img(t):
 def inference_model(model, v, device, verbose=False):
-    with torch.no_grad():
-        mel, ips, mask, alpha = (
-            v["mel"],
-            v["ips"],
-            v["mask"],
-            v["img_gt_with_alpha"],
-        )
-        cpu_ips = ips
-        cpu_alpha = alpha
-        audio = mel.to(device)
-        ips = ips.to(device).permute(0, 3, 1, 2)
-        pred = model.model(ips, audio)
-        gen_face = to_img(pred)
-        return [
-            {
-                "pred": o,
-                "mask": mask[j].numpy(),
-                "ips": cpu_ips[j].numpy(),
-                "img_gt_with_alpha": cpu_alpha[j].numpy(),
-                "filename": v["filename"][j],
-            }
-            for j, o in enumerate(gen_face)
-        ]
 def inference_model_remote(model, v, device, verbose=False):

     input_values = audio_processor(
         audio_data, sampling_rate=16000, return_tensors="pt"
+    ).cuda(0))["input_values"] #//.to(device)["input_values"]
+    #with torch.no_grad():
+    logits = audio_encoder(input_values=input_values)
     return logits.last_hidden_state[0]
 def inference_model(model, v, device, verbose=False):
+    #with torch.no_grad():
+    mel, ips, mask, alpha = (
+        v["mel"],
+        v["ips"],
+        v["mask"],
+        v["img_gt_with_alpha"],
+    )
+    cpu_ips = ips
+    cpu_alpha = alpha
+    #audio = mel.to(device)
+    #ips = ips.to(device).permute(0, 3, 1, 2)
+    audio = mel.cuda(0)
+    ips = ips.cuda(0).permute(0, 3, 1, 2)
+    pred = model.model(ips, audio)
+    gen_face = to_img(pred)
+    return [
+        {
+            "pred": o,
+            "mask": mask[j].numpy(),
+            "ips": cpu_ips[j].numpy(),
+            "img_gt_with_alpha": cpu_alpha[j].numpy(),
+            "filename": v["filename"][j],
+        }
+        for j, o in enumerate(gen_face)
+    ]
 def inference_model_remote(model, v, device, verbose=False):