EchoMimic

Running on Zero

App Files Community

wenmengzhou committed on Aug 2, 2024

Commit

65882a7

verified ·

1 Parent(s): 1932d9e

split video_generation into two functions

Browse files

Files changed (1) hide show

webgui.py +104 -35

webgui.py CHANGED Viewed

@@ -160,9 +160,8 @@ def select_face(det_bboxes, probs):
     return sorted_bboxes[0]
 lmk_extractor = LMKExtractor()
-@spaces.GPU
-def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
-    #### face musk prepare
     face_img = cv2.imread(uploaded_img)
     if face_img is None:
         raise gr.Error("input image should be uploaded or selected.")
@@ -178,8 +177,7 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_
         r_pad = int((re - rb) * facemask_dilation_ratio)
         c_pad = int((ce - cb) * facemask_dilation_ratio)
         face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
-        #### face crop
         r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
         c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
         crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
@@ -187,39 +185,14 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_
         face_mask = crop_and_pad(face_mask, crop_rect)
         face_img = cv2.resize(face_img, (width, height))
         face_mask = cv2.resize(face_mask, (width, height))
     print('face detect done.')
-    # ==================== face_locator =====================
-    '''
-    driver_video = "./assets/driven_videos/c.mp4"
-    input_frames_cv2 = [cv2.resize(center_crop_cv2(pil_to_cv2(i)), (512, 512)) for i in pils_from_video(driver_video)]
-    ref_det = lmk_extractor(face_img)
-    visualizer = FaceMeshVisualizer(draw_iris=False, draw_mouse=False)
-    pose_list = []
-    sequence_driver_det = []
-    try:
-        for frame in input_frames_cv2:
-            result = lmk_extractor(frame)
-            assert result is not None, "{}, bad video, face not detected".format(driver_video)
-            sequence_driver_det.append(result)
-    except:
-        print("face detection failed")
-        exit()
-    sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
-    for p in sequence_det_ms:
-        tgt_musk = visualizer.draw_landmarks((width, height), p)
-        tgt_musk_pil = Image.fromarray(np.array(tgt_musk).astype(np.uint8)).convert('RGB')
-        pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device="cuda").permute(2,0,1) / 255.0)
-    '''
-    # face_mask_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)
     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-    #del pose_list, sequence_det_ms, sequence_driver_det, input_frames_cv2
     video = pipe(
         ref_image_pil,
@@ -230,7 +203,6 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_
         length,
         steps,
         cfg,
-        #generator=generator,
         audio_sample_rate=sample_rate,
         context_frames=context_frames,
         fps=fps,
@@ -250,6 +222,103 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_
     video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
     return final_output_path
 with gr.Blocks() as demo:
     gr.Markdown('# EchoMimic')

     return sorted_bboxes[0]
 lmk_extractor = LMKExtractor()
+def face_detection(uploaded_img, facemask_dilation_ratio, facecrop_dilation_ratio, width, height):
     face_img = cv2.imread(uploaded_img)
     if face_img is None:
         raise gr.Error("input image should be uploaded or selected.")
         r_pad = int((re - rb) * facemask_dilation_ratio)
         c_pad = int((ce - cb) * facemask_dilation_ratio)
         face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
         r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
         c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
         crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
         face_mask = crop_and_pad(face_mask, crop_rect)
         face_img = cv2.resize(face_img, (width, height))
         face_mask = cv2.resize(face_mask, (width, height))
     print('face detect done.')
+    return face_img, face_mask
+@spaces.GPU
+def video_pipe(face_img, face_mask, uploaded_audio, width, height, length, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
     video = pipe(
         ref_image_pil,
         length,
         steps,
         cfg,
         audio_sample_rate=sample_rate,
         context_frames=context_frames,
         fps=fps,
     video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
     return final_output_path
+def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):  # Entry point kept under its previous name and signature; it now only chains face_detection and video_pipe.
+    face_img, face_mask = face_detection(uploaded_img, facemask_dilation_ratio, facecrop_dilation_ratio, width, height)  # face_detection is not decorated with @spaces.GPU; it returns the face image and its mask.
+    final_output_path = video_pipe(face_img, face_mask, uploaded_audio, width, height, length, context_frames, context_overlap, cfg, steps, sample_rate, fps, device)  # video_pipe carries the @spaces.GPU decorator and returns the path it wrote with write_videofile.
+    return final_output_path
+# @spaces.GPU
+# def process_video(uploaded_img, uploaded_audio, width, height, length, facemask_dilation_ratio, facecrop_dilation_ratio, context_frames, context_overlap, cfg, steps, sample_rate, fps, device):
+#     #### face musk prepare
+#     face_img = cv2.imread(uploaded_img)
+#     if face_img is None:
+#         raise gr.Error("input image should be uploaded or selected.")
+#     face_mask = np.zeros((face_img.shape[0], face_img.shape[1])).astype('uint8')
+#     det_bboxes, probs = face_detector.detect(face_img)
+#     select_bbox = select_face(det_bboxes, probs)
+#     if select_bbox is None:
+#         face_mask[:, :] = 255
+#     else:
+#         xyxy = select_bbox[:4]
+#         xyxy = np.round(xyxy).astype('int')
+#         rb, re, cb, ce = xyxy[1], xyxy[3], xyxy[0], xyxy[2]
+#         r_pad = int((re - rb) * facemask_dilation_ratio)
+#         c_pad = int((ce - cb) * facemask_dilation_ratio)
+#         face_mask[rb - r_pad : re + r_pad, cb - c_pad : ce + c_pad] = 255
+#         #### face crop
+#         r_pad_crop = int((re - rb) * facecrop_dilation_ratio)
+#         c_pad_crop = int((ce - cb) * facecrop_dilation_ratio)
+#         crop_rect = [max(0, cb - c_pad_crop), max(0, rb - r_pad_crop), min(ce + c_pad_crop, face_img.shape[1]), min(re + r_pad_crop, face_img.shape[0])]
+#         face_img = crop_and_pad(face_img, crop_rect)
+#         face_mask = crop_and_pad(face_mask, crop_rect)
+#         face_img = cv2.resize(face_img, (width, height))
+#         face_mask = cv2.resize(face_mask, (width, height))
+#     print('face detect done.')
+#     # ==================== face_locator =====================
+#     '''
+#     driver_video = "./assets/driven_videos/c.mp4"
+#     input_frames_cv2 = [cv2.resize(center_crop_cv2(pil_to_cv2(i)), (512, 512)) for i in pils_from_video(driver_video)]
+#     ref_det = lmk_extractor(face_img)
+#     visualizer = FaceMeshVisualizer(draw_iris=False, draw_mouse=False)
+#     pose_list = []
+#     sequence_driver_det = []
+#     try:
+#         for frame in input_frames_cv2:
+#             result = lmk_extractor(frame)
+#             assert result is not None, "{}, bad video, face not detected".format(driver_video)
+#             sequence_driver_det.append(result)
+#     except:
+#         print("face detection failed")
+#         exit()
+#     sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
+#     for p in sequence_det_ms:
+#         tgt_musk = visualizer.draw_landmarks((width, height), p)
+#         tgt_musk_pil = Image.fromarray(np.array(tgt_musk).astype(np.uint8)).convert('RGB')
+#         pose_list.append(torch.Tensor(np.array(tgt_musk_pil)).to(dtype=weight_dtype, device="cuda").permute(2,0,1) / 255.0)
+#     '''
+#     # face_mask_tensor = torch.stack(pose_list, dim=1).unsqueeze(0)
+#     face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+#     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
+#     #del pose_list, sequence_det_ms, sequence_driver_det, input_frames_cv2
+#     video = pipe(
+#         ref_image_pil,
+#         uploaded_audio,
+#         face_mask_tensor,
+#         width,
+#         height,
+#         length,
+#         steps,
+#         cfg,
+#         #generator=generator,
+#         audio_sample_rate=sample_rate,
+#         context_frames=context_frames,
+#         fps=fps,
+#         context_overlap=context_overlap
+#     ).videos
+#     print('video pipe done.')
+#     save_dir = Path("output/tmp")
+#     save_dir.mkdir(exist_ok=True, parents=True)
+#     output_video_path = save_dir / "output_video.mp4"
+#     save_videos_grid(video, str(output_video_path), n_rows=1, fps=fps)
+#     video_clip = VideoFileClip(str(output_video_path))
+#     audio_clip = AudioFileClip(uploaded_audio)
+#     final_output_path = save_dir / "output_video_with_audio.mp4"
+#     video_clip = video_clip.set_audio(audio_clip)
+#     video_clip.write_videofile(str(final_output_path), codec="libx264", audio_codec="aac")
+#     return final_output_path
 with gr.Blocks() as demo:
     gr.Markdown('# EchoMimic')