WebashalarForML committed
Commit c51a38f · verified · 1 Parent(s): 113ca6d

Upload 96 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +30 -0
  2. app.py +173 -0
  3. evaluation/README.md +63 -0
  4. evaluation/gen_videos_from_filelist.py +238 -0
  5. evaluation/real_videos_inference.py +305 -0
  6. evaluation/scores_LSE/SyncNetInstance_calc_scores.py +210 -0
  7. evaluation/scores_LSE/calculate_scores_LRS.py +53 -0
  8. evaluation/scores_LSE/calculate_scores_real_videos.py +45 -0
  9. evaluation/scores_LSE/calculate_scores_real_videos.sh +8 -0
  10. evaluation/test_filelists/README.md +13 -0
  11. evaluation/test_filelists/ReSyncED/random_pairs.txt +160 -0
  12. evaluation/test_filelists/ReSyncED/tts_pairs.txt +18 -0
  13. evaluation/test_filelists/lrs2.txt +0 -0
  14. evaluation/test_filelists/lrs3.txt +0 -0
  15. evaluation/test_filelists/lrw.txt +0 -0
  16. face_detection/README.md +1 -0
  17. face_detection/__init__.py +7 -0
  18. face_detection/__pycache__/__init__.cpython-313.pyc +0 -0
  19. face_detection/__pycache__/__init__.cpython-37.pyc +0 -0
  20. face_detection/__pycache__/api.cpython-313.pyc +0 -0
  21. face_detection/__pycache__/api.cpython-37.pyc +0 -0
  22. face_detection/__pycache__/models.cpython-313.pyc +0 -0
  23. face_detection/__pycache__/models.cpython-37.pyc +0 -0
  24. face_detection/__pycache__/utils.cpython-313.pyc +0 -0
  25. face_detection/__pycache__/utils.cpython-37.pyc +0 -0
  26. face_detection/api.py +79 -0
  27. face_detection/detection/__init__.py +1 -0
  28. face_detection/detection/__pycache__/__init__.cpython-37.pyc +0 -0
  29. face_detection/detection/__pycache__/core.cpython-37.pyc +0 -0
  30. face_detection/detection/core.py +130 -0
  31. face_detection/detection/sfd/__init__.py +1 -0
  32. face_detection/detection/sfd/__pycache__/__init__.cpython-37.pyc +0 -0
  33. face_detection/detection/sfd/__pycache__/bbox.cpython-37.pyc +0 -0
  34. face_detection/detection/sfd/__pycache__/detect.cpython-37.pyc +0 -0
  35. face_detection/detection/sfd/__pycache__/net_s3fd.cpython-37.pyc +0 -0
  36. face_detection/detection/sfd/__pycache__/sfd_detector.cpython-37.pyc +0 -0
  37. face_detection/detection/sfd/bbox.py +129 -0
  38. face_detection/detection/sfd/detect.py +112 -0
  39. face_detection/detection/sfd/net_s3fd.py +129 -0
  40. face_detection/detection/sfd/s3fd-619a316812.pth +3 -0
  41. face_detection/detection/sfd/sfd_detector.py +59 -0
  42. face_detection/models.py +261 -0
  43. face_detection/utils.py +313 -0
  44. filelists/README.md +1 -0
  45. inference.py +294 -0
  46. inference2.py +346 -0
  47. info_install.txt +54 -0
  48. input/audio/audio_hindi_tony_stark.mp3 +3 -0
  49. input/audio/harvard.wav +3 -0
  50. input/audio/processed_tony_stark.mp3 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ input/audio_hindi_tony_stark.mp3 filter=lfs diff=lfs merge=lfs -text
+ input/audio/audio_hindi_tony_stark.mp3 filter=lfs diff=lfs merge=lfs -text
+ input/audio/harvard.wav filter=lfs diff=lfs merge=lfs -text
+ input/audio/processed_tony_stark.mp3 filter=lfs diff=lfs merge=lfs -text
+ input/audio/sample_male_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ input/harvard.wav filter=lfs diff=lfs merge=lfs -text
+ input/image/images.jpg filter=lfs diff=lfs merge=lfs -text
+ input/image/portrait-young-confident-handsome-businessman-blue-red-light.jpg filter=lfs diff=lfs merge=lfs -text
+ input/image/young-model-casual-fall-winter-outfits.jpg filter=lfs diff=lfs merge=lfs -text
+ input/portrait-young-confident-handsome-businessman-blue-red-light.jpg filter=lfs diff=lfs merge=lfs -text
+ input/processed_tony_stark.mp3 filter=lfs diff=lfs merge=lfs -text
+ input/processed_tony_stark.mp4 filter=lfs diff=lfs merge=lfs -text
+ input/tony_2.mp4 filter=lfs diff=lfs merge=lfs -text
+ input/tony_stark.mp4 filter=lfs diff=lfs merge=lfs -text
+ input/video/man_sample1_green_screen.mp4 filter=lfs diff=lfs merge=lfs -text
+ input/video/tony_2.mp4 filter=lfs diff=lfs merge=lfs -text
+ input/young-model-casual-fall-winter-outfits.jpg filter=lfs diff=lfs merge=lfs -text
+ results/output_image_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/output_lipsynced.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/output_tony_howard.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/output_video_green_screen.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/output_video.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_14c740551950bab2.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_8d8eccddd8fe1694.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_be9149db044ae463.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_c5fbd07c81197985.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_ea00d0588539fef5.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/result_voice.mp4 filter=lfs diff=lfs merge=lfs -text
+ temp/result.avi filter=lfs diff=lfs merge=lfs -text
+ temp/temp_audio.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,173 @@
1
+ from flask import Flask, render_template, request, session, redirect, url_for, flash, send_from_directory
2
+ import os
3
+ import secrets
4
+ from werkzeug.utils import secure_filename
5
+ import sys
6
+ import shutil
7
+
8
+ sys.path.append(os.path.dirname(__file__))
9
+ import inference2 # Import your refactored inference script
10
+
11
+ app = Flask(__name__)
12
+ app.secret_key = os.urandom(24)
13
+ app.config['UPLOAD_FOLDER'] = 'uploads'
14
+ app.config['RESULTS_FOLDER'] = 'results' # This directory is NOT inside static
15
+ app.config['CHECKPOINTS_FOLDER'] = 'checkpoints'
16
+ app.config['TEMP_FOLDER'] = 'temp'
17
+
18
+ ALLOWED_FACE_EXTENSIONS = {'png', 'jpg', 'jpeg', 'mp4', 'avi', 'mov'}
19
+ ALLOWED_AUDIO_EXTENSIONS = {'wav', 'mp3', 'aac', 'flac'}
20
+ ALLOWED_MODEL_EXTENSIONS = {'pth', 'pt'}
21
+
22
+ os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
23
+ os.makedirs(app.config['RESULTS_FOLDER'], exist_ok=True)
24
+ os.makedirs(app.config['CHECKPOINTS_FOLDER'], exist_ok=True)
25
+ os.makedirs(app.config['TEMP_FOLDER'], exist_ok=True)
26
+
27
+
28
+ def allowed_file(filename, allowed_extensions):
29
+ return '.' in filename and \
30
+ filename.rsplit('.', 1)[1].lower() in allowed_extensions
31
+
32
+ @app.route('/')
33
+ def index():
34
+ theme = session.get('theme', 'dark')
35
+ available_models = []
36
+ try:
37
+ model_files = [f for f in os.listdir(app.config['CHECKPOINTS_FOLDER'])
38
+ if allowed_file(f, ALLOWED_MODEL_EXTENSIONS)]
39
+ available_models = sorted(model_files)
40
+ except FileNotFoundError:
41
+ # flash("Checkpoints folder not found. Please create a 'checkpoints' directory.", 'error') # Messages removed
42
+ pass
43
+ except Exception as e:
44
+ # flash(f"Error loading models: {e}", 'error') # Messages removed
45
+ pass
46
+ return render_template('index.html', theme=theme, models=available_models)
47
+
48
+ @app.route('/toggle_theme')
49
+ def toggle_theme():
50
+ current_theme = session.get('theme', 'dark')
51
+ if current_theme == 'dark':
52
+ session['theme'] = 'light'
53
+ else:
54
+ session['theme'] = 'dark'
55
+ return redirect(request.referrer or url_for('index'))
56
+
57
+ @app.route('/infer', methods=['POST'])
58
+ def infer():
59
+ if request.method == 'POST':
60
+ if 'face_file' not in request.files or 'audio_file' not in request.files:
61
+ # flash('Both face and audio files are required.', 'error') # Messages removed
62
+ return redirect(url_for('index'))
63
+
64
+ face_file = request.files['face_file']
65
+ audio_file = request.files['audio_file']
66
+ selected_model = request.form.get('model_select')
67
+
68
+ if face_file.filename == '' or audio_file.filename == '':
69
+ # flash('No selected file for face or audio.', 'error') # Messages removed
70
+ return redirect(url_for('index'))
71
+
72
+ if not selected_model:
73
+ # flash('No model selected.', 'error') # Messages removed
74
+ return redirect(url_for('index'))
75
+
76
+ if not allowed_file(face_file.filename, ALLOWED_FACE_EXTENSIONS):
77
+ # flash('Invalid face file type. Allowed: png, jpg, jpeg, mp4, avi, mov', 'error') # Messages removed
78
+ return redirect(url_for('index'))
79
+ if not allowed_file(audio_file.filename, ALLOWED_AUDIO_EXTENSIONS):
80
+ # flash('Invalid audio file type. Allowed: wav, mp3, aac, flac', 'error') # Messages removed
81
+ return redirect(url_for('index'))
82
+
83
+ face_filename = secure_filename(face_file.filename)
84
+ audio_filename = secure_filename(audio_file.filename)
85
+
86
+ face_uuid = secrets.token_hex(8)
87
+ audio_uuid = secrets.token_hex(8)
88
+
89
+ face_path = os.path.join(app.config['UPLOAD_FOLDER'], f"{face_uuid}_{face_filename}")
90
+ audio_path = os.path.join(app.config['UPLOAD_FOLDER'], f"{audio_uuid}_{audio_filename}")
91
+
92
+ try:
93
+ face_file.save(face_path)
94
+ audio_file.save(audio_path)
95
+ except Exception as e:
96
+ # flash(f"Error saving uploaded files: {e}", 'error') # Messages removed
97
+ return redirect(url_for('index'))
98
+
99
+ checkpoint_path = os.path.join(app.config['CHECKPOINTS_FOLDER'], selected_model)
100
+ output_video_name = f"result_{face_uuid}.mp4"
101
+
102
+ try:
103
+ # flash('Starting inference... This may take a while.', 'info') # Messages removed
104
+ generated_video_path = inference2.run_inference(
105
+ checkpoint_path=checkpoint_path,
106
+ face_path=face_path,
107
+ audio_path=audio_path,
108
+ output_filename=output_video_name,
109
+ static=request.form.get('static_input') == 'on',
110
+ fps=float(request.form.get('fps', 25.0)),
111
+ resize_factor=int(request.form.get('resize_factor', 1)),
112
+ rotate=request.form.get('rotate') == 'on',
113
+ nosmooth=request.form.get('nosmooth') == 'on',
114
+ pads=[0, 10, 0, 0],
115
+ crop=[0, -1, 0, -1],
116
+ box=[-1, -1, -1, -1],
117
+ face_det_batch_size=16,
118
+ wav2lip_batch_size=128,
119
+ img_size=96
120
+ )
121
+ # flash('Inference completed successfully!', 'success') # Messages removed
122
+
123
+ # Redirect to the page that renders result.html
124
+ return redirect(url_for('render_result_page', filename=os.path.basename(generated_video_path)))
125
+
126
+ except ValueError as e:
127
+ # flash(f"Inference Error: {e}", 'error') # Messages removed
128
+ pass
129
+ except RuntimeError as e:
130
+ # flash(f"Runtime Error during inference: {e}", 'error') # Messages removed
131
+ pass
132
+ except Exception as e:
133
+ # flash(f"An unexpected error occurred: {e}", 'error') # Messages removed
134
+ pass
135
+ finally:
136
+ if os.path.exists(face_path):
137
+ os.remove(face_path)
138
+ if os.path.exists(audio_path):
139
+ os.remove(audio_path)
140
+
141
+ return redirect(url_for('index'))
142
+
143
+ # Route to render the result.html template
144
+ @app.route('/result_page/<filename>')
145
+ def render_result_page(filename):
146
+ theme = session.get('theme', 'dark')
147
+ # Check if the file actually exists before rendering
148
+ if not os.path.exists(os.path.join(app.config['RESULTS_FOLDER'], filename)):
149
+ # If the video isn't found, redirect or show an error
150
+ # Consider a dedicated error page or a message within index.html if no flashes are used
151
+ return redirect(url_for('index'))
152
+ return render_template('result.html', theme=theme, video_filename=filename)
153
+
154
+
155
+ # Route to serve the video file itself (used by <video src="...">)
156
+ @app.route('/results/<path:filename>') # Use <path:filename> to handle potential subdirectories in filename (though not needed here)
157
+ def serve_result_video(filename):
158
+ # This route is solely for serving the video file
159
+ return send_from_directory(app.config['RESULTS_FOLDER'], filename)
160
+
161
+ # Route to download the video file
162
+ @app.route('/download/<filename>') # Changed to /download/ for clarity
163
+ def download_result(filename):
164
+ return send_from_directory(app.config['RESULTS_FOLDER'], filename, as_attachment=True)
165
+
166
+
167
+ if __name__ == '__main__':
168
+ os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
169
+ os.makedirs(app.config['RESULTS_FOLDER'], exist_ok=True)
170
+ os.makedirs(app.config['CHECKPOINTS_FOLDER'], exist_ok=True)
171
+ os.makedirs(app.config['TEMP_FOLDER'], exist_ok=True)
172
+
173
+ app.run(debug=True)
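For quick manual testing, the `/infer` route defined above can be exercised from a small client script. This is a minimal sketch, assuming the app is running locally on Flask's default port 5000 and that a checkpoint file (here hypothetically named `wav2lip_gan.pth`) has been placed in `checkpoints/`; the form-field names match the ones read in `infer()`, and the sample input paths are files shipped in this commit.

```python
# Minimal client sketch for the /infer endpoint above.
# Assumptions: local server on port 5000, a checkpoint named wav2lip_gan.pth,
# and the sample face/audio files from the input/ folder of this repo.
import requests

with open("input/image/images.jpg", "rb") as face, open("input/audio/harvard.wav", "rb") as audio:
    resp = requests.post(
        "http://127.0.0.1:5000/infer",
        files={"face_file": face, "audio_file": audio},
        data={
            "model_select": "wav2lip_gan.pth",  # assumed checkpoint filename
            "static_input": "on",               # treat the face input as a single image
            "fps": "25.0",
            "resize_factor": "1",
        },
        allow_redirects=False,  # the route answers with a redirect to the result page
    )
    print(resp.status_code, resp.headers.get("Location"))
```

On success the response is a redirect to `/result_page/<filename>`, and the rendered video can be fetched from `/results/<filename>` or downloaded via `/download/<filename>`.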
evaluation/README.md ADDED
@@ -0,0 +1,63 @@
+ # Novel evaluation framework, new filelists, and using the LSE-D and LSE-C metrics
+
+ Our paper also proposes a novel evaluation framework (Section 4). To evaluate on LRS2, LRS3, and LRW, the filelists are provided in the `test_filelists` folder. Use the `gen_videos_from_filelist.py` script to generate the videos; after that, you can calculate the LSE-D and LSE-C scores using the instructions below. Please see [this thread](https://github.com/Rudrabha/Wav2Lip/issues/22#issuecomment-712825380) for how to calculate the FID scores.
+
+ The videos of the ReSyncED benchmark for real-world evaluation will be released soon.
+
+ ### Steps to set up the evaluation repository for the LSE-D and LSE-C metrics:
+ We use the pre-trained SyncNet model available in this [repository](https://github.com/joonson/syncnet_python).
+
+ * Clone the SyncNet repository.
+ ```
+ git clone https://github.com/joonson/syncnet_python.git
+ ```
+ * Follow the procedure in the linked [repository](https://github.com/joonson/syncnet_python) to download the pretrained models and set up the dependencies.
+ * **Note: please use a separate virtual environment for the evaluation scripts. The package versions used by Wav2Lip and by the publicly released SyncNet code differ and can cause version-mismatch issues.**
+ ```
+ cd syncnet_python
+ pip install -r requirements.txt
+ sh download_model.sh
+ ```
+ * The steps above install all the dependencies required by the repository and download the pre-trained models.
+
+ ### Running the evaluation scripts:
+ * Copy the evaluation scripts given in this folder into the cloned repository.
+ ```
+ cd Wav2Lip/evaluation/scores_LSE/
+ cp *.py syncnet_python/
+ cp *.sh syncnet_python/
+ ```
+ **Note: we will release the test filelists for LRW, LRS2, and LRS3 shortly, once we receive permission from the dataset creators. We will also release the real-world dataset we have collected.**
+
+ * Our evaluation technique does not require any ground truth: the scores are calculated directly from the generated lip-synced videos. Please store the generated videos (from our test sets or your own) in the following folder structure.
+ ```
+ video data root (folder containing all videos)
+ ├── All .mp4 files
+ ```
+ * Change directory back to the cloned repository.
+ ```
+ cd syncnet_python
+ ```
+ * To run evaluation on the LRW, LRS2, and LRS3 test files, run:
+ ```
+ python calculate_scores_LRS.py --data_root /path/to/video/data/root --tmp_dir tmp_dir/
+ ```
+
+ * To run evaluation on the ReSyncED dataset or on your own generated videos, run:
+ ```
+ sh calculate_scores_real_videos.sh /path/to/video/data/root
+ ```
+ * The generated scores will be written to `all_scores.txt` in the `syncnet_python/` folder.
+
+ # Evaluation of image quality using the FID metric
+ We use the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository to calculate the FID metric. We dump all the frames of both the ground-truth and the generated videos and compute the FID score between the two frame sets.
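As a rough illustration of that frame-dumping step, the sketch below extracts frames with `ffmpeg` (mirroring the `subprocess` usage elsewhere in this commit) and then runs the `pytorch-fid` command-line module on the two frame folders. The folder names are placeholders, not paths from the released code.

```python
# Sketch only: dump frames from ground-truth and generated videos, then compute FID.
# Folder names (gt_videos/, gen_videos/, gt_frames/, gen_frames/) are hypothetical.
import glob, os, subprocess

def dump_frames(video_dir, frame_dir):
    os.makedirs(frame_dir, exist_ok=True)
    for i, vid in enumerate(sorted(glob.glob(os.path.join(video_dir, '*.mp4')))):
        # one numbered jpg sequence per video, e.g. 0003_000001.jpg
        out_pattern = os.path.join(frame_dir, '{:04d}_%06d.jpg'.format(i))
        subprocess.call('ffmpeg -loglevel panic -y -i {} {}'.format(vid, out_pattern), shell=True)

dump_frames('gt_videos', 'gt_frames')
dump_frames('gen_videos', 'gen_frames')

# pytorch-fid compares two image folders and prints the FID value.
subprocess.call('python -m pytorch_fid gt_frames gen_frames', shell=True)
```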
+
+ # Opening issues related to the evaluation scripts
+ * Please open issues with the "Evaluation" label if you face any problems with the evaluation scripts.
+
+ # Acknowledgements
+ Our evaluation pipeline is based on two existing repositories: the LSE metrics are based on [syncnet_python](https://github.com/joonson/syncnet_python) and the FID score on [pytorch-fid](https://github.com/mseitzer/pytorch-fid). We thank the authors of both repositories for releasing their wonderful code.
evaluation/gen_videos_from_filelist.py ADDED
@@ -0,0 +1,238 @@
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse
4
+ import dlib, json, subprocess
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch
8
+
9
+ sys.path.append('../')
10
+ import audio
11
+ import face_detection
12
+ from models import Wav2Lip
13
+
14
+ parser = argparse.ArgumentParser(description='Code to generate results for test filelists')
15
+
16
+ parser.add_argument('--filelist', type=str,
17
+ help='Filepath of filelist file to read', required=True)
18
+ parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
19
+ required=True)
20
+ parser.add_argument('--data_root', type=str, required=True)
21
+ parser.add_argument('--checkpoint_path', type=str,
22
+ help='Name of saved checkpoint to load weights from', required=True)
23
+
24
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 0, 0, 0],
25
+ help='Padding (top, bottom, left, right)')
26
+ parser.add_argument('--face_det_batch_size', type=int,
27
+ help='Single GPU batch size for face detection', default=64)
28
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
29
+
30
+ # parser.add_argument('--resize_factor', default=1, type=int)
31
+
32
+ args = parser.parse_args()
33
+ args.img_size = 96
34
+
35
+ def get_smoothened_boxes(boxes, T):
36
+ for i in range(len(boxes)):
37
+ if i + T > len(boxes):
38
+ window = boxes[len(boxes) - T:]
39
+ else:
40
+ window = boxes[i : i + T]
41
+ boxes[i] = np.mean(window, axis=0)
42
+ return boxes
43
+
44
+ def face_detect(images):
45
+ batch_size = args.face_det_batch_size
46
+
47
+ while 1:
48
+ predictions = []
49
+ try:
50
+ for i in range(0, len(images), batch_size):
51
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
52
+ except RuntimeError:
53
+ if batch_size == 1:
54
+ raise RuntimeError('Image too big to run face detection on GPU')
55
+ batch_size //= 2
56
+ args.face_det_batch_size = batch_size
57
+ print('Recovering from OOM error; New batch size: {}'.format(batch_size))
58
+ continue
59
+ break
60
+
61
+ results = []
62
+ pady1, pady2, padx1, padx2 = args.pads
63
+ for rect, image in zip(predictions, images):
64
+ if rect is None:
65
+ raise ValueError('Face not detected!')
66
+
67
+ y1 = max(0, rect[1] - pady1)
68
+ y2 = min(image.shape[0], rect[3] + pady2)
69
+ x1 = max(0, rect[0] - padx1)
70
+ x2 = min(image.shape[1], rect[2] + padx2)
71
+
72
+ results.append([x1, y1, x2, y2])
73
+
74
+ boxes = get_smoothened_boxes(np.array(results), T=5)
75
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
76
+
77
+ return results
78
+
79
+ def datagen(frames, face_det_results, mels):
80
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
81
+
82
+ for i, m in enumerate(mels):
83
+ if i >= len(frames): raise ValueError('Equal or less lengths only')
84
+
85
+ frame_to_save = frames[i].copy()
86
+ face, coords, valid_frame = face_det_results[i].copy()
87
+ if not valid_frame:
88
+ continue
89
+
90
+ face = cv2.resize(face, (args.img_size, args.img_size))
91
+
92
+ img_batch.append(face)
93
+ mel_batch.append(m)
94
+ frame_batch.append(frame_to_save)
95
+ coords_batch.append(coords)
96
+
97
+ if len(img_batch) >= args.wav2lip_batch_size:
98
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
99
+
100
+ img_masked = img_batch.copy()
101
+ img_masked[:, args.img_size//2:] = 0
102
+
103
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
104
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
105
+
106
+ yield img_batch, mel_batch, frame_batch, coords_batch
107
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
108
+
109
+ if len(img_batch) > 0:
110
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
111
+
112
+ img_masked = img_batch.copy()
113
+ img_masked[:, args.img_size//2:] = 0
114
+
115
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
116
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
117
+
118
+ yield img_batch, mel_batch, frame_batch, coords_batch
119
+
120
+ fps = 25
121
+ mel_step_size = 16
122
+ mel_idx_multiplier = 80./fps
123
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
124
+ print('Using {} for inference.'.format(device))
125
+
126
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
127
+ flip_input=False, device=device)
128
+
129
+ def _load(checkpoint_path):
130
+ if device == 'cuda':
131
+ checkpoint = torch.load(checkpoint_path)
132
+ else:
133
+ checkpoint = torch.load(checkpoint_path,
134
+ map_location=lambda storage, loc: storage)
135
+ return checkpoint
136
+
137
+ def load_model(path):
138
+ model = Wav2Lip()
139
+ print("Load checkpoint from: {}".format(path))
140
+ checkpoint = _load(path)
141
+ s = checkpoint["state_dict"]
142
+ new_s = {}
143
+ for k, v in s.items():
144
+ new_s[k.replace('module.', '')] = v
145
+ model.load_state_dict(new_s)
146
+
147
+ model = model.to(device)
148
+ return model.eval()
149
+
150
+ model = load_model(args.checkpoint_path)
151
+
152
+ def main():
153
+ assert args.data_root is not None
154
+ data_root = args.data_root
155
+
156
+ if not os.path.isdir(args.results_dir): os.makedirs(args.results_dir)
157
+
158
+ with open(args.filelist, 'r') as filelist:
159
+ lines = filelist.readlines()
160
+
161
+ for idx, line in enumerate(tqdm(lines)):
162
+ audio_src, video = line.strip().split()
163
+
164
+ audio_src = os.path.join(data_root, audio_src) + '.mp4'
165
+ video = os.path.join(data_root, video) + '.mp4'
166
+
167
+ command = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'.format(audio_src, '../temp/temp.wav')
168
+ subprocess.call(command, shell=True)
169
+ temp_audio = '../temp/temp.wav'
170
+
171
+ wav = audio.load_wav(temp_audio, 16000)
172
+ mel = audio.melspectrogram(wav)
173
+ if np.isnan(mel.reshape(-1)).sum() > 0:
174
+ continue
175
+
176
+ mel_chunks = []
177
+ i = 0
178
+ while 1:
179
+ start_idx = int(i * mel_idx_multiplier)
180
+ if start_idx + mel_step_size > len(mel[0]):
181
+ break
182
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
183
+ i += 1
184
+
185
+ video_stream = cv2.VideoCapture(video)
186
+
187
+ full_frames = []
188
+ while 1:
189
+ still_reading, frame = video_stream.read()
190
+ if not still_reading or len(full_frames) > len(mel_chunks):
191
+ video_stream.release()
192
+ break
193
+ full_frames.append(frame)
194
+
195
+ if len(full_frames) < len(mel_chunks):
196
+ continue
197
+
198
+ full_frames = full_frames[:len(mel_chunks)]
199
+
200
+ try:
201
+ face_det_results = face_detect(full_frames.copy())
202
+ except ValueError as e:
203
+ continue
204
+
205
+ batch_size = args.wav2lip_batch_size
206
+ gen = datagen(full_frames.copy(), face_det_results, mel_chunks)
207
+
208
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
209
+ if i == 0:
210
+ frame_h, frame_w = full_frames[0].shape[:-1]
211
+ out = cv2.VideoWriter('../temp/result.avi',
212
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
213
+
214
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
215
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
216
+
217
+ with torch.no_grad():
218
+ pred = model(mel_batch, img_batch)
219
+
220
+
221
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
222
+
223
+ for pl, f, c in zip(pred, frames, coords):
224
+ y1, y2, x1, x2 = c
225
+ pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
226
+ f[y1:y2, x1:x2] = pl
227
+ out.write(f)
228
+
229
+ out.release()
230
+
231
+ vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
232
+
233
+ command = 'ffmpeg -loglevel panic -y -i {} -i {} -strict -2 -q:v 1 {}'.format(temp_audio,
234
+ '../temp/result.avi', vid)
235
+ subprocess.call(command, shell=True)
236
+
237
+ if __name__ == '__main__':
238
+ main()
evaluation/real_videos_inference.py ADDED
@@ -0,0 +1,305 @@
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse
4
+ import dlib, json, subprocess
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch
8
+
9
+ sys.path.append('../')
10
+ import audio
11
+ import face_detection
12
+ from models import Wav2Lip
13
+
14
+ parser = argparse.ArgumentParser(description='Code to generate results on ReSyncED evaluation set')
15
+
16
+ parser.add_argument('--mode', type=str,
17
+ help='random | dubbed | tts', required=True)
18
+
19
+ parser.add_argument('--filelist', type=str,
20
+ help='Filepath of filelist file to read', default=None)
21
+
22
+ parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
23
+ required=True)
24
+ parser.add_argument('--data_root', type=str, required=True)
25
+ parser.add_argument('--checkpoint_path', type=str,
26
+ help='Name of saved checkpoint to load weights from', required=True)
27
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
28
+ help='Padding (top, bottom, left, right)')
29
+
30
+ parser.add_argument('--face_det_batch_size', type=int,
31
+ help='Single GPU batch size for face detection', default=16)
32
+
33
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
34
+ parser.add_argument('--face_res', help='Approximate resolution of the face at which to test', default=180)
35
+ parser.add_argument('--min_frame_res', help='Do not downsample further below this frame resolution', default=480)
36
+ parser.add_argument('--max_frame_res', help='Downsample to at least this frame resolution', default=720)
37
+ # parser.add_argument('--resize_factor', default=1, type=int)
38
+
39
+ args = parser.parse_args()
40
+ args.img_size = 96
41
+
42
+ def get_smoothened_boxes(boxes, T):
43
+ for i in range(len(boxes)):
44
+ if i + T > len(boxes):
45
+ window = boxes[len(boxes) - T:]
46
+ else:
47
+ window = boxes[i : i + T]
48
+ boxes[i] = np.mean(window, axis=0)
49
+ return boxes
50
+
51
+ def rescale_frames(images):
52
+ rect = detector.get_detections_for_batch(np.array([images[0]]))[0]
53
+ if rect is None:
54
+ raise ValueError('Face not detected!')
55
+ h, w = images[0].shape[:-1]
56
+
57
+ x1, y1, x2, y2 = rect
58
+
59
+ face_size = max(np.abs(y1 - y2), np.abs(x1 - x2))
60
+
61
+ diff = np.abs(face_size - args.face_res)
62
+ for factor in range(2, 16):
63
+ downsampled_res = face_size // factor
64
+ if min(h//factor, w//factor) < args.min_frame_res: break
65
+ if np.abs(downsampled_res - args.face_res) >= diff: break
66
+
67
+ factor -= 1
68
+ if factor == 1: return images
69
+
70
+ return [cv2.resize(im, (im.shape[1]//(factor), im.shape[0]//(factor))) for im in images]
71
+
72
+
73
+ def face_detect(images):
74
+ batch_size = args.face_det_batch_size
75
+ images = rescale_frames(images)
76
+
77
+ while 1:
78
+ predictions = []
79
+ try:
80
+ for i in range(0, len(images), batch_size):
81
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
82
+ except RuntimeError:
83
+ if batch_size == 1:
84
+ raise RuntimeError('Image too big to run face detection on GPU')
85
+ batch_size //= 2
86
+ print('Recovering from OOM error; New batch size: {}'.format(batch_size))
87
+ continue
88
+ break
89
+
90
+ results = []
91
+ pady1, pady2, padx1, padx2 = args.pads
92
+ for rect, image in zip(predictions, images):
93
+ if rect is None:
94
+ raise ValueError('Face not detected!')
95
+
96
+ y1 = max(0, rect[1] - pady1)
97
+ y2 = min(image.shape[0], rect[3] + pady2)
98
+ x1 = max(0, rect[0] - padx1)
99
+ x2 = min(image.shape[1], rect[2] + padx2)
100
+
101
+ results.append([x1, y1, x2, y2])
102
+
103
+ boxes = get_smoothened_boxes(np.array(results), T=5)
104
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
105
+
106
+ return results, images
107
+
108
+ def datagen(frames, face_det_results, mels):
109
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
110
+
111
+ for i, m in enumerate(mels):
112
+ if i >= len(frames): raise ValueError('Equal or less lengths only')
113
+
114
+ frame_to_save = frames[i].copy()
115
+ face, coords, valid_frame = face_det_results[i].copy()
116
+ if not valid_frame:
117
+ continue
118
+
119
+ face = cv2.resize(face, (args.img_size, args.img_size))
120
+
121
+ img_batch.append(face)
122
+ mel_batch.append(m)
123
+ frame_batch.append(frame_to_save)
124
+ coords_batch.append(coords)
125
+
126
+ if len(img_batch) >= args.wav2lip_batch_size:
127
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
128
+
129
+ img_masked = img_batch.copy()
130
+ img_masked[:, args.img_size//2:] = 0
131
+
132
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
133
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
134
+
135
+ yield img_batch, mel_batch, frame_batch, coords_batch
136
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
137
+
138
+ if len(img_batch) > 0:
139
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
140
+
141
+ img_masked = img_batch.copy()
142
+ img_masked[:, args.img_size//2:] = 0
143
+
144
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
145
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
146
+
147
+ yield img_batch, mel_batch, frame_batch, coords_batch
148
+
149
+ def increase_frames(frames, l):
150
+ ## evenly duplicating frames to increase length of video
151
+ while len(frames) < l:
152
+ dup_every = float(l) / len(frames)
153
+
154
+ final_frames = []
155
+ next_duplicate = 0.
156
+
157
+ for i, f in enumerate(frames):
158
+ final_frames.append(f)
159
+
160
+ if int(np.ceil(next_duplicate)) == i:
161
+ final_frames.append(f)
162
+
163
+ next_duplicate += dup_every
164
+
165
+ frames = final_frames
166
+
167
+ return frames[:l]
168
+
169
+ mel_step_size = 16
170
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
171
+ print('Using {} for inference.'.format(device))
172
+
173
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
174
+ flip_input=False, device=device)
175
+
176
+ def _load(checkpoint_path):
177
+ if device == 'cuda':
178
+ checkpoint = torch.load(checkpoint_path)
179
+ else:
180
+ checkpoint = torch.load(checkpoint_path,
181
+ map_location=lambda storage, loc: storage)
182
+ return checkpoint
183
+
184
+ def load_model(path):
185
+ model = Wav2Lip()
186
+ print("Load checkpoint from: {}".format(path))
187
+ checkpoint = _load(path)
188
+ s = checkpoint["state_dict"]
189
+ new_s = {}
190
+ for k, v in s.items():
191
+ new_s[k.replace('module.', '')] = v
192
+ model.load_state_dict(new_s)
193
+
194
+ model = model.to(device)
195
+ return model.eval()
196
+
197
+ model = load_model(args.checkpoint_path)
198
+
199
+ def main():
200
+ if not os.path.isdir(args.results_dir): os.makedirs(args.results_dir)
201
+
202
+ if args.mode == 'dubbed':
203
+ files = listdir(args.data_root)
204
+ lines = ['{} {}'.format(f, f) for f in files]
205
+
206
+ else:
207
+ assert args.filelist is not None
208
+ with open(args.filelist, 'r') as filelist:
209
+ lines = filelist.readlines()
210
+
211
+ for idx, line in enumerate(tqdm(lines)):
212
+ video, audio_src = line.strip().split()
213
+
214
+ audio_src = os.path.join(args.data_root, audio_src)
215
+ video = os.path.join(args.data_root, video)
216
+
217
+ command = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'.format(audio_src, '../temp/temp.wav')
218
+ subprocess.call(command, shell=True)
219
+ temp_audio = '../temp/temp.wav'
220
+
221
+ wav = audio.load_wav(temp_audio, 16000)
222
+ mel = audio.melspectrogram(wav)
223
+
224
+ if np.isnan(mel.reshape(-1)).sum() > 0:
225
+ raise ValueError('Mel contains nan!')
226
+
227
+ video_stream = cv2.VideoCapture(video)
228
+
229
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
230
+ mel_idx_multiplier = 80./fps
231
+
232
+ full_frames = []
233
+ while 1:
234
+ still_reading, frame = video_stream.read()
235
+ if not still_reading:
236
+ video_stream.release()
237
+ break
238
+
239
+ if min(frame.shape[:-1]) > args.max_frame_res:
240
+ h, w = frame.shape[:-1]
241
+ scale_factor = min(h, w) / float(args.max_frame_res)
242
+ h = int(h/scale_factor)
243
+ w = int(w/scale_factor)
244
+
245
+ frame = cv2.resize(frame, (w, h))
246
+ full_frames.append(frame)
247
+
248
+ mel_chunks = []
249
+ i = 0
250
+ while 1:
251
+ start_idx = int(i * mel_idx_multiplier)
252
+ if start_idx + mel_step_size > len(mel[0]):
253
+ break
254
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
255
+ i += 1
256
+
257
+ if len(full_frames) < len(mel_chunks):
258
+ if args.mode == 'tts':
259
+ full_frames = increase_frames(full_frames, len(mel_chunks))
260
+ else:
261
+ raise ValueError('#Frames, audio length mismatch')
262
+
263
+ else:
264
+ full_frames = full_frames[:len(mel_chunks)]
265
+
266
+ try:
267
+ face_det_results, full_frames = face_detect(full_frames.copy())
268
+ except ValueError as e:
269
+ continue
270
+
271
+ batch_size = args.wav2lip_batch_size
272
+ gen = datagen(full_frames.copy(), face_det_results, mel_chunks)
273
+
274
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(gen):
275
+ if i == 0:
276
+ frame_h, frame_w = full_frames[0].shape[:-1]
277
+
278
+ out = cv2.VideoWriter('../temp/result.avi',
279
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
280
+
281
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
282
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
283
+
284
+ with torch.no_grad():
285
+ pred = model(mel_batch, img_batch)
286
+
287
+
288
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
289
+
290
+ for pl, f, c in zip(pred, frames, coords):
291
+ y1, y2, x1, x2 = c
292
+ pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
293
+ f[y1:y2, x1:x2] = pl
294
+ out.write(f)
295
+
296
+ out.release()
297
+
298
+ vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
299
+ command = 'ffmpeg -loglevel panic -y -i {} -i {} -strict -2 -q:v 1 {}'.format('../temp/temp.wav',
300
+ '../temp/result.avi', vid)
301
+ subprocess.call(command, shell=True)
302
+
303
+
304
+ if __name__ == '__main__':
305
+ main()
evaluation/scores_LSE/SyncNetInstance_calc_scores.py ADDED
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+ # Video 25 FPS, Audio 16000HZ
4
+
5
+ import torch
6
+ import numpy
7
+ import time, pdb, argparse, subprocess, os, math, glob
8
+ import cv2
9
+ import python_speech_features
10
+
11
+ from scipy import signal
12
+ from scipy.io import wavfile
13
+ from SyncNetModel import *
14
+ from shutil import rmtree
15
+
16
+
17
+ # ==================== Get OFFSET ====================
18
+
19
+ def calc_pdist(feat1, feat2, vshift=10):
20
+
21
+ win_size = vshift*2+1
22
+
23
+ feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))
24
+
25
+ dists = []
26
+
27
+ for i in range(0,len(feat1)):
28
+
29
+ dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:]))
30
+
31
+ return dists
32
+
33
+ # ==================== MAIN DEF ====================
34
+
35
+ class SyncNetInstance(torch.nn.Module):
36
+
37
+ def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
38
+ super(SyncNetInstance, self).__init__();
39
+
40
+ self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda();
41
+
42
+ def evaluate(self, opt, videofile):
43
+
44
+ self.__S__.eval();
45
+
46
+ # ========== ==========
47
+ # Convert files
48
+ # ========== ==========
49
+
50
+ if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
51
+ rmtree(os.path.join(opt.tmp_dir,opt.reference))
52
+
53
+ os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
54
+
55
+ command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
56
+ output = subprocess.call(command, shell=True, stdout=None)
57
+
58
+ command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
59
+ output = subprocess.call(command, shell=True, stdout=None)
60
+
61
+ # ========== ==========
62
+ # Load video
63
+ # ========== ==========
64
+
65
+ images = []
66
+
67
+ flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
68
+ flist.sort()
69
+
70
+ for fname in flist:
71
+ img_input = cv2.imread(fname)
72
+ img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
73
+ images.append(img_input)
74
+
75
+ im = numpy.stack(images,axis=3)
76
+ im = numpy.expand_dims(im,axis=0)
77
+ im = numpy.transpose(im,(0,3,4,1,2))
78
+
79
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
80
+
81
+ # ========== ==========
82
+ # Load audio
83
+ # ========== ==========
84
+
85
+ sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
86
+ mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
87
+ mfcc = numpy.stack([numpy.array(i) for i in mfcc])
88
+
89
+ cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
90
+ cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())
91
+
92
+ # ========== ==========
93
+ # Check audio and video input length
94
+ # ========== ==========
95
+
96
+ #if (float(len(audio))/16000) != (float(len(images))/25) :
97
+ # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
98
+
99
+ min_length = min(len(images),math.floor(len(audio)/640))
100
+
101
+ # ========== ==========
102
+ # Generate video and audio feats
103
+ # ========== ==========
104
+
105
+ lastframe = min_length-5
106
+ im_feat = []
107
+ cc_feat = []
108
+
109
+ tS = time.time()
110
+ for i in range(0,lastframe,opt.batch_size):
111
+
112
+ im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
113
+ im_in = torch.cat(im_batch,0)
114
+ im_out = self.__S__.forward_lip(im_in.cuda());
115
+ im_feat.append(im_out.data.cpu())
116
+
117
+ cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
118
+ cc_in = torch.cat(cc_batch,0)
119
+ cc_out = self.__S__.forward_aud(cc_in.cuda())
120
+ cc_feat.append(cc_out.data.cpu())
121
+
122
+ im_feat = torch.cat(im_feat,0)
123
+ cc_feat = torch.cat(cc_feat,0)
124
+
125
+ # ========== ==========
126
+ # Compute offset
127
+ # ========== ==========
128
+
129
+ #print('Compute time %.3f sec.' % (time.time()-tS))
130
+
131
+ dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
132
+ mdist = torch.mean(torch.stack(dists,1),1)
133
+
134
+ minval, minidx = torch.min(mdist,0)
135
+
136
+ offset = opt.vshift-minidx
137
+ conf = torch.median(mdist) - minval
138
+
139
+ fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
140
+ # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
141
+ fconf = torch.median(mdist).numpy() - fdist
142
+ fconfm = signal.medfilt(fconf,kernel_size=9)
143
+
144
+ numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
145
+ #print('Framewise conf: ')
146
+ #print(fconfm)
147
+ #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))
148
+
149
+ dists_npy = numpy.array([ dist.numpy() for dist in dists ])
150
+ return offset.numpy(), conf.numpy(), minval.numpy()
151
+
152
+ def extract_feature(self, opt, videofile):
153
+
154
+ self.__S__.eval();
155
+
156
+ # ========== ==========
157
+ # Load video
158
+ # ========== ==========
159
+ cap = cv2.VideoCapture(videofile)
160
+
161
+ frame_num = 1;
162
+ images = []
163
+ while frame_num:
164
+ frame_num += 1
165
+ ret, image = cap.read()
166
+ if ret == 0:
167
+ break
168
+
169
+ images.append(image)
170
+
171
+ im = numpy.stack(images,axis=3)
172
+ im = numpy.expand_dims(im,axis=0)
173
+ im = numpy.transpose(im,(0,3,4,1,2))
174
+
175
+ imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
176
+
177
+ # ========== ==========
178
+ # Generate video feats
179
+ # ========== ==========
180
+
181
+ lastframe = len(images)-4
182
+ im_feat = []
183
+
184
+ tS = time.time()
185
+ for i in range(0,lastframe,opt.batch_size):
186
+
187
+ im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
188
+ im_in = torch.cat(im_batch,0)
189
+ im_out = self.__S__.forward_lipfeat(im_in.cuda());
190
+ im_feat.append(im_out.data.cpu())
191
+
192
+ im_feat = torch.cat(im_feat,0)
193
+
194
+ # ========== ==========
195
+ # Compute offset
196
+ # ========== ==========
197
+
198
+ print('Compute time %.3f sec.' % (time.time()-tS))
199
+
200
+ return im_feat
201
+
202
+
203
+ def loadParameters(self, path):
204
+ loaded_state = torch.load(path, map_location=lambda storage, loc: storage);
205
+
206
+ self_state = self.__S__.state_dict();
207
+
208
+ for name, param in loaded_state.items():
209
+
210
+ self_state[name].copy_(param);
evaluation/scores_LSE/calculate_scores_LRS.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess
5
+ import glob
6
+ import os
7
+ from tqdm import tqdm
8
+
9
+ from SyncNetInstance_calc_scores import *
10
+
11
+ # ==================== LOAD PARAMS ====================
12
+
13
+
14
+ parser = argparse.ArgumentParser(description = "SyncNet");
15
+
16
+ parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
17
+ parser.add_argument('--batch_size', type=int, default='20', help='');
18
+ parser.add_argument('--vshift', type=int, default='15', help='');
19
+ parser.add_argument('--data_root', type=str, required=True, help='');
20
+ parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='');
21
+ parser.add_argument('--reference', type=str, default="demo", help='');
22
+
23
+ opt = parser.parse_args();
24
+
25
+
26
+ # ==================== RUN EVALUATION ====================
27
+
28
+ s = SyncNetInstance();
29
+
30
+ s.loadParameters(opt.initial_model);
31
+ #print("Model %s loaded."%opt.initial_model);
32
+ path = os.path.join(opt.data_root, "*.mp4")
33
+
34
+ all_videos = glob.glob(path)
35
+
36
+ prog_bar = tqdm(range(len(all_videos)))
37
+ avg_confidence = 0.
38
+ avg_min_distance = 0.
39
+
40
+
41
+ for videofile_idx in prog_bar:
42
+ videofile = all_videos[videofile_idx]
43
+ offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
44
+ avg_confidence += confidence
45
+ avg_min_distance += min_distance
46
+ prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3)))
47
+ prog_bar.refresh()
48
+
49
+ print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
50
+ print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
51
+
52
+
53
+
evaluation/scores_LSE/calculate_scores_real_videos.py ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/python
2
+ #-*- coding: utf-8 -*-
3
+
4
+ import time, pdb, argparse, subprocess, pickle, os, gzip, glob
5
+
6
+ from SyncNetInstance_calc_scores import *
7
+
8
+ # ==================== PARSE ARGUMENT ====================
9
+
10
+ parser = argparse.ArgumentParser(description = "SyncNet");
11
+ parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
12
+ parser.add_argument('--batch_size', type=int, default='20', help='');
13
+ parser.add_argument('--vshift', type=int, default='15', help='');
14
+ parser.add_argument('--data_dir', type=str, default='data/work', help='');
15
+ parser.add_argument('--videofile', type=str, default='', help='');
16
+ parser.add_argument('--reference', type=str, default='', help='');
17
+ opt = parser.parse_args();
18
+
19
+ setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
20
+ setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
21
+ setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
22
+ setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
23
+
24
+
25
+ # ==================== LOAD MODEL AND FILE LIST ====================
26
+
27
+ s = SyncNetInstance();
28
+
29
+ s.loadParameters(opt.initial_model);
30
+ #print("Model %s loaded."%opt.initial_model);
31
+
32
+ flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
33
+ flist.sort()
34
+
35
+ # ==================== GET OFFSETS ====================
36
+
37
+ dists = []
38
+ for idx, fname in enumerate(flist):
39
+ offset, conf, dist = s.evaluate(opt,videofile=fname)
40
+ print (str(dist)+" "+str(conf))
41
+
42
+ # ==================== PRINT RESULTS TO FILE ====================
43
+
44
+ #with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
45
+ # pickle.dump(dists, fil)
evaluation/scores_LSE/calculate_scores_real_videos.sh ADDED
@@ -0,0 +1,8 @@
+ rm all_scores.txt
+ yourfilenames=`ls $1`
+
+ for eachfile in $yourfilenames
+ do
+ python run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir
+ python calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir >> all_scores.txt
+ done
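Each line that the loop above appends to `all_scores.txt` holds a distance and a confidence value, as printed by `calculate_scores_real_videos.py` earlier in this diff. The following is a small post-processing sketch (an assumption about how one might summarise the file, not part of the released scripts) that averages them into LSE-D and LSE-C style numbers.

```python
# Sketch: average the per-video scores written by calculate_scores_real_videos.sh.
# Assumes all_scores.txt has one "distance confidence" pair per line.
dists, confs = [], []
with open('all_scores.txt') as f:
    for line in f:
        parts = line.split()
        if len(parts) != 2:
            continue
        d, c = map(float, parts)
        dists.append(d)
        confs.append(c)

if dists:
    print('LSE-D (avg min distance): {:.3f}'.format(sum(dists) / len(dists)))
    print('LSE-C (avg confidence):   {:.3f}'.format(sum(confs) / len(confs)))
```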
evaluation/test_filelists/README.md ADDED
@@ -0,0 +1,13 @@
+ This folder contains the filelists for the new evaluation framework proposed in the paper.
+
+ ## Test filelists for LRS2, LRS3, and LRW
+
+ This folder contains three filelists, each listing the names of the audio-video pairs from the test sets of LRS2, LRS3, and LRW. The LRS2 and LRW filelists are strictly "Copyright BBC" and can only be used for “non-commercial research by applicants who have an agreement with the BBC to access the Lip Reading in the Wild and/or Lip Reading Sentences in the Wild datasets”. Please follow this link for more details: [https://www.bbc.co.uk/rd/projects/lip-reading-datasets](https://www.bbc.co.uk/rd/projects/lip-reading-datasets).
+
+ ## ReSyncED benchmark
+
+ The sub-folder `ReSyncED` contains filelists for our own Real-world lip-Sync Evaluation Dataset (ReSyncED).
+
+ #### Instructions on how to use the above two filelists are available in the README of the parent folder.
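Each ReSyncED filelist below is a plain text file with one whitespace-separated pair per line, read by `real_videos_inference.py` as `video audio` (the LRS-style lists used by `gen_videos_from_filelist.py` are read in the opposite order, `audio video`). A tiny parsing sketch, using one of the filelists in this commit:

```python
# Sketch: iterate over a ReSyncED pair filelist (video first, then audio source),
# matching the parsing in evaluation/real_videos_inference.py.
with open('evaluation/test_filelists/ReSyncED/tts_pairs.txt') as filelist:
    for line in filelist:
        video, audio_src = line.strip().split()
        print('lip-sync {} to {}'.format(video, audio_src))
```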
evaluation/test_filelists/ReSyncED/random_pairs.txt ADDED
@@ -0,0 +1,160 @@
1
+ sachin.mp4 emma_cropped.mp4
2
+ sachin.mp4 mourinho.mp4
3
+ sachin.mp4 elon.mp4
4
+ sachin.mp4 messi2.mp4
5
+ sachin.mp4 cr1.mp4
6
+ sachin.mp4 sachin.mp4
7
+ sachin.mp4 sg.mp4
8
+ sachin.mp4 fergi.mp4
9
+ sachin.mp4 spanish_lec1.mp4
10
+ sachin.mp4 bush_small.mp4
11
+ sachin.mp4 macca_cut.mp4
12
+ sachin.mp4 ca_cropped.mp4
13
+ sachin.mp4 lecun.mp4
14
+ sachin.mp4 spanish_lec0.mp4
15
+ srk.mp4 emma_cropped.mp4
16
+ srk.mp4 mourinho.mp4
17
+ srk.mp4 elon.mp4
18
+ srk.mp4 messi2.mp4
19
+ srk.mp4 cr1.mp4
20
+ srk.mp4 srk.mp4
21
+ srk.mp4 sachin.mp4
22
+ srk.mp4 sg.mp4
23
+ srk.mp4 fergi.mp4
24
+ srk.mp4 spanish_lec1.mp4
25
+ srk.mp4 bush_small.mp4
26
+ srk.mp4 macca_cut.mp4
27
+ srk.mp4 ca_cropped.mp4
28
+ srk.mp4 guardiola.mp4
29
+ srk.mp4 lecun.mp4
30
+ srk.mp4 spanish_lec0.mp4
31
+ cr1.mp4 emma_cropped.mp4
32
+ cr1.mp4 elon.mp4
33
+ cr1.mp4 messi2.mp4
34
+ cr1.mp4 cr1.mp4
35
+ cr1.mp4 spanish_lec1.mp4
36
+ cr1.mp4 bush_small.mp4
37
+ cr1.mp4 macca_cut.mp4
38
+ cr1.mp4 ca_cropped.mp4
39
+ cr1.mp4 lecun.mp4
40
+ cr1.mp4 spanish_lec0.mp4
41
+ macca_cut.mp4 emma_cropped.mp4
42
+ macca_cut.mp4 elon.mp4
43
+ macca_cut.mp4 messi2.mp4
44
+ macca_cut.mp4 spanish_lec1.mp4
45
+ macca_cut.mp4 macca_cut.mp4
46
+ macca_cut.mp4 ca_cropped.mp4
47
+ macca_cut.mp4 spanish_lec0.mp4
48
+ lecun.mp4 emma_cropped.mp4
49
+ lecun.mp4 elon.mp4
50
+ lecun.mp4 messi2.mp4
51
+ lecun.mp4 spanish_lec1.mp4
52
+ lecun.mp4 macca_cut.mp4
53
+ lecun.mp4 ca_cropped.mp4
54
+ lecun.mp4 lecun.mp4
55
+ lecun.mp4 spanish_lec0.mp4
56
+ messi2.mp4 emma_cropped.mp4
57
+ messi2.mp4 elon.mp4
58
+ messi2.mp4 messi2.mp4
59
+ messi2.mp4 spanish_lec1.mp4
60
+ messi2.mp4 macca_cut.mp4
61
+ messi2.mp4 ca_cropped.mp4
62
+ messi2.mp4 spanish_lec0.mp4
63
+ ca_cropped.mp4 emma_cropped.mp4
64
+ ca_cropped.mp4 elon.mp4
65
+ ca_cropped.mp4 spanish_lec1.mp4
66
+ ca_cropped.mp4 ca_cropped.mp4
67
+ ca_cropped.mp4 spanish_lec0.mp4
68
+ spanish_lec1.mp4 spanish_lec1.mp4
69
+ spanish_lec1.mp4 spanish_lec0.mp4
70
+ elon.mp4 elon.mp4
71
+ elon.mp4 spanish_lec1.mp4
72
+ elon.mp4 spanish_lec0.mp4
73
+ guardiola.mp4 emma_cropped.mp4
74
+ guardiola.mp4 mourinho.mp4
75
+ guardiola.mp4 elon.mp4
76
+ guardiola.mp4 messi2.mp4
77
+ guardiola.mp4 cr1.mp4
78
+ guardiola.mp4 sachin.mp4
79
+ guardiola.mp4 sg.mp4
80
+ guardiola.mp4 fergi.mp4
81
+ guardiola.mp4 spanish_lec1.mp4
82
+ guardiola.mp4 bush_small.mp4
83
+ guardiola.mp4 macca_cut.mp4
84
+ guardiola.mp4 ca_cropped.mp4
85
+ guardiola.mp4 guardiola.mp4
86
+ guardiola.mp4 lecun.mp4
87
+ guardiola.mp4 spanish_lec0.mp4
88
+ fergi.mp4 emma_cropped.mp4
89
+ fergi.mp4 mourinho.mp4
90
+ fergi.mp4 elon.mp4
91
+ fergi.mp4 messi2.mp4
92
+ fergi.mp4 cr1.mp4
93
+ fergi.mp4 sachin.mp4
94
+ fergi.mp4 sg.mp4
95
+ fergi.mp4 fergi.mp4
96
+ fergi.mp4 spanish_lec1.mp4
97
+ fergi.mp4 bush_small.mp4
98
+ fergi.mp4 macca_cut.mp4
99
+ fergi.mp4 ca_cropped.mp4
100
+ fergi.mp4 lecun.mp4
101
+ fergi.mp4 spanish_lec0.mp4
102
+ spanish.mp4 emma_cropped.mp4
103
+ spanish.mp4 spanish.mp4
104
+ spanish.mp4 mourinho.mp4
105
+ spanish.mp4 elon.mp4
106
+ spanish.mp4 messi2.mp4
107
+ spanish.mp4 cr1.mp4
108
+ spanish.mp4 srk.mp4
109
+ spanish.mp4 sachin.mp4
110
+ spanish.mp4 sg.mp4
111
+ spanish.mp4 fergi.mp4
112
+ spanish.mp4 spanish_lec1.mp4
113
+ spanish.mp4 bush_small.mp4
114
+ spanish.mp4 macca_cut.mp4
115
+ spanish.mp4 ca_cropped.mp4
116
+ spanish.mp4 guardiola.mp4
117
+ spanish.mp4 lecun.mp4
118
+ spanish.mp4 spanish_lec0.mp4
119
+ bush_small.mp4 emma_cropped.mp4
120
+ bush_small.mp4 elon.mp4
121
+ bush_small.mp4 messi2.mp4
122
+ bush_small.mp4 spanish_lec1.mp4
123
+ bush_small.mp4 bush_small.mp4
124
+ bush_small.mp4 macca_cut.mp4
125
+ bush_small.mp4 ca_cropped.mp4
126
+ bush_small.mp4 lecun.mp4
127
+ bush_small.mp4 spanish_lec0.mp4
128
+ emma_cropped.mp4 emma_cropped.mp4
129
+ emma_cropped.mp4 elon.mp4
130
+ emma_cropped.mp4 spanish_lec1.mp4
131
+ emma_cropped.mp4 spanish_lec0.mp4
132
+ sg.mp4 emma_cropped.mp4
133
+ sg.mp4 mourinho.mp4
134
+ sg.mp4 elon.mp4
135
+ sg.mp4 messi2.mp4
136
+ sg.mp4 cr1.mp4
137
+ sg.mp4 sachin.mp4
138
+ sg.mp4 sg.mp4
139
+ sg.mp4 fergi.mp4
140
+ sg.mp4 spanish_lec1.mp4
141
+ sg.mp4 bush_small.mp4
142
+ sg.mp4 macca_cut.mp4
143
+ sg.mp4 ca_cropped.mp4
144
+ sg.mp4 lecun.mp4
145
+ sg.mp4 spanish_lec0.mp4
146
+ spanish_lec0.mp4 spanish_lec0.mp4
147
+ mourinho.mp4 emma_cropped.mp4
148
+ mourinho.mp4 mourinho.mp4
149
+ mourinho.mp4 elon.mp4
150
+ mourinho.mp4 messi2.mp4
151
+ mourinho.mp4 cr1.mp4
152
+ mourinho.mp4 sachin.mp4
153
+ mourinho.mp4 sg.mp4
154
+ mourinho.mp4 fergi.mp4
155
+ mourinho.mp4 spanish_lec1.mp4
156
+ mourinho.mp4 bush_small.mp4
157
+ mourinho.mp4 macca_cut.mp4
158
+ mourinho.mp4 ca_cropped.mp4
159
+ mourinho.mp4 lecun.mp4
160
+ mourinho.mp4 spanish_lec0.mp4
evaluation/test_filelists/ReSyncED/tts_pairs.txt ADDED
@@ -0,0 +1,18 @@
1
+ adam_1.mp4 andreng_optimization.wav
2
+ agad_2.mp4 agad_2.wav
3
+ agad_1.mp4 agad_1.wav
4
+ agad_3.mp4 agad_3.wav
5
+ rms_prop_1.mp4 rms_prop_tts.wav
6
+ tf_1.mp4 tf_1.wav
7
+ tf_2.mp4 tf_2.wav
8
+ andrew_ng_ai_business.mp4 andrewng_business_tts.wav
9
+ covid_autopsy_1.mp4 autopsy_tts.wav
10
+ news_1.mp4 news_tts.wav
11
+ andrew_ng_fund_1.mp4 andrewng_ai_fund.wav
12
+ covid_treatments_1.mp4 covid_tts.wav
13
+ pytorch_v_tf.mp4 pytorch_vs_tf_eng.wav
14
+ pytorch_1.mp4 pytorch.wav
15
+ pkb_1.mp4 pkb_1.wav
16
+ ss_1.mp4 ss_1.wav
17
+ carlsen_1.mp4 carlsen_eng.wav
18
+ french.mp4 french.wav
evaluation/test_filelists/lrs2.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrs3.txt ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/test_filelists/lrw.txt ADDED
The diff for this file is too large to render. See raw diff
 
face_detection/README.md ADDED
@@ -0,0 +1 @@
+ The code for face detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository, and has been modified to detect faces in batches of frames at a time.
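To make that batching change concrete, here is a minimal usage sketch based on `FaceAlignment.get_detections_for_batch` as defined in `face_detection/api.py` below and as called by the inference and evaluation scripts in this commit; the frame paths are placeholders.

```python
# Sketch: run the batched S3FD face detector over a list of frames.
# Frame paths are hypothetical; all frames must share the same resolution so
# they stack into one batch. get_detections_for_batch returns one
# (x1, y1, x2, y2) box per frame, or None when no face is found.
import cv2
import numpy as np
import torch

import face_detection

device = 'cuda' if torch.cuda.is_available() else 'cpu'
detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                        flip_input=False, device=device)

frames = [cv2.imread(p) for p in ['frame_000.jpg', 'frame_001.jpg']]  # placeholder paths
boxes = detector.get_detections_for_batch(np.array(frames))

for i, box in enumerate(boxes):
    print(i, 'no face detected' if box is None else box)
```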
face_detection/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ __author__ = """Adrian Bulat"""
4
+ __email__ = '[email protected]'
5
+ __version__ = '1.0.1'
6
+
7
+ from .api import FaceAlignment, LandmarksType, NetworkSize
face_detection/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (366 Bytes).
face_detection/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (339 Bytes).
face_detection/__pycache__/api.cpython-313.pyc ADDED
Binary file (3.85 kB).
face_detection/__pycache__/api.cpython-37.pyc ADDED
Binary file (2.7 kB).
face_detection/__pycache__/models.cpython-313.pyc ADDED
Binary file (15.2 kB).
face_detection/__pycache__/models.cpython-37.pyc ADDED
Binary file (7.13 kB).
face_detection/__pycache__/utils.cpython-313.pyc ADDED
Binary file (18.4 kB).
face_detection/__pycache__/utils.cpython-37.pyc ADDED
Binary file (10.1 kB).
face_detection/api.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import print_function
2
+ import os
3
+ import torch
4
+ from torch.utils.model_zoo import load_url
5
+ from enum import Enum
6
+ import numpy as np
7
+ import cv2
8
+ try:
9
+ import urllib.request as request_file
10
+ except BaseException:
11
+ import urllib as request_file
12
+
13
+ from .models import FAN, ResNetDepth
14
+ from .utils import *
15
+
16
+
17
+ class LandmarksType(Enum):
18
+ """Enum class defining the type of landmarks to detect.
19
+
20
+ ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
21
+ ``_2halfD`` - these points represent the projection of the 3D points into a 2D plane
22
+ ``_3D`` - detect the points ``(x,y,z)`` in a 3D space
23
+
24
+ """
25
+ _2D = 1
26
+ _2halfD = 2
27
+ _3D = 3
28
+
29
+
30
+ class NetworkSize(Enum):
31
+ # TINY = 1
32
+ # SMALL = 2
33
+ # MEDIUM = 3
34
+ LARGE = 4
35
+
36
+ def __new__(cls, value):
37
+ member = object.__new__(cls)
38
+ member._value_ = value
39
+ return member
40
+
41
+ def __int__(self):
42
+ return self.value
43
+
44
+ ROOT = os.path.dirname(os.path.abspath(__file__))
45
+
46
+ class FaceAlignment:
47
+ def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
48
+ device='cuda', flip_input=False, face_detector='sfd', verbose=False):
49
+ self.device = device
50
+ self.flip_input = flip_input
51
+ self.landmarks_type = landmarks_type
52
+ self.verbose = verbose
53
+
54
+ network_size = int(network_size)
55
+
56
+ if 'cuda' in device:
57
+ torch.backends.cudnn.benchmark = True
58
+
59
+ # Get the face detector
60
+ face_detector_module = __import__('face_detection.detection.' + face_detector,
61
+ globals(), locals(), [face_detector], 0)
62
+ self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)
63
+
64
+ def get_detections_for_batch(self, images):
65
+ images = images[..., ::-1]
66
+ detected_faces = self.face_detector.detect_from_batch(images.copy())
67
+ results = []
68
+
69
+ for i, d in enumerate(detected_faces):
70
+ if len(d) == 0:
71
+ results.append(None)
72
+ continue
73
+ d = d[0]
74
+ d = np.clip(d, 0, None)
75
+
76
+ x1, y1, x2, y2 = map(int, d[:-1])
77
+ results.append((x1, y1, x2, y2))
78
+
79
+ return results
face_detection/detection/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .core import FaceDetector
face_detection/detection/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (197 Bytes). View file
 
face_detection/detection/__pycache__/core.cpython-37.pyc ADDED
Binary file (4.85 kB). View file
 
face_detection/detection/core.py ADDED
@@ -0,0 +1,130 @@
1
+ import logging
2
+ import glob
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+ import torch
6
+ import cv2
7
+
8
+
9
+ class FaceDetector(object):
10
+ """An abstract class representing a face detector.
11
+
12
+ Any other face detection implementation must subclass it. All subclasses
13
+ must implement ``detect_from_image``, that return a list of detected
14
+ bounding boxes. Optionally, for speed considerations detect from path is
15
+ recommended.
16
+ """
17
+
18
+ def __init__(self, device, verbose):
19
+ self.device = device
20
+ self.verbose = verbose
21
+
22
+ if verbose:
23
+ if 'cpu' in device:
24
+ logger = logging.getLogger(__name__)
25
+ logger.warning("Detection running on CPU, this may be potentially slow.")
26
+
27
+ if 'cpu' not in device and 'cuda' not in device:
28
+ if verbose:
29
+ logger.error("Expected values for device are: {cpu, cuda} but got: %s", device)
30
+ raise ValueError
31
+
32
+ def detect_from_image(self, tensor_or_path):
33
+ """Detects faces in a given image.
34
+
35
+ This function detects the faces present in a provided (usually BGR)
36
+ image. The input can be either the image itself or the path to it.
37
+
38
+ Arguments:
39
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
40
+ to an image or the image itself.
41
+
42
+ Example::
43
+
44
+ >>> path_to_image = 'data/image_01.jpg'
45
+ ... detected_faces = detect_from_image(path_to_image)
46
+ [A list of bounding boxes (x1, y1, x2, y2)]
47
+ >>> image = cv2.imread(path_to_image)
48
+ ... detected_faces = detect_from_image(image)
49
+ [A list of bounding boxes (x1, y1, x2, y2)]
50
+
51
+ """
52
+ raise NotImplementedError
53
+
54
+ def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True):
55
+ """Detects faces from all the images present in a given directory.
56
+
57
+ Arguments:
58
+ path {string} -- a string containing a path that points to the folder containing the images
59
+
60
+ Keyword Arguments:
61
+ extensions {list} -- list of strings containing the extensions to be
62
+ considered, in the following format: ``.extension_name`` (default:
63
+ {['.jpg', '.png']}) recursive {bool} -- whether to scan the
64
+ folder recursively (default: {False}) show_progress_bar {bool} --
65
+ display a progressbar (default: {True})
66
+
67
+ Example:
68
+ >>> directory = 'data'
69
+ ... detected_faces = detect_from_directory(directory)
70
+ {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}
71
+
72
+ """
73
+ if self.verbose:
74
+ logger = logging.getLogger(__name__)
75
+
76
+ if len(extensions) == 0:
77
+ if self.verbose:
78
+ logger.error("Expected at list one extension, but none was received.")
79
+ raise ValueError
80
+
81
+ if self.verbose:
82
+ logger.info("Constructing the list of images.")
83
+ additional_pattern = '/**/*' if recursive else '/*'
84
+ files = []
85
+ for extension in extensions:
86
+ files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))
87
+
88
+ if self.verbose:
89
+ logger.info("Finished searching for images. %s images found", len(files))
90
+ logger.info("Preparing to run the detection.")
91
+
92
+ predictions = {}
93
+ for image_path in tqdm(files, disable=not show_progress_bar):
94
+ if self.verbose:
95
+ logger.info("Running the face detector on image: %s", image_path)
96
+ predictions[image_path] = self.detect_from_image(image_path)
97
+
98
+ if self.verbose:
99
+ logger.info("The detector was successfully run on all %s images", len(files))
100
+
101
+ return predictions
102
+
103
+ @property
104
+ def reference_scale(self):
105
+ raise NotImplementedError
106
+
107
+ @property
108
+ def reference_x_shift(self):
109
+ raise NotImplementedError
110
+
111
+ @property
112
+ def reference_y_shift(self):
113
+ raise NotImplementedError
114
+
115
+ @staticmethod
116
+ def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
117
+ """Convert path (represented as a string) or torch.tensor to a numpy.ndarray
118
+
119
+ Arguments:
120
+ tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
121
+ """
122
+ if isinstance(tensor_or_path, str):
123
+ return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
124
+ elif torch.is_tensor(tensor_or_path):
125
+ # Call cpu in case its coming from cuda
126
+ return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
127
+ elif isinstance(tensor_or_path, np.ndarray):
128
+ return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
129
+ else:
130
+ raise TypeError
face_detection/detection/sfd/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .sfd_detector import SFDDetector as FaceDetector
face_detection/detection/sfd/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (222 Bytes). View file
 
face_detection/detection/sfd/__pycache__/bbox.cpython-37.pyc ADDED
Binary file (4.67 kB). View file
 
face_detection/detection/sfd/__pycache__/detect.cpython-37.pyc ADDED
Binary file (3.77 kB). View file
 
face_detection/detection/sfd/__pycache__/net_s3fd.cpython-37.pyc ADDED
Binary file (3.88 kB). View file
 
face_detection/detection/sfd/__pycache__/sfd_detector.cpython-37.pyc ADDED
Binary file (2.95 kB). View file
 
face_detection/detection/sfd/bbox.py ADDED
@@ -0,0 +1,129 @@
1
+ from __future__ import print_function
2
+ import os
3
+ import sys
4
+ import cv2
5
+ import random
6
+ import datetime
7
+ import time
8
+ import math
9
+ import argparse
10
+ import numpy as np
11
+ import torch
12
+
13
+ try:
14
+ from iou import IOU
15
+ except BaseException:
16
+ # IOU cython speedup 10x
17
+ def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
18
+ sa = abs((ax2 - ax1) * (ay2 - ay1))
19
+ sb = abs((bx2 - bx1) * (by2 - by1))
20
+ x1, y1 = max(ax1, bx1), max(ay1, by1)
21
+ x2, y2 = min(ax2, bx2), min(ay2, by2)
22
+ w = x2 - x1
23
+ h = y2 - y1
24
+ if w < 0 or h < 0:
25
+ return 0.0
26
+ else:
27
+ return 1.0 * w * h / (sa + sb - w * h)
28
+
29
+
30
+ def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
31
+ xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
32
+ dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
33
+ dw, dh = math.log(ww / aww), math.log(hh / ahh)
34
+ return dx, dy, dw, dh
35
+
36
+
37
+ def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
38
+ xc, yc = dx * aww + axc, dy * ahh + ayc
39
+ ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
40
+ x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
41
+ return x1, y1, x2, y2
42
+
43
+
44
+ def nms(dets, thresh):
45
+ if 0 == len(dets):
46
+ return []
47
+ x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
48
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
49
+ order = scores.argsort()[::-1]
50
+
51
+ keep = []
52
+ while order.size > 0:
53
+ i = order[0]
54
+ keep.append(i)
55
+ xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
56
+ xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])
57
+
58
+ w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
59
+ ovr = w * h / (areas[i] + areas[order[1:]] - w * h)
60
+
61
+ inds = np.where(ovr <= thresh)[0]
62
+ order = order[inds + 1]
63
+
64
+ return keep
65
+
66
+
67
+ def encode(matched, priors, variances):
68
+ """Encode the variances from the priorbox layers into the ground truth boxes
69
+ we have matched (based on jaccard overlap) with the prior boxes.
70
+ Args:
71
+ matched: (tensor) Coords of ground truth for each prior in point-form
72
+ Shape: [num_priors, 4].
73
+ priors: (tensor) Prior boxes in center-offset form
74
+ Shape: [num_priors,4].
75
+ variances: (list[float]) Variances of priorboxes
76
+ Return:
77
+ encoded boxes (tensor), Shape: [num_priors, 4]
78
+ """
79
+
80
+ # dist b/t match center and prior's center
81
+ g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
82
+ # encode variance
83
+ g_cxcy /= (variances[0] * priors[:, 2:])
84
+ # match wh / prior wh
85
+ g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
86
+ g_wh = torch.log(g_wh) / variances[1]
87
+ # return target for smooth_l1_loss
88
+ return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4]
89
+
90
+
91
+ def decode(loc, priors, variances):
92
+ """Decode locations from predictions using priors to undo
93
+ the encoding we did for offset regression at train time.
94
+ Args:
95
+ loc (tensor): location predictions for loc layers,
96
+ Shape: [num_priors,4]
97
+ priors (tensor): Prior boxes in center-offset form.
98
+ Shape: [num_priors,4].
99
+ variances: (list[float]) Variances of priorboxes
100
+ Return:
101
+ decoded bounding box predictions
102
+ """
103
+
104
+ boxes = torch.cat((
105
+ priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
106
+ priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
107
+ boxes[:, :2] -= boxes[:, 2:] / 2
108
+ boxes[:, 2:] += boxes[:, :2]
109
+ return boxes
110
+
111
+ def batch_decode(loc, priors, variances):
112
+ """Decode locations from predictions using priors to undo
113
+ the encoding we did for offset regression at train time.
114
+ Args:
115
+ loc (tensor): location predictions for loc layers,
116
+ Shape: [num_priors,4]
117
+ priors (tensor): Prior boxes in center-offset form.
118
+ Shape: [num_priors,4].
119
+ variances: (list[float]) Variances of priorboxes
120
+ Return:
121
+ decoded bounding box predictions
122
+ """
123
+
124
+ boxes = torch.cat((
125
+ priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:],
126
+ priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
127
+ boxes[:, :, :2] -= boxes[:, :, 2:] / 2
128
+ boxes[:, :, 2:] += boxes[:, :, :2]
129
+ return boxes
face_detection/detection/sfd/detect.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ import os
5
+ import sys
6
+ import cv2
7
+ import random
8
+ import datetime
9
+ import math
10
+ import argparse
11
+ import numpy as np
12
+
13
+ import scipy.io as sio
14
+ import zipfile
15
+ from .net_s3fd import s3fd
16
+ from .bbox import *
17
+
18
+
19
+ def detect(net, img, device):
20
+ img = img - np.array([104, 117, 123])
21
+ img = img.transpose(2, 0, 1)
22
+ img = img.reshape((1,) + img.shape)
23
+
24
+ if 'cuda' in device:
25
+ torch.backends.cudnn.benchmark = True
26
+
27
+ img = torch.from_numpy(img).float().to(device)
28
+ BB, CC, HH, WW = img.size()
29
+ with torch.no_grad():
30
+ olist = net(img)
31
+
32
+ bboxlist = []
33
+ for i in range(len(olist) // 2):
34
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
35
+ olist = [oelem.data.cpu() for oelem in olist]
36
+ for i in range(len(olist) // 2):
37
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
38
+ FB, FC, FH, FW = ocls.size() # feature map size
39
+ stride = 2**(i + 2) # 4,8,16,32,64,128
40
+ anchor = stride * 4
41
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
42
+ for Iindex, hindex, windex in poss:
43
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
44
+ score = ocls[0, 1, hindex, windex]
45
+ loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
46
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
47
+ variances = [0.1, 0.2]
48
+ box = decode(loc, priors, variances)
49
+ x1, y1, x2, y2 = box[0] * 1.0
50
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
51
+ bboxlist.append([x1, y1, x2, y2, score])
52
+ bboxlist = np.array(bboxlist)
53
+ if 0 == len(bboxlist):
54
+ bboxlist = np.zeros((1, 5))
55
+
56
+ return bboxlist
57
+
58
+ def batch_detect(net, imgs, device):
59
+ imgs = imgs - np.array([104, 117, 123])
60
+ imgs = imgs.transpose(0, 3, 1, 2)
61
+
62
+ if 'cuda' in device:
63
+ torch.backends.cudnn.benchmark = True
64
+
65
+ imgs = torch.from_numpy(imgs).float().to(device)
66
+ BB, CC, HH, WW = imgs.size()
67
+ with torch.no_grad():
68
+ olist = net(imgs)
69
+
70
+ bboxlist = []
71
+ for i in range(len(olist) // 2):
72
+ olist[i * 2] = F.softmax(olist[i * 2], dim=1)
73
+ olist = [oelem.data.cpu() for oelem in olist]
74
+ for i in range(len(olist) // 2):
75
+ ocls, oreg = olist[i * 2], olist[i * 2 + 1]
76
+ FB, FC, FH, FW = ocls.size() # feature map size
77
+ stride = 2**(i + 2) # 4,8,16,32,64,128
78
+ anchor = stride * 4
79
+ poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
80
+ for Iindex, hindex, windex in poss:
81
+ axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
82
+ score = ocls[:, 1, hindex, windex]
83
+ loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
84
+ priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
85
+ variances = [0.1, 0.2]
86
+ box = batch_decode(loc, priors, variances)
87
+ box = box[:, 0] * 1.0
88
+ # cv2.rectangle(imgshow,(int(x1),int(y1)),(int(x2),int(y2)),(0,0,255),1)
89
+ bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
90
+ bboxlist = np.array(bboxlist)
91
+ if 0 == len(bboxlist):
92
+ bboxlist = np.zeros((1, BB, 5))
93
+
94
+ return bboxlist
95
+
96
+ def flip_detect(net, img, device):
97
+ img = cv2.flip(img, 1)
98
+ b = detect(net, img, device)
99
+
100
+ bboxlist = np.zeros(b.shape)
101
+ bboxlist[:, 0] = img.shape[1] - b[:, 2]
102
+ bboxlist[:, 1] = b[:, 1]
103
+ bboxlist[:, 2] = img.shape[1] - b[:, 0]
104
+ bboxlist[:, 3] = b[:, 3]
105
+ bboxlist[:, 4] = b[:, 4]
106
+ return bboxlist
107
+
108
+
109
+ def pts_to_bb(pts):
110
+ min_x, min_y = np.min(pts, axis=0)
111
+ max_x, max_y = np.max(pts, axis=0)
112
+ return np.array([min_x, min_y, max_x, max_y])
face_detection/detection/sfd/net_s3fd.py ADDED
@@ -0,0 +1,129 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ class L2Norm(nn.Module):
7
+ def __init__(self, n_channels, scale=1.0):
8
+ super(L2Norm, self).__init__()
9
+ self.n_channels = n_channels
10
+ self.scale = scale
11
+ self.eps = 1e-10
12
+ self.weight = nn.Parameter(torch.Tensor(self.n_channels))
13
+ self.weight.data *= 0.0
14
+ self.weight.data += self.scale
15
+
16
+ def forward(self, x):
17
+ norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
18
+ x = x / norm * self.weight.view(1, -1, 1, 1)
19
+ return x
20
+
21
+
22
+ class s3fd(nn.Module):
23
+ def __init__(self):
24
+ super(s3fd, self).__init__()
25
+ self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
26
+ self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
27
+
28
+ self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
29
+ self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
30
+
31
+ self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
32
+ self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
33
+ self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
34
+
35
+ self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
36
+ self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
37
+ self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
38
+
39
+ self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
40
+ self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
41
+ self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
42
+
43
+ self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3)
44
+ self.fc7 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0)
45
+
46
+ self.conv6_1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
47
+ self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
48
+
49
+ self.conv7_1 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0)
50
+ self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
51
+
52
+ self.conv3_3_norm = L2Norm(256, scale=10)
53
+ self.conv4_3_norm = L2Norm(512, scale=8)
54
+ self.conv5_3_norm = L2Norm(512, scale=5)
55
+
56
+ self.conv3_3_norm_mbox_conf = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
57
+ self.conv3_3_norm_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
58
+ self.conv4_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
59
+ self.conv4_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
60
+ self.conv5_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
61
+ self.conv5_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
62
+
63
+ self.fc7_mbox_conf = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1)
64
+ self.fc7_mbox_loc = nn.Conv2d(1024, 4, kernel_size=3, stride=1, padding=1)
65
+ self.conv6_2_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
66
+ self.conv6_2_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
67
+ self.conv7_2_mbox_conf = nn.Conv2d(256, 2, kernel_size=3, stride=1, padding=1)
68
+ self.conv7_2_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
69
+
70
+ def forward(self, x):
71
+ h = F.relu(self.conv1_1(x))
72
+ h = F.relu(self.conv1_2(h))
73
+ h = F.max_pool2d(h, 2, 2)
74
+
75
+ h = F.relu(self.conv2_1(h))
76
+ h = F.relu(self.conv2_2(h))
77
+ h = F.max_pool2d(h, 2, 2)
78
+
79
+ h = F.relu(self.conv3_1(h))
80
+ h = F.relu(self.conv3_2(h))
81
+ h = F.relu(self.conv3_3(h))
82
+ f3_3 = h
83
+ h = F.max_pool2d(h, 2, 2)
84
+
85
+ h = F.relu(self.conv4_1(h))
86
+ h = F.relu(self.conv4_2(h))
87
+ h = F.relu(self.conv4_3(h))
88
+ f4_3 = h
89
+ h = F.max_pool2d(h, 2, 2)
90
+
91
+ h = F.relu(self.conv5_1(h))
92
+ h = F.relu(self.conv5_2(h))
93
+ h = F.relu(self.conv5_3(h))
94
+ f5_3 = h
95
+ h = F.max_pool2d(h, 2, 2)
96
+
97
+ h = F.relu(self.fc6(h))
98
+ h = F.relu(self.fc7(h))
99
+ ffc7 = h
100
+ h = F.relu(self.conv6_1(h))
101
+ h = F.relu(self.conv6_2(h))
102
+ f6_2 = h
103
+ h = F.relu(self.conv7_1(h))
104
+ h = F.relu(self.conv7_2(h))
105
+ f7_2 = h
106
+
107
+ f3_3 = self.conv3_3_norm(f3_3)
108
+ f4_3 = self.conv4_3_norm(f4_3)
109
+ f5_3 = self.conv5_3_norm(f5_3)
110
+
111
+ cls1 = self.conv3_3_norm_mbox_conf(f3_3)
112
+ reg1 = self.conv3_3_norm_mbox_loc(f3_3)
113
+ cls2 = self.conv4_3_norm_mbox_conf(f4_3)
114
+ reg2 = self.conv4_3_norm_mbox_loc(f4_3)
115
+ cls3 = self.conv5_3_norm_mbox_conf(f5_3)
116
+ reg3 = self.conv5_3_norm_mbox_loc(f5_3)
117
+ cls4 = self.fc7_mbox_conf(ffc7)
118
+ reg4 = self.fc7_mbox_loc(ffc7)
119
+ cls5 = self.conv6_2_mbox_conf(f6_2)
120
+ reg5 = self.conv6_2_mbox_loc(f6_2)
121
+ cls6 = self.conv7_2_mbox_conf(f7_2)
122
+ reg6 = self.conv7_2_mbox_loc(f7_2)
123
+
124
+ # max-out background label
125
+ chunk = torch.chunk(cls1, 4, 1)
126
+ bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2])
127
+ cls1 = torch.cat([bmax, chunk[3]], dim=1)
128
+
129
+ return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6]
face_detection/detection/sfd/s3fd-619a316812.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619a31681264d3f7f7fc7a16a42cbbe8b23f31a256f75a366e5a1bcd59b33543
3
+ size 89843225
face_detection/detection/sfd/sfd_detector.py ADDED
@@ -0,0 +1,59 @@
1
+ import os
2
+ import cv2
3
+ from torch.utils.model_zoo import load_url
4
+
5
+ from ..core import FaceDetector
6
+
7
+ from .net_s3fd import s3fd
8
+ from .bbox import *
9
+ from .detect import *
10
+
11
+ models_urls = {
12
+ 's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth',
13
+ }
14
+
15
+
16
+ class SFDDetector(FaceDetector):
17
+ def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False):
18
+ super(SFDDetector, self).__init__(device, verbose)
19
+
20
+ # Initialise the face detector
21
+ if not os.path.isfile(path_to_detector):
22
+ model_weights = load_url(models_urls['s3fd'])
23
+ else:
24
+ model_weights = torch.load(path_to_detector)
25
+
26
+ self.face_detector = s3fd()
27
+ self.face_detector.load_state_dict(model_weights)
28
+ self.face_detector.to(device)
29
+ self.face_detector.eval()
30
+
31
+ def detect_from_image(self, tensor_or_path):
32
+ image = self.tensor_or_path_to_ndarray(tensor_or_path)
33
+
34
+ bboxlist = detect(self.face_detector, image, device=self.device)
35
+ keep = nms(bboxlist, 0.3)
36
+ bboxlist = bboxlist[keep, :]
37
+ bboxlist = [x for x in bboxlist if x[-1] > 0.5]
38
+
39
+ return bboxlist
40
+
41
+ def detect_from_batch(self, images):
42
+ bboxlists = batch_detect(self.face_detector, images, device=self.device)
43
+ keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
44
+ bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
45
+ bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
46
+
47
+ return bboxlists
48
+
49
+ @property
50
+ def reference_scale(self):
51
+ return 195
52
+
53
+ @property
54
+ def reference_x_shift(self):
55
+ return 0
56
+
57
+ @property
58
+ def reference_y_shift(self):
59
+ return 0
face_detection/models.py ADDED
@@ -0,0 +1,261 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+
6
+
7
+ def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
8
+ "3x3 convolution with padding"
9
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3,
10
+ stride=strd, padding=padding, bias=bias)
11
+
12
+
13
+ class ConvBlock(nn.Module):
14
+ def __init__(self, in_planes, out_planes):
15
+ super(ConvBlock, self).__init__()
16
+ self.bn1 = nn.BatchNorm2d(in_planes)
17
+ self.conv1 = conv3x3(in_planes, int(out_planes / 2))
18
+ self.bn2 = nn.BatchNorm2d(int(out_planes / 2))
19
+ self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
20
+ self.bn3 = nn.BatchNorm2d(int(out_planes / 4))
21
+ self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))
22
+
23
+ if in_planes != out_planes:
24
+ self.downsample = nn.Sequential(
25
+ nn.BatchNorm2d(in_planes),
26
+ nn.ReLU(True),
27
+ nn.Conv2d(in_planes, out_planes,
28
+ kernel_size=1, stride=1, bias=False),
29
+ )
30
+ else:
31
+ self.downsample = None
32
+
33
+ def forward(self, x):
34
+ residual = x
35
+
36
+ out1 = self.bn1(x)
37
+ out1 = F.relu(out1, True)
38
+ out1 = self.conv1(out1)
39
+
40
+ out2 = self.bn2(out1)
41
+ out2 = F.relu(out2, True)
42
+ out2 = self.conv2(out2)
43
+
44
+ out3 = self.bn3(out2)
45
+ out3 = F.relu(out3, True)
46
+ out3 = self.conv3(out3)
47
+
48
+ out3 = torch.cat((out1, out2, out3), 1)
49
+
50
+ if self.downsample is not None:
51
+ residual = self.downsample(residual)
52
+
53
+ out3 += residual
54
+
55
+ return out3
56
+
57
+
58
+ class Bottleneck(nn.Module):
59
+
60
+ expansion = 4
61
+
62
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
63
+ super(Bottleneck, self).__init__()
64
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
65
+ self.bn1 = nn.BatchNorm2d(planes)
66
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
67
+ padding=1, bias=False)
68
+ self.bn2 = nn.BatchNorm2d(planes)
69
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
70
+ self.bn3 = nn.BatchNorm2d(planes * 4)
71
+ self.relu = nn.ReLU(inplace=True)
72
+ self.downsample = downsample
73
+ self.stride = stride
74
+
75
+ def forward(self, x):
76
+ residual = x
77
+
78
+ out = self.conv1(x)
79
+ out = self.bn1(out)
80
+ out = self.relu(out)
81
+
82
+ out = self.conv2(out)
83
+ out = self.bn2(out)
84
+ out = self.relu(out)
85
+
86
+ out = self.conv3(out)
87
+ out = self.bn3(out)
88
+
89
+ if self.downsample is not None:
90
+ residual = self.downsample(x)
91
+
92
+ out += residual
93
+ out = self.relu(out)
94
+
95
+ return out
96
+
97
+
98
+ class HourGlass(nn.Module):
99
+ def __init__(self, num_modules, depth, num_features):
100
+ super(HourGlass, self).__init__()
101
+ self.num_modules = num_modules
102
+ self.depth = depth
103
+ self.features = num_features
104
+
105
+ self._generate_network(self.depth)
106
+
107
+ def _generate_network(self, level):
108
+ self.add_module('b1_' + str(level), ConvBlock(self.features, self.features))
109
+
110
+ self.add_module('b2_' + str(level), ConvBlock(self.features, self.features))
111
+
112
+ if level > 1:
113
+ self._generate_network(level - 1)
114
+ else:
115
+ self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features))
116
+
117
+ self.add_module('b3_' + str(level), ConvBlock(self.features, self.features))
118
+
119
+ def _forward(self, level, inp):
120
+ # Upper branch
121
+ up1 = inp
122
+ up1 = self._modules['b1_' + str(level)](up1)
123
+
124
+ # Lower branch
125
+ low1 = F.avg_pool2d(inp, 2, stride=2)
126
+ low1 = self._modules['b2_' + str(level)](low1)
127
+
128
+ if level > 1:
129
+ low2 = self._forward(level - 1, low1)
130
+ else:
131
+ low2 = low1
132
+ low2 = self._modules['b2_plus_' + str(level)](low2)
133
+
134
+ low3 = low2
135
+ low3 = self._modules['b3_' + str(level)](low3)
136
+
137
+ up2 = F.interpolate(low3, scale_factor=2, mode='nearest')
138
+
139
+ return up1 + up2
140
+
141
+ def forward(self, x):
142
+ return self._forward(self.depth, x)
143
+
144
+
145
+ class FAN(nn.Module):
146
+
147
+ def __init__(self, num_modules=1):
148
+ super(FAN, self).__init__()
149
+ self.num_modules = num_modules
150
+
151
+ # Base part
152
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
153
+ self.bn1 = nn.BatchNorm2d(64)
154
+ self.conv2 = ConvBlock(64, 128)
155
+ self.conv3 = ConvBlock(128, 128)
156
+ self.conv4 = ConvBlock(128, 256)
157
+
158
+ # Stacking part
159
+ for hg_module in range(self.num_modules):
160
+ self.add_module('m' + str(hg_module), HourGlass(1, 4, 256))
161
+ self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256))
162
+ self.add_module('conv_last' + str(hg_module),
163
+ nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
164
+ self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256))
165
+ self.add_module('l' + str(hg_module), nn.Conv2d(256,
166
+ 68, kernel_size=1, stride=1, padding=0))
167
+
168
+ if hg_module < self.num_modules - 1:
169
+ self.add_module(
170
+ 'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
171
+ self.add_module('al' + str(hg_module), nn.Conv2d(68,
172
+ 256, kernel_size=1, stride=1, padding=0))
173
+
174
+ def forward(self, x):
175
+ x = F.relu(self.bn1(self.conv1(x)), True)
176
+ x = F.avg_pool2d(self.conv2(x), 2, stride=2)
177
+ x = self.conv3(x)
178
+ x = self.conv4(x)
179
+
180
+ previous = x
181
+
182
+ outputs = []
183
+ for i in range(self.num_modules):
184
+ hg = self._modules['m' + str(i)](previous)
185
+
186
+ ll = hg
187
+ ll = self._modules['top_m_' + str(i)](ll)
188
+
189
+ ll = F.relu(self._modules['bn_end' + str(i)]
190
+ (self._modules['conv_last' + str(i)](ll)), True)
191
+
192
+ # Predict heatmaps
193
+ tmp_out = self._modules['l' + str(i)](ll)
194
+ outputs.append(tmp_out)
195
+
196
+ if i < self.num_modules - 1:
197
+ ll = self._modules['bl' + str(i)](ll)
198
+ tmp_out_ = self._modules['al' + str(i)](tmp_out)
199
+ previous = previous + ll + tmp_out_
200
+
201
+ return outputs
202
+
203
+
204
+ class ResNetDepth(nn.Module):
205
+
206
+ def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68):
207
+ self.inplanes = 64
208
+ super(ResNetDepth, self).__init__()
209
+ self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3,
210
+ bias=False)
211
+ self.bn1 = nn.BatchNorm2d(64)
212
+ self.relu = nn.ReLU(inplace=True)
213
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
214
+ self.layer1 = self._make_layer(block, 64, layers[0])
215
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
216
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
217
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
218
+ self.avgpool = nn.AvgPool2d(7)
219
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
220
+
221
+ for m in self.modules():
222
+ if isinstance(m, nn.Conv2d):
223
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
224
+ m.weight.data.normal_(0, math.sqrt(2. / n))
225
+ elif isinstance(m, nn.BatchNorm2d):
226
+ m.weight.data.fill_(1)
227
+ m.bias.data.zero_()
228
+
229
+ def _make_layer(self, block, planes, blocks, stride=1):
230
+ downsample = None
231
+ if stride != 1 or self.inplanes != planes * block.expansion:
232
+ downsample = nn.Sequential(
233
+ nn.Conv2d(self.inplanes, planes * block.expansion,
234
+ kernel_size=1, stride=stride, bias=False),
235
+ nn.BatchNorm2d(planes * block.expansion),
236
+ )
237
+
238
+ layers = []
239
+ layers.append(block(self.inplanes, planes, stride, downsample))
240
+ self.inplanes = planes * block.expansion
241
+ for i in range(1, blocks):
242
+ layers.append(block(self.inplanes, planes))
243
+
244
+ return nn.Sequential(*layers)
245
+
246
+ def forward(self, x):
247
+ x = self.conv1(x)
248
+ x = self.bn1(x)
249
+ x = self.relu(x)
250
+ x = self.maxpool(x)
251
+
252
+ x = self.layer1(x)
253
+ x = self.layer2(x)
254
+ x = self.layer3(x)
255
+ x = self.layer4(x)
256
+
257
+ x = self.avgpool(x)
258
+ x = x.view(x.size(0), -1)
259
+ x = self.fc(x)
260
+
261
+ return x
face_detection/utils.py ADDED
@@ -0,0 +1,313 @@
1
+ from __future__ import print_function
2
+ import os
3
+ import sys
4
+ import time
5
+ import torch
6
+ import math
7
+ import numpy as np
8
+ import cv2
9
+
10
+
11
+ def _gaussian(
12
+ size=3, sigma=0.25, amplitude=1, normalize=False, width=None,
13
+ height=None, sigma_horz=None, sigma_vert=None, mean_horz=0.5,
14
+ mean_vert=0.5):
15
+ # handle some defaults
16
+ if width is None:
17
+ width = size
18
+ if height is None:
19
+ height = size
20
+ if sigma_horz is None:
21
+ sigma_horz = sigma
22
+ if sigma_vert is None:
23
+ sigma_vert = sigma
24
+ center_x = mean_horz * width + 0.5
25
+ center_y = mean_vert * height + 0.5
26
+ gauss = np.empty((height, width), dtype=np.float32)
27
+ # generate kernel
28
+ for i in range(height):
29
+ for j in range(width):
30
+ gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / (
31
+ sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0))
32
+ if normalize:
33
+ gauss = gauss / np.sum(gauss)
34
+ return gauss
35
+
36
+
37
+ def draw_gaussian(image, point, sigma):
38
+ # Check if the gaussian is inside
39
+ ul = [math.floor(point[0] - 3 * sigma), math.floor(point[1] - 3 * sigma)]
40
+ br = [math.floor(point[0] + 3 * sigma), math.floor(point[1] + 3 * sigma)]
41
+ if (ul[0] > image.shape[1] or ul[1] > image.shape[0] or br[0] < 1 or br[1] < 1):
42
+ return image
43
+ size = 6 * sigma + 1
44
+ g = _gaussian(size)
45
+ g_x = [int(max(1, -ul[0])), int(min(br[0], image.shape[1])) - int(max(1, ul[0])) + int(max(1, -ul[0]))]
46
+ g_y = [int(max(1, -ul[1])), int(min(br[1], image.shape[0])) - int(max(1, ul[1])) + int(max(1, -ul[1]))]
47
+ img_x = [int(max(1, ul[0])), int(min(br[0], image.shape[1]))]
48
+ img_y = [int(max(1, ul[1])), int(min(br[1], image.shape[0]))]
49
+ assert (g_x[0] > 0 and g_y[1] > 0)
50
+ image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]
51
+ ] = image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] + g[g_y[0] - 1:g_y[1], g_x[0] - 1:g_x[1]]
52
+ image[image > 1] = 1
53
+ return image
54
+
55
+
56
+ def transform(point, center, scale, resolution, invert=False):
57
+ """Generate and affine transformation matrix.
58
+
59
+ Given a set of points, a center, a scale and a target resolution, the
60
+ function generates an affine transformation matrix. If invert is ``True``
61
+ it will produce the inverse transformation.
62
+
63
+ Arguments:
64
+ point {torch.tensor} -- the input 2D point
65
+ center {torch.tensor or numpy.array} -- the center around which to perform the transformations
66
+ scale {float} -- the scale of the face/object
67
+ resolution {float} -- the output resolution
68
+
69
+ Keyword Arguments:
70
+ invert {bool} -- define whether the function should produce the direct or the
71
+ inverse transformation matrix (default: {False})
72
+ """
73
+ _pt = torch.ones(3)
74
+ _pt[0] = point[0]
75
+ _pt[1] = point[1]
76
+
77
+ h = 200.0 * scale
78
+ t = torch.eye(3)
79
+ t[0, 0] = resolution / h
80
+ t[1, 1] = resolution / h
81
+ t[0, 2] = resolution * (-center[0] / h + 0.5)
82
+ t[1, 2] = resolution * (-center[1] / h + 0.5)
83
+
84
+ if invert:
85
+ t = torch.inverse(t)
86
+
87
+ new_point = (torch.matmul(t, _pt))[0:2]
88
+
89
+ return new_point.int()
90
+
91
+
92
+ def crop(image, center, scale, resolution=256.0):
93
+ """Center crops an image or set of heatmaps
94
+
95
+ Arguments:
96
+ image {numpy.array} -- an rgb image
97
+ center {numpy.array} -- the center of the object, usually the same as of the bounding box
98
+ scale {float} -- scale of the face
99
+
100
+ Keyword Arguments:
101
+ resolution {float} -- the size of the output cropped image (default: {256.0})
102
+
103
+ Returns:
104
+ numpy.ndarray -- the cropped image
105
+ """ # Crop around the center point
106
+ """ Crops the image around the center. Input is expected to be an np.ndarray """
107
+ ul = transform([1, 1], center, scale, resolution, True)
108
+ br = transform([resolution, resolution], center, scale, resolution, True)
109
+ # pad = math.ceil(torch.norm((ul - br).float()) / 2.0 - (br[0] - ul[0]) / 2.0)
110
+ if image.ndim > 2:
111
+ newDim = np.array([br[1] - ul[1], br[0] - ul[0],
112
+ image.shape[2]], dtype=np.int32)
113
+ newImg = np.zeros(newDim, dtype=np.uint8)
114
+ else:
115
+ newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int32)  # np.int is removed in newer NumPy
116
+ newImg = np.zeros(newDim, dtype=np.uint8)
117
+ ht = image.shape[0]
118
+ wd = image.shape[1]
119
+ newX = np.array(
120
+ [max(1, -ul[0] + 1), min(br[0], wd) - ul[0]], dtype=np.int32)
121
+ newY = np.array(
122
+ [max(1, -ul[1] + 1), min(br[1], ht) - ul[1]], dtype=np.int32)
123
+ oldX = np.array([max(1, ul[0] + 1), min(br[0], wd)], dtype=np.int32)
124
+ oldY = np.array([max(1, ul[1] + 1), min(br[1], ht)], dtype=np.int32)
125
+ newImg[newY[0] - 1:newY[1], newX[0] - 1:newX[1]
126
+ ] = image[oldY[0] - 1:oldY[1], oldX[0] - 1:oldX[1], :]
127
+ newImg = cv2.resize(newImg, dsize=(int(resolution), int(resolution)),
128
+ interpolation=cv2.INTER_LINEAR)
129
+ return newImg
130
+
131
+
132
+ def get_preds_fromhm(hm, center=None, scale=None):
133
+ """Obtain (x,y) coordinates given a set of N heatmaps. If the center
134
+ and the scale is provided the function will return the points also in
135
+ the original coordinate frame.
136
+
137
+ Arguments:
138
+ hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H]
139
+
140
+ Keyword Arguments:
141
+ center {torch.tensor} -- the center of the bounding box (default: {None})
142
+ scale {float} -- face scale (default: {None})
143
+ """
144
+ max, idx = torch.max(
145
+ hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2)
146
+ idx += 1
147
+ preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
148
+ preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1)
149
+ preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1)
150
+
151
+ for i in range(preds.size(0)):
152
+ for j in range(preds.size(1)):
153
+ hm_ = hm[i, j, :]
154
+ pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
155
+ if pX > 0 and pX < 63 and pY > 0 and pY < 63:
156
+ diff = torch.FloatTensor(
157
+ [hm_[pY, pX + 1] - hm_[pY, pX - 1],
158
+ hm_[pY + 1, pX] - hm_[pY - 1, pX]])
159
+ preds[i, j].add_(diff.sign_().mul_(.25))
160
+
161
+ preds.add_(-.5)
162
+
163
+ preds_orig = torch.zeros(preds.size())
164
+ if center is not None and scale is not None:
165
+ for i in range(hm.size(0)):
166
+ for j in range(hm.size(1)):
167
+ preds_orig[i, j] = transform(
168
+ preds[i, j], center, scale, hm.size(2), True)
169
+
170
+ return preds, preds_orig
171
+
172
+ def get_preds_fromhm_batch(hm, centers=None, scales=None):
173
+ """Obtain (x,y) coordinates given a set of N heatmaps. If the centers
174
+ and the scales is provided the function will return the points also in
175
+ the original coordinate frame.
176
+
177
+ Arguments:
178
+ hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H]
179
+
180
+ Keyword Arguments:
181
+ centers {torch.tensor} -- the centers of the bounding box (default: {None})
182
+ scales {float} -- face scales (default: {None})
183
+ """
184
+ max, idx = torch.max(
185
+ hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2)
186
+ idx += 1
187
+ preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
188
+ preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1)
189
+ preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1)
190
+
191
+ for i in range(preds.size(0)):
192
+ for j in range(preds.size(1)):
193
+ hm_ = hm[i, j, :]
194
+ pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
195
+ if pX > 0 and pX < 63 and pY > 0 and pY < 63:
196
+ diff = torch.FloatTensor(
197
+ [hm_[pY, pX + 1] - hm_[pY, pX - 1],
198
+ hm_[pY + 1, pX] - hm_[pY - 1, pX]])
199
+ preds[i, j].add_(diff.sign_().mul_(.25))
200
+
201
+ preds.add_(-.5)
202
+
203
+ preds_orig = torch.zeros(preds.size())
204
+ if centers is not None and scales is not None:
205
+ for i in range(hm.size(0)):
206
+ for j in range(hm.size(1)):
207
+ preds_orig[i, j] = transform(
208
+ preds[i, j], centers[i], scales[i], hm.size(2), True)
209
+
210
+ return preds, preds_orig
211
+
212
+ def shuffle_lr(parts, pairs=None):
213
+ """Shuffle the points left-right according to the axis of symmetry
214
+ of the object.
215
+
216
+ Arguments:
217
+ parts {torch.tensor} -- a 3D or 4D object containing the
218
+ heatmaps.
219
+
220
+ Keyword Arguments:
221
+ pairs {list of integers} -- [order of the flipped points] (default: {None})
222
+ """
223
+ if pairs is None:
224
+ pairs = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
225
+ 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35,
226
+ 34, 33, 32, 31, 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41,
227
+ 40, 54, 53, 52, 51, 50, 49, 48, 59, 58, 57, 56, 55, 64, 63,
228
+ 62, 61, 60, 67, 66, 65]
229
+ if parts.ndimension() == 3:
230
+ parts = parts[pairs, ...]
231
+ else:
232
+ parts = parts[:, pairs, ...]
233
+
234
+ return parts
235
+
236
+
237
+ def flip(tensor, is_label=False):
238
+ """Flip an image or a set of heatmaps left-right
239
+
240
+ Arguments:
241
+ tensor {numpy.array or torch.tensor} -- [the input image or heatmaps]
242
+
243
+ Keyword Arguments:
244
+ is_label {bool} -- [denote whether the input is an image or a set of heatmaps] (default: {False})
245
+ """
246
+ if not torch.is_tensor(tensor):
247
+ tensor = torch.from_numpy(tensor)
248
+
249
+ if is_label:
250
+ tensor = shuffle_lr(tensor).flip(tensor.ndimension() - 1)
251
+ else:
252
+ tensor = tensor.flip(tensor.ndimension() - 1)
253
+
254
+ return tensor
255
+
256
+ # From pyzolib/paths.py (https://bitbucket.org/pyzo/pyzolib/src/tip/paths.py)
257
+
258
+
259
+ def appdata_dir(appname=None, roaming=False):
260
+ """ appdata_dir(appname=None, roaming=False)
261
+
262
+ Get the path to the application directory, where applications are allowed
263
+ to write user specific files (e.g. configurations). For non-user specific
264
+ data, consider using common_appdata_dir().
265
+ If appname is given, a subdir is appended (and created if necessary).
266
+ If roaming is True, will prefer a roaming directory (Windows Vista/7).
267
+ """
268
+
269
+ # Define default user directory
270
+ userDir = os.getenv('FACEALIGNMENT_USERDIR', None)
271
+ if userDir is None:
272
+ userDir = os.path.expanduser('~')
273
+ if not os.path.isdir(userDir): # pragma: no cover
274
+ userDir = '/var/tmp' # issue #54
275
+
276
+ # Get system app data dir
277
+ path = None
278
+ if sys.platform.startswith('win'):
279
+ path1, path2 = os.getenv('LOCALAPPDATA'), os.getenv('APPDATA')
280
+ path = (path2 or path1) if roaming else (path1 or path2)
281
+ elif sys.platform.startswith('darwin'):
282
+ path = os.path.join(userDir, 'Library', 'Application Support')
283
+ # On Linux and as fallback
284
+ if not (path and os.path.isdir(path)):
285
+ path = userDir
286
+
287
+ # Maybe we should store things local to the executable (in case of a
288
+ # portable distro or a frozen application that wants to be portable)
289
+ prefix = sys.prefix
290
+ if getattr(sys, 'frozen', None):
291
+ prefix = os.path.abspath(os.path.dirname(sys.executable))
292
+ for reldir in ('settings', '../settings'):
293
+ localpath = os.path.abspath(os.path.join(prefix, reldir))
294
+ if os.path.isdir(localpath): # pragma: no cover
295
+ try:
296
+ open(os.path.join(localpath, 'test.write'), 'wb').close()
297
+ os.remove(os.path.join(localpath, 'test.write'))
298
+ except IOError:
299
+ pass # We cannot write in this directory
300
+ else:
301
+ path = localpath
302
+ break
303
+
304
+ # Get path specific for this app
305
+ if appname:
306
+ if path == userDir:
307
+ appname = '.' + appname.lstrip('.') # Make it a hidden directory
308
+ path = os.path.join(path, appname)
309
+ if not os.path.isdir(path): # pragma: no cover
310
+ os.mkdir(path)
311
+
312
+ # Done
313
+ return path
filelists/README.md ADDED
@@ -0,0 +1 @@
1
+ Place LRS2 (and any other) filelists here for training.
inference.py ADDED
@@ -0,0 +1,294 @@
1
+ from os import listdir, path
2
+ import numpy as np
3
+ import scipy, cv2, os, sys, argparse, audio
4
+ import json, subprocess, random, string
5
+ from tqdm import tqdm
6
+ from glob import glob
7
+ import torch, face_detection
8
+ from models import Wav2Lip
9
+ import platform
10
+
11
+ parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')
12
+
13
+ parser.add_argument('--checkpoint_path', type=str,
14
+ help='Name of saved checkpoint to load weights from', required=True)
15
+
16
+ parser.add_argument('--face', type=str,
17
+ help='Filepath of video/image that contains faces to use', required=True)
18
+ parser.add_argument('--audio', type=str,
19
+ help='Filepath of video/audio file to use as raw audio source', required=True)
20
+ parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.',
21
+ default='results/result_voice.mp4')
22
+
23
+ parser.add_argument('--static', type=bool,
24
+ help='If True, then use only first video frame for inference', default=False)
25
+ parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)',
26
+ default=25., required=False)
27
+
28
+ parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
29
+ help='Padding (top, bottom, left, right). Please adjust to include chin at least')
30
+
31
+ parser.add_argument('--face_det_batch_size', type=int,
32
+ help='Batch size for face detection', default=16)
33
+ parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128)
34
+
35
+ parser.add_argument('--resize_factor', default=1, type=int,
36
+ help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')
37
+
38
+ parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
39
+ help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
40
+ 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width')
41
+
42
+ parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1],
43
+ help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.'
44
+ 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')
45
+
46
+ parser.add_argument('--rotate', default=False, action='store_true',
47
+ help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.'
48
+ 'Use if you get a flipped result, despite feeding a normal looking video')
49
+
50
+ parser.add_argument('--nosmooth', default=False, action='store_true',
51
+ help='Prevent smoothing face detections over a short temporal window')
52
+
53
+ args = parser.parse_args()
54
+ args.img_size = 96
55
+
56
+ if os.path.isfile(args.face) and args.face.split('.')[-1] in ['jpg', 'png', 'jpeg']:
57
+ args.static = True
58
+
59
+ def get_smoothened_boxes(boxes, T):
60
+ for i in range(len(boxes)):
61
+ if i + T > len(boxes):
62
+ window = boxes[len(boxes) - T:]
63
+ else:
64
+ window = boxes[i : i + T]
65
+ boxes[i] = np.mean(window, axis=0)
66
+ return boxes
67
+
68
+ def face_detect(images):
69
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
70
+ flip_input=False, device=device)
71
+
72
+ batch_size = args.face_det_batch_size
73
+
74
+ while 1:
75
+ predictions = []
76
+ try:
77
+ for i in tqdm(range(0, len(images), batch_size)):
78
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
79
+ except RuntimeError:
80
+ if batch_size == 1:
81
+ raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
82
+ batch_size //= 2
83
+ print('Recovering from OOM error; New batch size: {}'.format(batch_size))
84
+ continue
85
+ break
86
+
87
+ results = []
88
+ pady1, pady2, padx1, padx2 = args.pads
89
+ for rect, image in zip(predictions, images):
90
+ if rect is None:
91
+ cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected.
92
+ raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')
93
+
94
+ y1 = max(0, rect[1] - pady1)
95
+ y2 = min(image.shape[0], rect[3] + pady2)
96
+ x1 = max(0, rect[0] - padx1)
97
+ x2 = min(image.shape[1], rect[2] + padx2)
98
+
99
+ results.append([x1, y1, x2, y2])
100
+
101
+ boxes = np.array(results)
102
+ if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5)
103
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
104
+
105
+ del detector
106
+ return results
107
+
108
+ def datagen(frames, mels):
109
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
110
+
111
+ if args.box[0] == -1:
112
+ if not args.static:
113
+ face_det_results = face_detect(frames) # BGR2RGB for CNN face detection
114
+ else:
115
+ face_det_results = face_detect([frames[0]])
116
+ else:
117
+ print('Using the specified bounding box instead of face detection...')
118
+ y1, y2, x1, x2 = args.box
119
+ face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
120
+
121
+ for i, m in enumerate(mels):
122
+ idx = 0 if args.static else i%len(frames)
123
+ frame_to_save = frames[idx].copy()
124
+ face, coords = face_det_results[idx].copy()
125
+
126
+ face = cv2.resize(face, (args.img_size, args.img_size))
127
+
128
+ img_batch.append(face)
129
+ mel_batch.append(m)
130
+ frame_batch.append(frame_to_save)
131
+ coords_batch.append(coords)
132
+
133
+ if len(img_batch) >= args.wav2lip_batch_size:
134
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
135
+
136
+ img_masked = img_batch.copy()
137
+ img_masked[:, args.img_size//2:] = 0
138
+
139
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
140
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
141
+
142
+ yield img_batch, mel_batch, frame_batch, coords_batch
143
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
144
+
145
+ if len(img_batch) > 0:
146
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
147
+
148
+ img_masked = img_batch.copy()
149
+ img_masked[:, args.img_size//2:] = 0
150
+
151
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
152
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
153
+
154
+ yield img_batch, mel_batch, frame_batch, coords_batch
155
+
156
+ mel_step_size = 16
157
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
158
+ print('Using {} for inference.'.format(device))
159
+
160
+ # def _load(checkpoint_path):
161
+ # if device == 'cuda':
162
+ # checkpoint = torch.load(checkpoint_path)
163
+ # else:
164
+ # checkpoint = torch.load(checkpoint_path,
165
+ # map_location=lambda storage, loc: storage)
166
+ # return checkpoint
167
+
168
+ def _load(checkpoint_path):
169
+ # Use torch.jit.load for TorchScript archives
170
+ if device == 'cuda':
171
+ model = torch.jit.load(checkpoint_path)
172
+ else:
173
+ # Accepts string or torch.device, not a lambda
174
+ model = torch.jit.load(checkpoint_path, map_location='cpu')
175
+ return model
176
+
177
+ # def load_model(path):
178
+ # model = Wav2Lip()
179
+ # print("Load checkpoint from: {}".format(path))
180
+ # checkpoint = _load(path)
181
+ # s = checkpoint["state_dict"]
182
+ # new_s = {}
183
+ # for k, v in s.items():
184
+ # new_s[k.replace('module.', '')] = v
185
+ # model.load_state_dict(new_s)
186
+
187
+ # model = model.to(device)
188
+ # return model.eval()
189
+ def load_model(path):
190
+ print("Loading scripted model from:", path)
191
+ model = _load(path) # returns the TorchScript Module
192
+ model = model.to(device) # move to CPU or GPU
193
+ return model.eval() # set to eval() mode
194
+
195
+ def main():
196
+ if not os.path.isfile(args.face):
197
+ raise ValueError('--face argument must be a valid path to video/image file')
198
+
199
+ elif args.face.split('.')[-1] in ['jpg', 'png', 'jpeg']:
200
+ full_frames = [cv2.imread(args.face)]
201
+ fps = args.fps
202
+
203
+ else:
204
+ video_stream = cv2.VideoCapture(args.face)
205
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
206
+
207
+ print('Reading video frames...')
208
+
209
+ full_frames = []
210
+ while 1:
211
+ still_reading, frame = video_stream.read()
212
+ if not still_reading:
213
+ video_stream.release()
214
+ break
215
+ if args.resize_factor > 1:
216
+ frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))
217
+
218
+ if args.rotate:
219
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
220
+
221
+ y1, y2, x1, x2 = args.crop
222
+ if x2 == -1: x2 = frame.shape[1]
223
+ if y2 == -1: y2 = frame.shape[0]
224
+
225
+ frame = frame[y1:y2, x1:x2]
226
+
227
+ full_frames.append(frame)
228
+
229
+ print ("Number of frames available for inference: "+str(len(full_frames)))
230
+
231
+ if not args.audio.endswith('.wav'):
232
+ print('Extracting raw audio...')
233
+ command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav')
234
+
235
+ subprocess.call(command, shell=True)
236
+ args.audio = 'temp/temp.wav'
237
+
238
+ wav = audio.load_wav(args.audio, 16000)
239
+ mel = audio.melspectrogram(wav)
240
+ print(mel.shape)
241
+
242
+ if np.isnan(mel.reshape(-1)).sum() > 0:
243
+ raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
244
+
245
+ mel_chunks = []
246
+ mel_idx_multiplier = 80./fps
247
+ i = 0
248
+ while 1:
249
+ start_idx = int(i * mel_idx_multiplier)
250
+ if start_idx + mel_step_size > len(mel[0]):
251
+ mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
252
+ break
253
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
254
+ i += 1
255
+
256
+ print("Length of mel chunks: {}".format(len(mel_chunks)))
257
+
258
+ full_frames = full_frames[:len(mel_chunks)]
259
+
260
+ batch_size = args.wav2lip_batch_size
261
+ gen = datagen(full_frames.copy(), mel_chunks)
262
+
263
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen,
264
+ total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
265
+ if i == 0:
266
+ model = load_model(args.checkpoint_path)
267
+ print ("Model loaded")
268
+
269
+ frame_h, frame_w = full_frames[0].shape[:-1]
270
+ out = cv2.VideoWriter('temp/result.avi',
271
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
272
+
273
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
274
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
275
+
276
+ with torch.no_grad():
277
+ pred = model(mel_batch, img_batch)
278
+
279
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
280
+
281
+ for p, f, c in zip(pred, frames, coords):
282
+ y1, y2, x1, x2 = c
283
+ p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
284
+
285
+ f[y1:y2, x1:x2] = p
286
+ out.write(f)
287
+
288
+ out.release()
289
+
290
+ command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(args.audio, 'temp/result.avi', args.outfile)
291
+ subprocess.call(command, shell=platform.system() != 'Windows')
292
+
293
+ if __name__ == '__main__':
294
+ main()
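A minimal command-line sketch for this script (flag names follow the attributes used above — args.checkpoint_path, args.face, args.audio, args.outfile — as wired up by the argparse block earlier in the file; the checkpoint and video paths are placeholders, while input/audio/harvard.wav ships with this repo):

    python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face input/video/sample.mp4 --audio input/audio/harvard.wav --outfile results/result_voice.mp4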
inference2.py ADDED
@@ -0,0 +1,346 @@
1
+ # inference2.py (updated copy of inference.py, refactored so it can be imported)
2
+
3
+ from os import listdir, path
4
+ import numpy as np
5
+ import scipy, cv2, os, sys, argparse, audio
6
+ import json, subprocess, random, string
7
+ from tqdm import tqdm
8
+ from glob import glob
9
+ import torch # Ensure torch is imported
10
+ try:
11
+ import face_detection # Assuming this is installed or in a path accessible by your Flask app
12
+ except ImportError:
13
+ print("face_detection not found. Please ensure it's installed or available in your PYTHONPATH.")
14
+ # You might want to raise an error or handle this gracefully if face_detection is truly optional.
15
+
16
+ # Make sure you have a models/Wav2Lip.py or similar structure
17
+ try:
18
+ from models import Wav2Lip
19
+ except ImportError:
20
+ print("Wav2Lip model not found. Please ensure models/Wav2Lip.py exists and is correctly configured.")
21
+ # You might want to raise an error or handle this gracefully.
22
+
23
+ import platform
24
+ import shutil # For clearing temp directory
25
+
26
+
27
+ # These globals are still useful for shared configuration
28
+ mel_step_size = 16
29
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
30
+ print('Inference script using {} for inference.'.format(device))
31
+
32
+
33
+ def get_smoothened_boxes(boxes, T):
34
+ for i in range(len(boxes)):
35
+ if i + T > len(boxes):
36
+ window = boxes[len(boxes) - T:]
37
+ else:
38
+ window = boxes[i : i + T]
39
+ boxes[i] = np.mean(window, axis=0)
40
+ return boxes
41
+
42
+ def face_detect(images, pads, face_det_batch_size, nosmooth, img_size):
43
+ detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
44
+ flip_input=False, device=device)
45
+
46
+ batch_size = face_det_batch_size
47
+
48
+ while 1:
49
+ predictions = []
50
+ try:
51
+ for i in tqdm(range(0, len(images), batch_size), desc="Face Detection"):
52
+ predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
53
+ except RuntimeError as e:
54
+ if batch_size == 1:
55
+ raise RuntimeError(f'Image too big to run face detection on GPU. Error: {e}')
56
+ batch_size //= 2
57
+ print('Recovering from OOM error; New face detection batch size: {}'.format(batch_size))
58
+ continue
59
+ break
60
+
61
+ results = []
62
+ pady1, pady2, padx1, padx2 = pads
63
+ for rect, image in zip(predictions, images):
64
+ if rect is None:
65
+ # Save the faulty frame for debugging
66
+ output_dir = 'temp' # Ensure this exists or create it
67
+ os.makedirs(output_dir, exist_ok=True)
68
+ cv2.imwrite(os.path.join(output_dir, 'faulty_frame.jpg'), image)
69
+ raise ValueError('Face not detected! Ensure the video/image contains a face in all the frames or try adjusting pads/box.')
70
+
71
+ y1 = max(0, rect[1] - pady1)
72
+ y2 = min(image.shape[0], rect[3] + pady2)
73
+ x1 = max(0, rect[0] - padx1)
74
+ x2 = min(image.shape[1], rect[2] + padx2)
75
+
76
+ results.append([x1, y1, x2, y2])
77
+
78
+ boxes = np.array(results)
79
+ if not nosmooth: boxes = get_smoothened_boxes(boxes, T=5)
80
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
81
+
82
+ del detector # Clean up detector
83
+ return results
84
+
85
+ def datagen(frames, mels, box, static, wav2lip_batch_size, img_size, pads, face_det_batch_size, nosmooth):
86
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
87
+
88
+ if box[0] == -1:
89
+ if not static:
90
+ face_det_results = face_detect(frames, pads, face_det_batch_size, nosmooth, img_size) # BGR2RGB for CNN face detection
91
+ else:
92
+ face_det_results = face_detect([frames[0]], pads, face_det_batch_size, nosmooth, img_size)
93
+ else:
94
+ print('Using the specified bounding box instead of face detection...')
95
+ y1, y2, x1, x2 = box
96
+ face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
97
+
98
+ for i, m in enumerate(mels):
99
+ idx = 0 if static else i % len(frames)
100
+ frame_to_save = frames[idx].copy()
101
+ face, coords = face_det_results[idx].copy()
102
+
103
+ face = cv2.resize(face, (img_size, img_size))
104
+
105
+ img_batch.append(face)
106
+ mel_batch.append(m)
107
+ frame_batch.append(frame_to_save)
108
+ coords_batch.append(coords)
109
+
110
+ if len(img_batch) >= wav2lip_batch_size:
111
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
112
+
113
+ img_masked = img_batch.copy()
114
+ img_masked[:, img_size//2:] = 0
115
+
116
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
117
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
118
+
119
+ yield img_batch, mel_batch, frame_batch, coords_batch
120
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
121
+
122
+ if len(img_batch) > 0:
123
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
124
+
125
+ img_masked = img_batch.copy()
126
+ img_masked[:, img_size//2:] = 0
127
+
128
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
129
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
130
+
131
+ yield img_batch, mel_batch, frame_batch, coords_batch
132
+
133
+ def _load(checkpoint_path):
134
+ # Use torch.jit.load for TorchScript archives
135
+ if device == 'cuda':
136
+ model = torch.jit.load(checkpoint_path)
137
+ else:
138
+ # Accepts string or torch.device, not a lambda
139
+ model = torch.jit.load(checkpoint_path, map_location='cpu')
140
+ return model
141
+
142
+ def load_model(path):
143
+ print("Loading scripted model from:", path)
144
+ model = _load(path) # returns the TorchScript Module
145
+ model = model.to(device) # move to CPU or GPU
146
+ return model.eval() # set to eval() mode
147
+
148
+
149
+ # New function to be called from Flask app
150
+ def run_inference(
151
+ checkpoint_path: str,
152
+ face_path: str,
153
+ audio_path: str,
154
+ output_filename: str,
155
+ static: bool = False,
156
+ fps: float = 25.,
157
+ pads: list = [0, 10, 0, 0],
158
+ face_det_batch_size: int = 16,
159
+ wav2lip_batch_size: int = 128,
160
+ resize_factor: int = 1,
161
+ crop: list = [0, -1, 0, -1],
162
+ box: list = [-1, -1, -1, -1],
163
+ rotate: bool = False,
164
+ nosmooth: bool = False,
165
+ img_size: int = 96 # Fixed for Wav2Lip
166
+ ) -> str:
167
+ """
168
+ Runs the Wav2Lip inference process.
169
+
170
+ Args:
171
+ checkpoint_path (str): Path to the Wav2Lip model checkpoint.
172
+ face_path (str): Path to the input video/image file with a face.
173
+ audio_path (str): Path to the input audio file.
174
+ output_filename (str): Name of the output video file (e.g., 'result.mp4').
175
+ static (bool): If True, use only the first video frame for inference.
176
+ fps (float): Frames per second for static image input.
177
+ pads (list): Padding for face detection (top, bottom, left, right).
178
+ face_det_batch_size (int): Batch size for face detection.
179
+ wav2lip_batch_size (int): Batch size for Wav2Lip model(s).
180
+ resize_factor (int): Reduce the resolution by this factor.
181
+ crop (list): Crop video to a smaller region (top, bottom, left, right).
182
+ box (list): Constant bounding box for the face.
183
+ rotate (bool): Rotate video right by 90deg.
184
+ nosmooth (bool): Prevent smoothing face detections.
185
+ img_size (int): Image size for the model.
186
+
187
+ Returns:
188
+ str: The path to the generated output video file.
189
+ """
190
+ print(f"Starting inference with: face='{face_path}', audio='{audio_path}', checkpoint='{checkpoint_path}', outfile='{output_filename}'")
191
+
192
+ # Create necessary directories
193
+ output_dir = 'results'
194
+ temp_dir = 'temp'
195
+ os.makedirs(output_dir, exist_ok=True)
196
+ os.makedirs(temp_dir, exist_ok=True)
197
+
198
+ # Clear temp directory for fresh run
199
+ for item in os.listdir(temp_dir):
200
+ item_path = os.path.join(temp_dir, item)
201
+ if os.path.isfile(item_path):
202
+ os.remove(item_path)
203
+ elif os.path.isdir(item_path):
204
+ shutil.rmtree(item_path)
205
+
206
+ # Determine if input is static based on file extension
207
+ is_static_input = static or (os.path.isfile(face_path) and face_path.split('.')[-1].lower() in ['jpg', 'png', 'jpeg'])
208
+
209
+ full_frames = []
210
+ if is_static_input:
211
+ full_frames = [cv2.imread(face_path)]
212
+ if full_frames[0] is None:
213
+ raise ValueError(f"Could not read face image at: {face_path}")
214
+ else:
215
+ video_stream = cv2.VideoCapture(face_path)
216
+ if not video_stream.isOpened():
217
+ raise ValueError(f"Could not open video file at: {face_path}")
218
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
219
+
220
+ print('Reading video frames...')
221
+ while 1:
222
+ still_reading, frame = video_stream.read()
223
+ if not still_reading:
224
+ video_stream.release()
225
+ break
226
+ if resize_factor > 1:
227
+ frame = cv2.resize(frame, (frame.shape[1]//resize_factor, frame.shape[0]//resize_factor))
228
+
229
+ if rotate:
230
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
231
+
232
+ y1, y2, x1, x2 = crop
233
+ if x2 == -1: x2 = frame.shape[1]
234
+ if y2 == -1: y2 = frame.shape[0]
235
+
236
+ frame = frame[y1:y2, x1:x2]
237
+ full_frames.append(frame)
238
+
239
+ print ("Number of frames available for inference: "+str(len(full_frames)))
240
+ if not full_frames:
241
+ raise ValueError("No frames could be read from the input face file.")
242
+
243
+ temp_audio_path = os.path.join(temp_dir, 'temp_audio.wav')
244
+ if not audio_path.endswith('.wav'):
245
+ print('Extracting raw audio...')
246
+ command = f'ffmpeg -y -i "{audio_path}" -strict -2 "{temp_audio_path}"'
247
+ try:
248
+ subprocess.run(command, shell=True, check=True, capture_output=True)
249
+ audio_path = temp_audio_path
250
+ except subprocess.CalledProcessError as e:
251
+ print(f"FFmpeg error: {e.stderr.decode()}")
252
+ raise RuntimeError(f"Failed to extract audio from {audio_path}. Error: {e.stderr.decode()}")
253
+ else:
254
+ # Copy the wav file to temp if it's already wav to maintain consistency in naming
255
+ shutil.copy(audio_path, temp_audio_path)
256
+ audio_path = temp_audio_path
257
+
258
+
259
+ wav = audio.load_wav(audio_path, 16000)
260
+ mel = audio.melspectrogram(wav)
261
+ print("Mel spectrogram shape:", mel.shape)
262
+
263
+ if np.isnan(mel.reshape(-1)).sum() > 0:
264
+ raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
265
+
266
+ mel_chunks = []
267
+ mel_idx_multiplier = 80./fps
268
+ i = 0
269
+ while 1:
270
+ start_idx = int(i * mel_idx_multiplier)
271
+ if start_idx + mel_step_size > len(mel[0]):
272
+ mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
273
+ break
274
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
275
+ i += 1
276
+
277
+ print("Length of mel chunks: {}".format(len(mel_chunks)))
278
+
279
+ # Ensure full_frames matches mel_chunks length, or loop if static
280
+ if not is_static_input:
281
+ full_frames = full_frames[:len(mel_chunks)]
282
+ else:
283
+ # If static, replicate the first frame for the duration of the audio
284
+ full_frames = [full_frames[0]] * len(mel_chunks)
285
+
286
+
287
+ gen = datagen(full_frames.copy(), mel_chunks, box, is_static_input, wav2lip_batch_size, img_size, pads, face_det_batch_size, nosmooth)
288
+
289
+ output_avi_path = os.path.join(temp_dir, 'result.avi')
290
+
291
+ model_loaded = False
292
+ model = None
293
+ frame_h, frame_w = 0, 0
294
+ out = None
295
+
296
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, desc="Wav2Lip Inference",
297
+ total=int(np.ceil(float(len(mel_chunks))/wav2lip_batch_size)))):
298
+ if not model_loaded:
299
+ model = load_model(checkpoint_path)
300
+ model_loaded = True
301
+ print ("Model loaded successfully")
302
+
303
+ frame_h, frame_w = full_frames[0].shape[:-1]
304
+ out = cv2.VideoWriter(output_avi_path,
305
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
306
+ if out is None: # In case no frames were generated for some reason
307
+ raise RuntimeError("Video writer could not be initialized.")
308
+
309
+
310
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
311
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
312
+
313
+ with torch.no_grad():
314
+ pred = model(mel_batch, img_batch)
315
+
316
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
317
+
318
+ for p, f, c in zip(pred, frames, coords):
319
+ y1, y2, x1, x2 = c
320
+ p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
321
+
322
+ f[y1:y2, x1:x2] = p
323
+ out.write(f)
324
+
325
+ if out:
326
+ out.release()
327
+ else:
328
+ print("Warning: Video writer was not initialized or no frames were processed.")
329
+
330
+
331
+ final_output_path = os.path.join(output_dir, output_filename)
332
+ command = f'ffmpeg -y -i "{audio_path}" -i "{output_avi_path}" -strict -2 -q:v 1 "{final_output_path}"'
333
+
334
+ try:
335
+ subprocess.run(command, shell=True, check=True, capture_output=True)
336
+ print(f"Output saved to: {final_output_path}")
337
+ except subprocess.CalledProcessError as e:
338
+ print(f"FFmpeg final merge error: {e.stderr.decode()}")
339
+ raise RuntimeError(f"Failed to merge audio and video. Error: {e.stderr.decode()}")
340
+
341
+ # Clean up temporary files (optional, but good practice)
342
+ # shutil.rmtree(temp_dir) # Be careful with this if you want to inspect temp files
343
+
344
+ return final_output_path
345
+
346
+ # No `if __name__ == '__main__':` block here, as it's meant to be imported
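Since run_inference is meant to be imported (for example from the Flask app.py), a minimal calling sketch could look like the following; the checkpoint and video paths are illustrative assumptions, while input/audio/harvard.wav exists in this repo:

    # sketch only: calling run_inference from another module
    from inference2 import run_inference

    out_path = run_inference(
        checkpoint_path='checkpoints/wav2lip_gan.pth',  # assumed location of the TorchScript checkpoint
        face_path='input/video/sample.mp4',             # hypothetical input video containing a face
        audio_path='input/audio/harvard.wav',
        output_filename='result.mp4',
        resize_factor=2,                                # optional: halve the frame resolution to save memory
    )
    print('Wav2Lip output written to:', out_path)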
info_install.txt ADDED
@@ -0,0 +1,54 @@
1
+ install Python 3.7
2
+ create a conda environment (a minimal sketch of this step follows below)
3
+ then install the dependencies from conda-forge:
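Environment-creation sketch for the step above (the name wav2lip_env is taken from the prompt shown in the pip freeze listing further down; any name works):

    conda create -n wav2lip_env python=3.7 -y
    conda activate wav2lip_env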
4
+ conda install -c conda-forge numpy=1.17.1 scipy=1.3.1 numba=0.48 tqdm=4.45.0 -y
5
+ conda install -c conda-forge librosa=0.7.0 -y
6
+ conda install -c conda-forge opencv=4.1.0 -y
7
+ pip install --no-deps opencv-contrib-python==4.1.0.25
8
+ pip install https://mirrors.aliyun.com/pytorch-wheels/cpu/torchvision-0.3.0-cp37-cp37m-win_amd64.whl
9
+ pip install https://mirror.sjtu.edu.cn/pytorch-wheels/cpu/torch-1.1.0-cp37-cp37m-win_amd64.whl
10
+
11
+
12
+ full package list in the working environment (pip freeze):
13
+ (wav2lip_env) D:\DEV PATEL\2025\Wav2Lip-master>pip freeze
14
+ audioread @ file:///D:/bld/audioread_1660497578082/work
15
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1725278078093/work/certifi
16
+ cffi @ file:///D:/bld/cffi_1666183927951/work
17
+ cycler @ file:///home/conda/feedstock_root/build_artifacts/cycler_1635519461629/work
18
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
19
+ fonttools @ file:///D:/bld/fonttools_1666390069478/work
20
+ joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1691577114857/work
21
+ kiwisolver @ file:///D:/bld/kiwisolver_1657953189205/work
22
+ librosa==0.7.0
23
+ llvmlite==0.31.0
24
+ matplotlib @ file:///C:/ci/matplotlib-suite_1634667159685/work
25
+ mkl-service==2.3.0
26
+ munkres==1.1.4
27
+ numba==0.48.0
28
+ numpy==1.17.1
29
+ opencv-contrib-python==4.1.0.25
30
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1696202382185/work
31
+ Pillow==9.3.0
32
+ ply @ file:///home/conda/feedstock_root/build_artifacts/ply_1712242996588/work
33
+ pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1636257122734/work
34
+ pyparsing @ file:///home/conda/feedstock_root/build_artifacts/pyparsing_1724616129934/work
35
+ PyQt5-sip @ file:///D:/bld/pyqt-split_1665676787902/work/pyqt_sip
36
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1709299778482/work
37
+ resampy @ file:///home/conda/feedstock_root/build_artifacts/resampy_1657206395424/work
38
+ scikit-learn @ file:///D:/bld/scikit-learn_1611079929791/work
39
+ scipy==1.3.1
40
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
41
+ soundfile @ file:///home/conda/feedstock_root/build_artifacts/pysoundfile_1676571469739/work
42
+ threadpoolctl @ file:///home/conda/feedstock_root/build_artifacts/threadpoolctl_1643647933166/work
43
+ toml @ file:///home/conda/feedstock_root/build_artifacts/toml_1604308577558/work
44
+ torch @ https://mirror.sjtu.edu.cn/pytorch-wheels/cpu/torch-1.1.0-cp37-cp37m-win_amd64.whl
45
+ torchvision @ https://mirrors.aliyun.com/pytorch-wheels/cpu/torchvision-0.3.0-cp37-cp37m-win_amd64.whl
46
+ tornado @ file:///D:/bld/tornado_1656937938087/work
47
+ tqdm==4.45.0
48
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1688315532570/work
49
+ unicodedata2 @ file:///D:/bld/unicodedata2_1649112131705/work
50
+ wincertstore==0.2
51
+
52
+ download links (Google Drive / direct)
53
+ model download for checkpoint - https://drive.google.com/drive/folders/153HLrqlBNxzZcHi17PEvP09kkAfzRshM
54
+ face recognition model - https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth (alternate link - https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8)
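Rough placement sketch after downloading (directories assumed from the upstream Wav2Lip layout; the checkpoint can live anywhere as long as --checkpoint_path points to it, and the s3fd weights go where the bundled face_detection code loads them from):

    mkdir -p checkpoints
    mv wav2lip_gan.pth checkpoints/                        # or whichever checkpoint file you downloaded
    mv s3fd-619a316812.pth face_detection/detection/sfd/   # check sfd_detector.py for the exact filename it expects (upstream uses s3fd.pth)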
input/audio/audio_hindi_tony_stark.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be12df3fb3d817bae39f6b02e532e67f33af7d04cafdf4ac59d971d9aa239b21
3
+ size 1260284
input/audio/harvard.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:971b4163670445c415c6b0fb6813c38093409ecac2f6b4d429ae3574d24ad470
3
+ size 3249924
input/audio/processed_tony_stark.mp3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b1d2a79ff219503cb8f8df1e4d0bc6ea4aac791528b112372d71a1a2766ca9
3
+ size 580318