"""This script contains the image preprocessing code for Deep3DFaceRecon_pytorch."""

import numpy as np
from scipy.io import loadmat
from PIL import Image
import cv2
import os
from skimage import transform as trans
import torch
import warnings

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


def POS(xp, x):
    """
    Calculate translation and scale using least squares for image alignment.

    Args:
        xp (np.ndarray): Target 2D projection points, shape (2, N).
        x (np.ndarray): Source 3D points, shape (3, N).

    Returns:
        tuple: Translation vector (t) and scale factor (s).
    """
    npts = xp.shape[1]
    A = np.zeros([2 * npts, 8])

    A[0:2 * npts - 1:2, 0:3] = x.T
    A[0:2 * npts - 1:2, 3] = 1
    A[1:2 * npts:2, 4:7] = x.T
    A[1:2 * npts:2, 7] = 1

    b = xp.T.reshape([2 * npts, 1])
    k, _, _, _ = np.linalg.lstsq(A, b, rcond=None)

    R1, R2 = k[:3], k[4:7]
    sTx, sTy = k[3], k[7]
    s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2
    t = np.array([sTx, sTy])

    return t, s
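

# A minimal sanity check for POS (an illustrative sketch, not part of the
# original pipeline): source points that are already a scaled-and-shifted
# copy of the targets should give back that scale and shift.
def _demo_pos():
    x = np.array([[0.0, 1.0, 0.0, 1.0, 0.5],
                  [0.0, 0.0, 1.0, 1.0, 0.5],
                  [0.0, 0.0, 0.0, 0.0, 0.5]])       # source points, shape (3, 5)
    xp = 2.0 * x[:2] + np.array([[10.0], [20.0]])   # target points, shape (2, 5)
    t, s = POS(xp, x)
    print(t.ravel(), s)  # expect t close to (10, 20) and s close to 2.0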


def BBRegression(points, params):
    """
    Perform bounding box regression for 68 landmark detection.

    Args:
        points (np.ndarray): Facial landmarks, shape (5, 2).
        params (dict): Regression parameters.

    Returns:
        np.ndarray: Bounding box [x, y, w, h].
    """
    w1, b1, w2, b2 = params['W1'], params['B1'], params['W2'], params['B2']
    # Work on a float copy so the in-place centring below cannot mutate the caller's array
    data = points.reshape([5, 2]).astype(np.float64)
    data_mean = np.mean(data, axis=0)

    data -= data_mean
    rms = np.sqrt(np.sum(data ** 2) / 5)
    data /= rms
    data = data.reshape([1, 10]).T

    inputs = np.matmul(w1, data) + b1
    inputs = 2 / (1 + np.exp(-2 * inputs)) - 1  # tanh activation, written out explicitly
    inputs = np.matmul(w2, inputs) + b2
    inputs = inputs.T

    x, y = inputs[:, 0] * rms + data_mean[0], inputs[:, 1] * rms + data_mean[1]
    w = (224 / inputs[:, 2]) * rms

    return np.array([x, y, w, w]).reshape([4])
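

# Shape sketch for the regressor parameters, inferred from the matrix algebra
# above (the real values live in 'util/BBRegressorParam_r.mat', which is not
# part of this file): W1 is (n_hidden, 10), B1 is (n_hidden, 1), W2 is
# (3, n_hidden), B2 is (3, 1). Random weights below only exercise the shapes.
def _demo_bb_regression():
    rng = np.random.default_rng(0)
    n_hidden = 20  # arbitrary illustrative size
    params = {'W1': rng.standard_normal((n_hidden, 10)),
              'B1': rng.standard_normal((n_hidden, 1)),
              'W2': rng.standard_normal((3, n_hidden)),
              'B2': rng.standard_normal((3, 1))}
    points = rng.uniform(50.0, 150.0, size=(5, 2))
    print(BBRegression(points, params))  # [x, y, w, w]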


def img_padding(img, box):
    """
    Pad image to avoid cropping issues.

    Args:
        img (np.ndarray): Input image.
        box (np.ndarray): Bounding box [x, y, w, h].

    Returns:
        tuple: Padded image, updated bounding box, success flag.
    """
    success = True
    bbox = np.array(box)  # ndarray copy so the slice-add below is elementwise (a plain list would be concatenated)
    h, w = img.shape[:2]
    padded_img = np.zeros([2 * h, 2 * w, 3])

    padded_img[h // 2: h + h // 2, w // 2: w + w // 2] = img
    bbox[:2] += [w // 2, h // 2]

    if bbox[0] < 0 or bbox[1] < 0:
        success = False

    return padded_img, bbox, success


def crop(img, bbox):
    """
    Crop image based on bounding box.

    Args:
        img (np.ndarray): Input image.
        bbox (np.ndarray): Bounding box [x, y, w, h].

    Returns:
        tuple: Cropped image (224x224), scale factor (0 if padding failed).
    """
    padded_img, padded_bbox, flag = img_padding(img, bbox)
    if not flag:
        return padded_img, 0

    x, y, w, h = padded_bbox
    cropped_img = padded_img[y:y + h, x:x + w]
    cropped_img = cv2.resize(cropped_img.astype(np.uint8), (224, 224), interpolation=cv2.INTER_CUBIC)

    return cropped_img, 224 / w
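

# Usage sketch for `crop` (illustrative values): a 112-pixel box resized to
# the fixed 224x224 output gives a scale factor of exactly 2.
def _demo_crop():
    img = np.zeros([480, 640, 3], dtype=np.uint8)
    bbox = np.array([200, 100, 112, 112])  # [x, y, w, h]
    cropped, scale = crop(img, bbox)
    print(cropped.shape, scale)  # (224, 224, 3) 2.0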


def scale_trans(img, lm, t, s):
    """
    Apply scaling and translation to the image and landmarks.

    Args:
        img (np.ndarray): Input image.
        lm (np.ndarray): Landmarks.
        t (np.ndarray): Translation vector.
        s (float): Scale factor.

    Returns:
        tuple: Transformed image, inverse scale parameters.
    """
    img_h, img_w = img.shape[:2]
    # The y translation uses the opposite sign to x, consistent with the
    # inverted y-axis of the landmark coordinates used in this pipeline
    M_s = np.array([[1, 0, -t[0] + img_w // 2 + 0.5], [0, 1, -img_h // 2 + t[1]]], dtype=np.float32)
    img = cv2.warpAffine(img, M_s, (img_w, img_h))

    w, h = int(img_w / s * 100), int(img_h / s * 100)
    img = cv2.resize(img, (w, h))

    lm = np.stack([lm[:, 0] - t[0] + img_w // 2, lm[:, 1] - t[1] + img_h // 2], axis=1) / s * 100
    bbox = [w // 2 - 112, h // 2 - 112, 224, 224]

    cropped_img, scale2 = crop(img, bbox)
    assert scale2 != 0

    t1 = np.array([bbox[0], bbox[1]])
    scale = s / 100
    t2 = np.array([t[0] - img_w / 2, t[1] - img_h / 2])
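    # The returned tuple packs an inverse crop-to-raw mapping (an
    # interpretation of the algebra above): a point p in the cropped image
    # maps back to the raw image as roughly (scale / scale2) * p + scale * t1 + t2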

    return cropped_img, (scale / scale2, scale * t1 + t2)


def align_for_lm(img, five_points):
    """
    Align facial image using facial landmarks for landmark detection refinement.

    Args:
        img: Input facial image (numpy array)
        five_points: Facial landmark coordinates (5 points, 10 values)

    Returns:
        crop_img: Cropped and aligned facial image
        scale: Scaling factor applied during cropping
        bbox: Bounding box coordinates [x, y, width, height]

    Process:
        1. Predict optimal face bounding box using landmark regression
        2. Crop and align image based on predicted bounding box
    """
    # Reshape landmarks to 1x10 array (5 points x 2 coordinates)
    five_points = np.array(five_points).reshape([1, 10])

    # Load bounding box regressor parameters (MATLAB format)
    params = loadmat('util/BBRegressorParam_r.mat')  # Contains regression weights

    # Predict optimal face bounding box using regression model
    bbox = BBRegression(five_points, params)  # Returns [x, y, width, height]

    # Verify valid bounding box prediction
    assert bbox[2] != 0, "Invalid bounding box width (zero detected)"

    # Convert to integer coordinates for cropping
    bbox = np.round(bbox).astype(np.int32)

    # Crop image and get scaling factor
    crop_img, scale = crop(img, bbox)  # crop() should handle boundary checks

    return crop_img, scale, bbox
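

# Usage sketch (hedged): `five_points` typically comes from an external
# 5-point face detector, and 'util/BBRegressorParam_r.mat' must exist on
# disk; `detect_5p` below is a hypothetical stand-in, not part of this module.
#
#     img = cv2.imread('face.jpg')          # hypothetical input path
#     five_points = detect_5p(img)          # hypothetical detector, shape (5, 2)
#     crop_img, scale, bbox = align_for_lm(img, five_points)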


def resize_n_crop_img(img, lm, ldmk_3d, t, s, s_3d, target_size=224., mask=None):
    """
    Resize and center-crop image with corresponding landmark transformation

    Args:
        img: PIL.Image - Input image
        lm: np.array - Facial landmarks in original image coordinates [N, 2]
        ldmk_3d: np.array - 3D-model landmarks in original image coordinates [M, 2]
        t: np.array - Translation (tx, ty), shape (2, 1) as returned by POS
        s: float - Scaling factor for the target_size crop
        s_3d: float - Scaling factor for the 512-sized 3D-landmark frame
        target_size: float - Output image dimensions (square)
        mask: PIL.Image - Optional mask image

    Returns:
        img: PIL.Image - Processed image
        lm: np.array - Transformed landmarks [N, 2]
        mask: PIL.Image - Processed mask (or None)
        left: int - Left crop coordinate
        up: int - Top crop coordinate
        ldmk_3d: np.array - Transformed 3D-model landmarks [M, 2]
    """
    # Original image dimensions
    w0, h0 = img.size

    # Calculate scaled dimensions (truncated toward zero)
    w = int(w0 * s)
    h = int(h0 * s)

    w_3d = int(w0 * s_3d)
    h_3d = int(h0 * s_3d)

    # Calculate crop coordinates after scaling and translation.
    # Horizontal crop window:
    left = int(w / 2 - target_size / 2 + (t[0] - w0 / 2) * s)
    right = left + int(target_size)

    # Vertical crop window (note the inverted y-axis of the landmark coordinates):
    up = int(h / 2 - target_size / 2 + (h0 / 2 - t[1]) * s)
    below = up + int(target_size)

    # Resize and crop main image
    img = img.resize((w, h), resample=Image.BICUBIC)
    img = img.crop((left, up, right, below))

    # Process mask if provided
    if mask is not None:
        mask = mask.resize((w, h), resample=Image.BICUBIC)
        mask = mask.crop((left, up, right, below))

    # Transform landmarks to cropped coordinates
    # 1. Adjust for translation and original image center
    # 2. Apply scaling
    # 3. Adjust for final crop offset
    lm = np.stack([lm[:, 0] - t[0] + w0 / 2,
                   lm[:, 1] - t[1] + h0 / 2], axis=1) * s
    crop_offset = np.array([(w / 2 - target_size / 2),
                            (h / 2 - target_size / 2)])
    lm = lm - crop_offset.reshape(1, 2)

    ldmk_3d = np.stack([ldmk_3d[:, 0] - t[0] + w0 / 2, ldmk_3d[:, 1] -
                        t[1] + h0 / 2], axis=1) * s_3d
    ldmk_3d = ldmk_3d - np.reshape(
        np.array([(w_3d / 2 - 512 / 2), (h_3d / 2 - 512 / 2)]), [1, 2])

    return img, lm, mask, left, up, ldmk_3d
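

# Usage sketch for `resize_n_crop_img` (illustrative values; `t` mimics the
# (2, 1) translation vector returned by POS):
def _demo_resize_n_crop():
    img = Image.new('RGB', (640, 480))
    lm = np.array([[300.0, 200.0], [340.0, 200.0]])  # two dummy landmarks
    t = np.array([[320.0], [240.0]])                 # image-centre translation
    out, lm_new, _, left, up, lm3d_new = resize_n_crop_img(
        img, lm, lm.copy(), t, s=1.0, s_3d=1.0)
    print(out.size, left, up)  # (224, 224) 208 128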


def extract_5p(lm):
    """
    Extract 5-point facial landmarks from 68 landmarks.

    Args:
        lm (np.ndarray): 68 facial landmarks.

    Returns:
        np.ndarray: 5-point landmarks.
    """
    # 1-based iBUG-68 indices: nose tip (31), eye corners (37, 40, 43, 46),
    # and mouth corners (49, 55)
    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1
    lm5p = np.stack([
        lm[lm_idx[0], :],
        np.mean(lm[lm_idx[[1, 2]], :], axis=0),
        np.mean(lm[lm_idx[[3, 4]], :], axis=0),
        lm[lm_idx[5], :],
        lm[lm_idx[6], :]
    ], axis=0)

    # Reorder to [left eye, right eye, nose, left mouth, right mouth]
    return lm5p[[1, 2, 0, 3, 4], :]
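

# Usage sketch: with a full 68-point array, `extract_5p` returns the canonical
# [left eye, right eye, nose, left mouth, right mouth] order.
def _demo_extract_5p():
    lm68 = np.random.default_rng(0).uniform(0.0, 224.0, size=(68, 2))
    print(extract_5p(lm68).shape)  # (5, 2)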


def align_img(img, lm, lm3D, ldmk_3d, mask=None, target_size=224., rescale_factor=102., rescale_factor_3D=218.):
    """
    Align facial image using 2D-3D landmark correspondence

    Args:
        img: PIL.Image - Input facial image (H, W, 3)
        lm: np.array - Facial landmarks (68, 2) in image coordinates (y-axis inverted)
        lm3D: np.array - 3D reference landmarks (5, 3) for pose estimation
        ldmk_3d: np.array - 3D-model landmarks to transform alongside the image
        mask: PIL.Image - Optional facial mask (H, W, 3)
        target_size: float - Output image dimensions (square)
        rescale_factor: float - Normalization factor for face scale
        rescale_factor_3D: float - Normalization factor for the 512-sized 3D-landmark frame

    Returns:
        trans_params: np.array - [raw_W, raw_H, scale, tx, ty] transformation parameters
        img_new: PIL.Image - Aligned image (target_size, target_size, 3)
        lm_new: np.array - Transformed landmarks (68, 2)
        mask_new: PIL.Image - Aligned mask (target_size, target_size)
        crop_left: int - Left crop coordinate
        crop_up: int - Top crop coordinate
        s: float - Final scaling factor
        ldmk_3d_align: np.array - Transformed 3D-model landmarks

    Process:
        1. Extract 5-point landmarks if needed
        2. Estimate face scale and translation using POS algorithm
        3. Resize and crop image with landmark adjustment
    """
    # Original image dimensions
    w0, h0 = img.size

    # Extract 5 facial landmarks if not provided
    if lm.shape[0] != 5:
        lm5p = extract_5p(lm)  # Convert 68-point to 5-point landmarks
    else:
        lm5p = lm

    # Estimate scale and translation with the POS algorithm
    # (Pose from Orthography and Scaling), solved as linear least squares
    t, s = POS(lm5p.T, lm3D.T)  # Returns translation vector and scale factor
    s_3d = rescale_factor_3D / s
    s = rescale_factor / s  # Normalize scale using reference face size
    # Apply geometric transformation
    img_new, lm_new, mask_new, crop_left, crop_up, ldmk_3d_align = resize_n_crop_img(
        img,
        lm,
        ldmk_3d,
        t,
        s,
        s_3d=s_3d,
        target_size=target_size,
        mask=mask
    )

    # Package transformation parameters [original_w, original_h, scale, tx, ty]
    trans_params = np.array([w0, h0, s, t[0][0], t[1][0]])

    return trans_params, img_new, lm_new, mask_new, crop_left, crop_up, s, ldmk_3d_align
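

# Usage sketch (hedged): `lm3D` is the 5-point 3D reference used for pose
# estimation (in Deep3DFaceRecon_pytorch it is typically loaded from the BFM
# folder's similarity_Lm3D_all.mat); `detect_68p` is a hypothetical detector.
#
#     img = Image.open('face.jpg')          # hypothetical input path
#     lm = detect_68p(img)                  # hypothetical detector, shape (68, 2)
#     trans_params, img_new, lm_new, mask_new, left, up, s, lm3d_new = \
#         align_img(img, lm, lm3D, ldmk_3d)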


def estimate_norm(lm_68p, H):
    """
    Estimate similarity transformation matrix for face alignment.

    Args:
        lm_68p (np.ndarray): 68 facial landmarks.
        H (int): Image height.

    Returns:
        np.ndarray: Transformation matrix (2, 3).
    """
    lm = extract_5p(lm_68p)
    lm[:, -1] = H - 1 - lm[:, -1]  # flip y-axis into standard image coordinates

    tform = trans.SimilarityTransform()
    # Standard ArcFace-style 5-point template (112x112 reference crop)
    src = np.array([
        [38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
        [41.5493, 92.3655], [70.7299, 92.2041]
    ], dtype=np.float32)

    tform.estimate(lm, src)
    M = tform.params

    return M[0:2, :] if np.linalg.det(M) != 0 else np.eye(2, 3)
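

# Usage sketch: the returned 2x3 matrix can be applied with cv2.warpAffine;
# the 112x112 output size is an assumption based on the ArcFace-style
# template coordinates above.
#
#     M = estimate_norm(lm_68p, H=img.shape[0])
#     warped = cv2.warpAffine(img, M, (112, 112))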


def estimate_norm_torch(lm_68p, H):
    """
    Estimate similarity transformation matrix for face alignment using PyTorch.

    Args:
        lm_68p (torch.Tensor): 68 facial landmarks.
        H (int): Image height.

    Returns:
        torch.Tensor: Transformation matrices.
    """
    lm_68p_ = lm_68p.detach().cpu().numpy()
    M = [estimate_norm(lm, H) for lm in lm_68p_]

    return torch.tensor(np.array(M), dtype=torch.float32, device=lm_68p.device)
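

if __name__ == "__main__":
    # Hedged smoke test: random inputs only exercise the code paths; the
    # resulting values are not meaningful face alignments.
    _demo_pos()
    _demo_bb_regression()
    _demo_crop()
    _demo_resize_n_crop()
    _demo_extract_5p()
    fake_lm = torch.rand(2, 68, 2) * 223.0  # batch of 2 in a 224x224 image
    print(estimate_norm_torch(fake_lm, H=224).shape)  # torch.Size([2, 2, 3])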