import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as transforms
from transformers import CLIPImageProcessor
import librosa


def process_bbox(bbox, expand_radio, height, width):
    """
    Expand a face bounding box and convert it to a square crop region.

    bbox: bounding box in (x1, y1, x2, y2) format
    expand_radio: expansion ratio relative to the bbox size
    height, width: source image height and width
    """

    def expand(bbox, ratio, height, width):
        # Grow the box by `ratio` of its width/height on every side,
        # clipping the result to the image bounds.
        bbox_h = bbox[3] - bbox[1]
        bbox_w = bbox[2] - bbox[0]

        expand_x1 = max(bbox[0] - ratio * bbox_w, 0)
        expand_y1 = max(bbox[1] - ratio * bbox_h, 0)
        expand_x2 = min(bbox[2] + ratio * bbox_w, width)
        expand_y2 = min(bbox[3] + ratio * bbox_h, height)

        return [expand_x1, expand_y1, expand_x2, expand_y2]

    def to_square(bbox_src, bbox_expanded, height, width):
        # Shrink the expanded box to a square with side min(h, w), shifting
        # its center toward the original face center (by at most the slack
        # along the longer axis) so the face stays centered in the crop.
        h = bbox_expanded[3] - bbox_expanded[1]
        w = bbox_expanded[2] - bbox_expanded[0]
        c_h = (bbox_expanded[1] + bbox_expanded[3]) / 2
        c_w = (bbox_expanded[0] + bbox_expanded[2]) / 2

        c = min(h, w) / 2

        c_src_h = (bbox_src[1] + bbox_src[3]) / 2
        c_src_w = (bbox_src[0] + bbox_src[2]) / 2

        s_h, s_w = 0, 0
        if w < h:
            d = abs((h - w) / 2)
            s_h = min(d, abs(c_src_h - c_h))
            s_h = s_h if c_src_h > c_h else s_h * (-1)
        else:
            d = abs((h - w) / 2)
            s_w = min(d, abs(c_src_w - c_w))
            s_w = s_w if c_src_w > c_w else s_w * (-1)

        c_h = c_h + s_h
        c_w = c_w + s_w

        # The box is already square (side 2 * c); round to integer pixels.
        square_x1 = round(c_w - c)
        square_y1 = round(c_h - c)
        square_x2 = round(c_w + c)
        square_y2 = round(c_h + c)

        return [square_x1, square_y1, square_x2, square_y2]


    bbox_expanded = expand(bbox, expand_radio, height=height, width=width)
    processed_bbox = to_square(bbox, bbox_expanded, height=height, width=width)

    return processed_bbox
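
# A minimal usage sketch for process_bbox. The numbers below are purely
# illustrative (a hypothetical face box inside a 1280x720 frame), not taken
# from any dataset or caller in this repository:
#
#     face_bbox = [400.0, 200.0, 500.0, 300.0]   # x1, y1, x2, y2
#     crop_bbox = process_bbox(face_bbox, expand_radio=0.5, height=720, width=1280)
#     # crop_bbox is a square [x1, y1, x2, y2], clipped to the frame bounds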


def get_audio_feature(audio_path, feature_extractor):
    # Load the audio at 16 kHz and extract features in 30-second chunks
    # (750 frames of 640 samples each), concatenating along the time axis.
    audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
    assert sampling_rate == 16000

    audio_features = []
    window = 750 * 640
    for i in range(0, len(audio_input), window):
        audio_feature = feature_extractor(
            audio_input[i:i + window],
            sampling_rate=sampling_rate,
            return_tensors="pt",
        ).input_features
        audio_features.append(audio_feature)
    audio_features = torch.cat(audio_features, dim=-1)
    # The second return value is the audio length in 640-sample (40 ms) frames.
    return audio_features, len(audio_input) // 640
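
# Sketch of a call to get_audio_feature, assuming a Whisper-style feature
# extractor from transformers (the checkpoint name and file path below are
# assumptions for illustration, not prescribed by this module):
#
#     from transformers import WhisperFeatureExtractor
#     feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
#     audio_features, num_frames = get_audio_feature("speech.wav", feature_extractor)
#     # audio_features: chunked features concatenated along the last dimension
#     # num_frames: audio duration in 40 ms frames (25 fps)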

def image_audio_to_tensor(align_instance, feature_extractor, image_path, audio_path, limit=100, image_size=512, area=1.25):

    clip_processor = CLIPImageProcessor()

    # Reference image: normalize to [-1, 1]; face mask: keep values in [0, 1].
    to_tensor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    mask_to_tensor = transforms.Compose([
            transforms.ToTensor(),
        ])

    imSrc_ = Image.open(image_path).convert('RGB')
    w, h = imSrc_.size

    # The detector expects a BGR array, hence the channel flip.
    _, _, bboxes_list = align_instance(np.array(imSrc_)[:, :, [2, 1, 0]], maxface=True)

    if len(bboxes_list) == 0:
        return None
    bboxSrc = bboxes_list[0]

    # Detector bbox is (x, y, w, h); convert to corner coordinates.
    x1, y1, ww, hh = bboxSrc
    x2, y2 = x1 + ww, y1 + hh

    # Build a binary face mask: a box centered on the face, scaled by `area`
    # and clipped to the image bounds.
    mask_img = np.zeros_like(np.array(imSrc_))
    ww, hh = (x2 - x1) * area, (y2 - y1) * area
    center = [(x2 + x1) // 2, (y2 + y1) // 2]
    x1 = max(center[0] - ww // 2, 0)
    y1 = max(center[1] - hh // 2, 0)
    x2 = min(center[0] + ww // 2, w)
    y2 = min(center[1] + hh // 2, h)
    mask_img[int(y1):int(y2), int(x1):int(x2)] = 255
    mask_img = Image.fromarray(mask_img)
    
    # Scale so the shorter side is ~image_size, snapping both dimensions
    # to multiples of 64.
    scale = image_size / min(w, h)
    new_w = round(w * scale / 64) * 64
    new_h = round(h * scale / 64) * 64
    if new_h != h or new_w != w:
        imSrc = imSrc_.resize((new_w, new_h), Image.LANCZOS)
        mask_img = mask_img.resize((new_w, new_h), Image.LANCZOS)
    else:
        imSrc = imSrc_

    # CLIP image encoder input is prepared from a 224x224 resize of the reference.
    clip_image = clip_processor(
            images=imSrc.resize((224, 224), Image.LANCZOS), return_tensors="pt"
        ).pixel_values[0]
    audio_input, audio_len = get_audio_feature(audio_path, feature_extractor)

    # Cap the number of audio frames at `limit`.
    audio_len = min(limit, audio_len)

    sample = dict(
                face_mask=mask_to_tensor(mask_img),
                ref_img=to_tensor(imSrc),
                clip_images=clip_image,
                audio_feature=audio_input[0],
                audio_len=audio_len
            )

    return sample
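
# Sketch of how this module might be used end to end. The detector instance,
# checkpoint name, and file paths below are placeholders (assumptions), not
# defined in this file; `align_instance` is expected to accept a BGR numpy
# image and return (_, _, bboxes_list) with boxes in (x, y, w, h) form:
#
#     from transformers import WhisperFeatureExtractor
#
#     feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
#     align_instance = ...  # face detector/aligner supplied by the caller
#     sample = image_audio_to_tensor(
#         align_instance, feature_extractor,
#         image_path="reference.png", audio_path="speech.wav",
#         limit=100, image_size=512, area=1.25,
#     )
#     if sample is not None:
#         print(sample["ref_img"].shape, sample["audio_len"])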