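"""Peekaboo demo: visualize cross-attention masks from a text-to-video
diffusion model in Gradio.

Pick a generation prompt, then a word from it; the app shows the generated
frame side by side with the same frame overlaid by that word's attention
mask, for a chosen frame, diffusion step, and attention layer.
"""
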
import os
import sys
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
from transformers import CLIPTokenizer

# Make the repository root importable before loading the local helpers.
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from gradio_utils import *  # noqa: E402,F401,F403


def image_mod(image):
    # Unused example handler; expects a PIL image.
    return image.rotate(45)


NUM_POINTS = 3
NUM_FRAMES = 16
LARGE_BOX_SIZE = 176



data = {}
# CLIP tokenizer, used to locate a chosen word's token index within the prompt.
tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
def get_token_number(prompt, word):
    """Return the index of `word`'s first sub-token within the tokenized prompt."""
    all_tokens = tokenizer(prompt).input_ids
    word_tokens = tokenizer(word).input_ids
    print(all_tokens, word_tokens, word)
    # word_tokens[0] is the <|startoftext|> token, so the word itself starts at index 1.
    return all_tokens.index(word_tokens[1])
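
# Illustrative example (assumes 'spider' maps to a single CLIP token):
#   get_token_number('A spider descending from its web', 'spider') -> 2,
#   i.e. the position after <|startoftext|> and the 'a' token.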

def overlay_mask(img, mask, opacity=0.5):
    """Alpha-blend a grayscale attention mask over a BGR frame."""
    # Resize the mask to the frame size; nearest-neighbour avoids smearing mask values.
    mask_resized = cv2.resize(mask, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_NEAREST)

    # Expand the mask to 3 channels so it can be blended with the BGR frame.
    mask_3ch = cv2.cvtColor(mask_resized, cv2.COLOR_GRAY2BGR)

    # Overlay the mask on the image at the given opacity.
    overlay = cv2.addWeighted(mask_3ch, opacity, img, 1 - opacity, 0)
    return overlay

def fetch_proper_img(prompt, word, frame_num, diffusion_step, layer_num=0):
    """Load a video frame and, if a word is selected, overlay its attention mask."""
    # The frame slider is 1-indexed; frame files on disk are 0-indexed.
    frame_num = frame_num - 1
    if layer_num is None:
        layer_num = 0
    else:
        # Masks for slider value 3 are stored under layer id 100 on disk.
        layer_num = 100 if layer_num == 3 else layer_num

    video_file_name = f"./data/videos/{prompt.replace(' ', '_')}/video/frame_{frame_num:04d}.png"
    img = cv2.imread(video_file_name)

    if word is None:
        overlaid_image = img
    else:
        mask_file_name = f'./data/final_masks/attention_probs_{prompt}/frame_{frame_num}_layer_{layer_num}_diffusionstep_{diffusion_step}_token_{get_token_number(prompt, word)}.png'
        mask = cv2.imread(mask_file_name, cv2.IMREAD_GRAYSCALE)
        overlaid_image = overlay_mask(img, mask)
        print(mask_file_name)

    return img, overlaid_image
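
# Expected on-disk layout, inferred from the path templates above:
#   data/videos/<prompt with spaces replaced by underscores>/video/frame_0000.png
#   data/final_masks/attention_probs_<prompt>/frame_<f>_layer_<l>_diffusionstep_<d>_token_<t>.png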


def fetch_proper_img_and_change_prompt(prompt, word, frame_num, diffusion_step, layer_num=0):
    """Rebuild the word radio for a new prompt and refresh both images."""
    radio = change_text_prompt(prompt)
    img, overlaid = fetch_proper_img(prompt, word, frame_num, diffusion_step, layer_num)
    return [img, overlaid, radio]

css = """
.word-btn {
    width: fit-content;
    padding: 3px;
}
.word-btns-container {
    flex-direction: row;
}
"""

registry = {
    'spider': 'mask_1',
    'descending': 'mask_2',
}

data_path = Path('data')


available_prompts = [
    'a dog and a cat sitting',
    'A fish swimming in the water',
    'A spider descending from its web',
    'An astronaut riding a horse',
]

with gr.Blocks(css=css) as demo:
    # gr.Markdown("""
    #             # Peekaboo Demo
    #             """)
    with gr.Row():
        video_1 = gr.Image(label="Image")
        video_2 = gr.Image(label="Image with Attention Mask")
    

    def change_text_prompt(text):
        # Returning a new gr.Radio replaces the choices shown in the existing radio.
        return gr.Radio(text.strip().split(' '), value=None, label='Choose a word to visualize its attention mask.')

    with gr.Group("Video Selection"):
        txt_1 = gr.Dropdown(choices=available_prompts, label="Video Prompt", value=available_prompts[0])
        submit_btn = gr.Button('Get words')
    with gr.Group('Word Selection'):
        # Seed the word choices from the default prompt so the radio matches the dropdown.
        radio = gr.Radio(available_prompts[0].split(' '), value=None, label='Choose a word to visualize its attention mask.')
        range_slider = gr.Slider(1, NUM_FRAMES, 1, step=2, label='Frame of the generated video to visualize the attention mask.')
        diffusion_slider = gr.Slider(0, 35, 0, step=5, label='Index of diffusion steps.')
        layer_num_slider = gr.Slider(0, 6, 0, step=1, label='Layer number for attention mask.')
    # Re-render both images whenever any of the controls changes.
    control_inputs = [txt_1, radio, range_slider, diffusion_slider, layer_num_slider]
    for control in (radio, range_slider, diffusion_slider, layer_num_slider):
        control.change(fetch_proper_img, inputs=control_inputs, outputs=[video_1, video_2])
    submit_btn.click(change_text_prompt, inputs=[txt_1], outputs=[radio])

if __name__ == "__main__":
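    # Listen on all network interfaces; Gradio serves on port 7860 by default.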
    demo.launch(server_name='0.0.0.0')