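# ViT-Dream: DeepDream-style feature visualisation for a Vision Transformer.
# Gradient ascent on the input pixels maximises a random subset of ImageNet
# class logits, with a total-variation penalty to keep the image smooth.
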
import gradio as gr
import torch
import numpy as np
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image

# Load the model and image processor once at startup so every request reuses them
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-large-patch32-384')
model = ViTForImageClassification.from_pretrained('google/vit-large-patch32-384')
model.to(device)
model.eval()
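
# The checkpoint is fine-tuned on ImageNet-1k, so the classifier head emits
# 1,000 logits; torch.randperm(1000) below samples target classes from these.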

# Return the final-layer [CLS] token embedding, the input to the classifier head.
def get_encoder_activations(x):
    encoder_output = model.vit(x)
    final_activations = encoder_output.last_hidden_state[:,0,:]
    return final_activations

# Anisotropic total variation: summed absolute differences between neighbouring
# pixels, used as a smoothness regulariser on the optimised image.
def total_variation_loss(img):
    pixel_dif1 = img[:, :, 1:, :] - img[:, :, :-1, :]
    pixel_dif2 = img[:, :, :, 1:] - img[:, :, :, :-1]
    return (torch.sum(torch.abs(pixel_dif1)) + torch.sum(torch.abs(pixel_dif2)))

# Parameters arrive in the same order as the Gradio inputs defined below.
def process_image(input_image, learning_rate, tv_weight, iterations, seed, n_targets):
    if input_image is None:
        return None

    image = input_image.convert('RGB')
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    pixel_values.requires_grad_(True)

    # Seed the RNG, then choose which of the 1,000 class logits to maximise.
    torch.manual_seed(int(seed))
    random_indices = torch.randperm(1000)[:int(n_targets)].to(pixel_values.device)

    for iteration in range(int(iterations)):
        # Clear gradients accumulated in the previous iteration.
        model.zero_grad()
        if pixel_values.grad is not None:
            pixel_values.grad.data.zero_()

        # Forward pass: [CLS] embedding -> classification logits.
        final_activations = get_encoder_activations(pixel_values)
        logits = model.classifier(final_activations[0])

        # Maximise the summed target logits; subtracting the weighted TV term
        # steers the ascent toward smoother images.
        original_loss = logits[random_indices].sum()
        tv_loss = total_variation_loss(pixel_values)
        total_loss = original_loss - tv_weight * tv_loss
        total_loss.backward()

        # Gradient-ascent step, then clamp back to the processor's
        # normalised pixel range of [-1, 1].
        with torch.no_grad():
            pixel_values.data += learning_rate * pixel_values.grad.data
            pixel_values.data = torch.clamp(pixel_values.data, -1, 1)

    # Undo the processor's (x - 0.5) / 0.5 normalisation: map [-1, 1] to [0, 255].
    updated_pixel_values_np = 127.5 + pixel_values.squeeze().permute(1, 2, 0).detach().cpu() * 127.5
    updated_pixel_values_np = updated_pixel_values_np.numpy().astype(np.uint8)

    return updated_pixel_values_np

iface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Number(value=16.0, minimum=0, label="Learning Rate"),
        gr.Number(value=0.0001, minimum=0, label="Total Variation Loss Weight"),
        gr.Number(value=4, minimum=1, label="Iterations"),
        gr.Number(value=420, minimum=0, label="Seed"),
        gr.Number(value=500, minimum=1, maximum=1000, label="Number of Random Target Class Activations to Maximise"),
    ],
    outputs=[gr.Image(type="numpy", label="ViT-Dreamed Image")]
)

iface.launch()
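
# Headless usage sketch (hypothetical file paths; bypasses the web UI):
#   img = Image.open("input.jpg")
#   out = process_image(img, learning_rate=16.0, tv_weight=0.0001,
#                       iterations=4, seed=420, n_targets=500)
#   Image.fromarray(out).save("vit_dreamed.jpg")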