import torch
import torch.nn as nn
import transformers
from torchvision import transforms
from transformers import ViTModel

transformers.logging.set_verbosity_error()

class VisionEncoder(nn.Module):
    """Wraps a pretrained ViT backbone that maps PIL images to patch-level feature embeddings."""

    def __init__(self):
        super().__init__()
        # Pretrained ViT-base backbone (16x16 patches, 224x224 input resolution).
        self.vision_model = ViTModel.from_pretrained("google/vit-base-patch16-224")
        # Standard ImageNet preprocessing: resize, convert to tensor, normalize.
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def forward(self, images, device):
        # Accept either a single PIL image or a list of images.
        if not isinstance(images, list):
            images = [images]

        # Preprocess and batch the images, then move the batch to the target device.
        pixel_values = torch.stack([self.image_transform(image) for image in images]).to(device)

        # The encoder is used as a frozen feature extractor; no gradients are needed.
        with torch.no_grad():
            outputs = self.vision_model(pixel_values=pixel_values)
            image_features = outputs.last_hidden_state  # (batch, num_patches + 1, hidden_size)
        return image_features
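

# Minimal usage sketch (not part of the original module): assumes Pillow is
# installed and that "example.jpg" is a placeholder path to an RGB image.
if __name__ == "__main__":
    from PIL import Image

    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = VisionEncoder().to(device)

    image = Image.open("example.jpg").convert("RGB")
    features = encoder(image, device)
    # ViT-base with 224x224 input yields one CLS token plus 196 patch tokens of width 768.
    print(features.shape)  # expected: torch.Size([1, 197, 768])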