File size: 4,120 Bytes
9ff79dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from __future__ import annotations

from dataclasses import dataclass
from typing import List, cast

import torch
from PIL import Image
from transformers import LlamaTokenizerFast, PaliGemmaProcessor


@dataclass
class ColPaliTextInput:
    """
    Container for a tokenized text batch.

    Holds the `input_ids` and `attention_mask` tensors taken from the tokenizer output.
    """

    input_ids: torch.Tensor
    attention_mask: torch.Tensor

    def to(self, device: str | torch.device) -> ColPaliTextInput:
        """
        Return a new `ColPaliTextInput` whose tensors were moved with `Tensor.to(device)`.

        Args:
            device: Target device, given either as a device string (e.g. "cpu") or as a
                `torch.device`. Both forms are forwarded unchanged to `Tensor.to`, which is
                the same contract as `ColPaliImageInput.to`.

        Returns:
            A new `ColPaliTextInput`; `self` is not modified.
        """
        return ColPaliTextInput(
            input_ids=self.input_ids.to(device),
            attention_mask=self.attention_mask.to(device),
        )


@dataclass
class ColPaliImageInput:
    """
    Container for a processed image batch.

    Bundles the `input_ids`, `pixel_values` and `attention_mask` tensors taken from the
    processor output.
    """

    input_ids: torch.Tensor
    pixel_values: torch.Tensor
    attention_mask: torch.Tensor

    def to(self, device: str | torch.device) -> ColPaliImageInput:
        """
        Build a new `ColPaliImageInput` with every tensor moved via `Tensor.to(device)`.

        The current instance is left untouched.
        """
        moved_ids = self.input_ids.to(device)
        moved_pixels = self.pixel_values.to(device)
        moved_mask = self.attention_mask.to(device)
        # Positional order follows the field declaration order above.
        return ColPaliImageInput(moved_ids, moved_pixels, moved_mask)


class ColPaliProcessor:
    """
    Wrapper around a `PaliGemmaProcessor` that builds ColPali text and image inputs.

    Attributes set in `__init__`:
        processor: The wrapped `PaliGemmaProcessor`.
        tokenizer: The tokenizer taken from `processor.tokenizer`.
    """

    def __init__(self, processor: PaliGemmaProcessor):
        """Store `processor` and keep a direct reference to its tokenizer."""
        self.processor = processor
        # `cast` only informs the type checker; nothing is converted at runtime.
        self.tokenizer = cast(LlamaTokenizerFast, self.processor.tokenizer)  # type: ignore

    @staticmethod
    def from_pretrained(model_name: str) -> ColPaliProcessor:
        """Create a `ColPaliProcessor` from `PaliGemmaProcessor.from_pretrained(model_name)`."""
        return ColPaliProcessor(processor=cast(PaliGemmaProcessor, PaliGemmaProcessor.from_pretrained(model_name)))

    def process_text(
        self,
        text: str | List[str],
        padding: str = "longest",
        return_tensors: str = "pt",
        add_special_tokens: bool = True,
    ) -> ColPaliTextInput:
        """
        Process text inputs for the model.

        If `add_special_tokens` is True (default), each text is prepended with the tokenizer's
        <bos> token and appended with a newline character before tokenization.

        Args:
            text: A single string or a list of strings.
            padding: Forwarded to the tokenizer as `padding`.
            return_tensors: Forwarded to the tokenizer as `return_tensors`.
            add_special_tokens: Enables the <bos>/newline wrapping described above and is also
                forwarded to the tokenizer as `add_special_tokens`.

        Returns:
            A `ColPaliTextInput` built from the tokenizer's `input_ids` and `attention_mask`.

        Raises:
            ValueError: If `add_special_tokens` is True and `text` is neither a string nor a list.
        """
        if add_special_tokens:
            if isinstance(text, str):
                text = self.tokenizer.bos_token + text + "\n"
            elif isinstance(text, list):
                text = [self.tokenizer.bos_token + t + "\n" for t in text]
            else:
                raise ValueError("text must be a string or a list of strings.")

        batch_output = self.tokenizer(
            text, padding=padding, return_tensors=return_tensors, add_special_tokens=add_special_tokens
        )

        return ColPaliTextInput(
            input_ids=cast(torch.Tensor, batch_output["input_ids"]),
            attention_mask=cast(torch.Tensor, batch_output["attention_mask"]),
        )

    def process_image(
        self,
        image: Image.Image | List[Image.Image],
        padding: str = "longest",
        do_convert_rgb: bool = True,
        return_tensors: str = "pt",
        add_special_prompt: bool = True,
    ) -> ColPaliImageInput:
        """
        Process image inputs for the model.

        Args:
            image: A single PIL image or a list of PIL images.
            padding: Forwarded to the wrapped processor as `padding`.
            do_convert_rgb: Forwarded to the wrapped processor as `do_convert_rgb`.
            return_tensors: Forwarded to the wrapped processor as `return_tensors`.
            add_special_prompt: If True (default), the prompt "Describe the image." is paired with
                every image. If False, `None` is passed as the text for every image and the
                token-level outputs are truncated to `processor.image_seq_length` positions.

        Returns:
            A `ColPaliImageInput` built from the processor's `input_ids`, `pixel_values` and
            `attention_mask`.

        Raises:
            ValueError: If `image` is neither a PIL image nor a list.
        """
        # NOTE: The special prompt was used at training time.
        special_prompt = "Describe the image." if add_special_prompt else None
        if isinstance(image, Image.Image):
            text_input = [special_prompt]
        elif isinstance(image, list):
            # One text entry per image.
            text_input = [special_prompt] * len(image)
        else:
            raise ValueError("image must be a PIL Image or a list of PIL Images.")

        batch_output = self.processor(
            text=text_input,
            images=image,
            padding=padding,
            do_convert_rgb=do_convert_rgb,
            return_tensors=return_tensors,
        )

        input_ids = cast(torch.Tensor, batch_output["input_ids"])
        attention_mask = cast(torch.Tensor, batch_output["attention_mask"])
        if not add_special_prompt:
            # Keep only the first `image_seq_length` positions of the token-level tensors.
            # NOTE(review): presumably these are the image placeholder tokens and the dropped tail
            # is the text part — confirm against the PaliGemmaProcessor version in use.
            seq_len = self.processor.image_seq_length
            input_ids = input_ids[:, :seq_len]
            attention_mask = attention_mask[:, :seq_len]

        # `pixel_values` is passed through unsliced: `image_seq_length` is a token-sequence length,
        # so it must not be used to index the pixel tensor.
        # NOTE(review): assumes axis 1 of `pixel_values` is the channel axis and is never longer
        # than `image_seq_length`, so the previous slice was a no-op — confirm.
        return ColPaliImageInput(
            input_ids=input_ids,
            pixel_values=cast(torch.Tensor, batch_output["pixel_values"]),
            attention_mask=attention_mask,
        )

    def decode(self, *args, **kwargs):
        """Forward all arguments to `self.tokenizer.decode` and return its result."""
        return self.tokenizer.decode(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to `self.tokenizer.batch_decode` and return its result."""
        return self.tokenizer.batch_decode(*args, **kwargs)