# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any, List, Optional, Union

from PIL import Image

from camel.embeddings import BaseEmbedding
from camel.logger import get_logger

logger = get_logger(__name__)


class VisionLanguageEmbedding(BaseEmbedding[Union[str, Image.Image]]):
    r"""Provides image embedding functionalities using multimodal model.

    Args:
        model_name : The model type to be used for generating embeddings.
            And the default value is: obj:`openai/clip-vit-base-patch32`.

    Raises:
        RuntimeError: If an unsupported model type is specified.
    """

    def __init__(
        self, model_name: str = "openai/clip-vit-base-patch32"
    ) -> None:
        r"""Initializes the: obj: `VisionLanguageEmbedding` class with a
        specified model and return the dimension of embeddings.

        Args:
            model_name (str, optional): The version name of the model to use.
                (default: :obj:`openai/clip-vit-base-patch32`)
        """
        from transformers import AutoModel, AutoProcessor

        try:
            self.model = AutoModel.from_pretrained(model_name)
            self.processor = AutoProcessor.from_pretrained(model_name)
        except Exception as e:
            raise RuntimeError(
                f"Failed to load model '{model_name}': {e}"
            ) from e

        self.valid_processor_kwargs = []
        self.valid_model_kwargs = []

        try:
            # `_valid_processor_keys` is a private attribute of the image
            # processor and may not exist for every processor class.
            self.valid_processor_kwargs = (
                self.processor.image_processor._valid_processor_keys
            )
            self.valid_model_kwargs = [
                "pixel_values",
                "return_dict",
                "interpolate_pos_encoding",
            ]
        except Exception:
            logger.warning(
                "Processor or model does not follow the expected structure; "
                "falling back to empty kwargs lists."
            )
        self.dim: Optional[int] = None

    def embed_list(
        self, objs: List[Union[Image.Image, str]], **kwargs: Any
    ) -> List[List[float]]:
        """Generates embeddings for the given images or texts.

        Args:
            objs (List[Image.Image|str]): The list of images or texts for
                which to generate the embeddings.
            image_processor_kwargs: Extra kwargs passed to the image processor.
            tokenizer_kwargs: Extra kwargs passed to the text tokenizer
                (processor).
            model_kwargs: Extra kwargs passed to the main model.

        Returns:
            List[List[float]]: A list that represents the generated embedding
                as a list of floating-point numbers.

        Raises:
            ValueError: If the input type is not `Image.Image` or `str`.
        """
        if not objs:
            raise ValueError("Input objs list is empty.")

        image_processor_kwargs: dict = kwargs.get(
            'image_processor_kwargs', {}
        )
        tokenizer_kwargs: dict = kwargs.get('tokenizer_kwargs', {})
        model_kwargs: dict = kwargs.get('model_kwargs', {})

        result_list = []
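        # Route each object to the matching encoder: images go through
        # `get_image_features`, strings through `get_text_features`.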
        for obj in objs:
            if isinstance(obj, Image.Image):
                image_input = self.processor(
                    images=obj,
                    return_tensors="pt",
                    padding=True,
                    **image_processor_kwargs,
                )
                image_feature = (
                    self.model.get_image_features(
                        **image_input, **model_kwargs
                    )
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(image_feature)
            elif isinstance(obj, str):
                text_input = self.processor(
                    text=obj,
                    return_tensors="pt",
                    padding=True,
                    **tokenizer_kwargs,
                )
                text_feature = (
                    self.model.get_text_features(**text_input, **model_kwargs)
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(text_feature)
            else:
                raise ValueError("Input type is not image nor text.")

        self.dim = len(result_list[0])

        if any(len(result) != self.dim for result in result_list):
            raise ValueError("Dimensionality is not consistent.")

        return result_list

    def get_output_dim(self) -> int:
        r"""Returns the output dimension of the embeddings.

        Returns:
            int: The dimensionality of the embedding for the current model.
        """
        if self.dim is None:
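            # Probe the text encoder with a dummy input to infer the
            # embedding dimensionality lazily.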
            text = 'dimension'
            inputs = self.processor(text=[text], return_tensors="pt")
            self.dim = self.model.get_text_features(**inputs).shape[1]
        return self.dim
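

# Minimal usage sketch (illustrative only): it assumes the
# `openai/clip-vit-base-patch32` checkpoint is downloadable via
# `transformers` and that a local image named `example.png` exists.
if __name__ == "__main__":
    embedding = VisionLanguageEmbedding()

    # Text and images can be mixed in a single call; each entry is embedded
    # by the matching CLIP tower (text or vision).
    image = Image.open("example.png")
    vectors = embedding.embed_list(["a photo of a cat", image])

    print(len(vectors))  # 2 embeddings
    print(embedding.get_output_dim())  # 512 for the base CLIP model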