# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any, List, Optional, Union

from PIL import Image

from camel.embeddings import BaseEmbedding
from camel.logger import get_logger

logger = get_logger(__name__)


class VisionLanguageEmbedding(BaseEmbedding[Union[str, Image.Image]]):
r"""Provides image embedding functionalities using multimodal model.
Args:
model_name : The model type to be used for generating embeddings.
And the default value is: obj:`openai/clip-vit-base-patch32`.
Raises:
RuntimeError: If an unsupported model type is specified.
"""
def __init__(
self, model_name: str = "openai/clip-vit-base-patch32"
) -> None:
r"""Initializes the: obj: `VisionLanguageEmbedding` class with a
specified model and return the dimension of embeddings.
Args:
model_name (str, optional): The version name of the model to use.
(default: :obj:`openai/clip-vit-base-patch32`)
"""
from transformers import AutoModel, AutoProcessor
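
        # Load the checkpoint and its paired processor from the Hugging Face
        # Hub (or a local cache); any failure is surfaced as a RuntimeError.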
try:
self.model = AutoModel.from_pretrained(model_name)
self.processor = AutoProcessor.from_pretrained(model_name)
except Exception as e:
raise RuntimeError(f"Failed to load model '{model_name}': {e}")
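
        # Record which extra keyword argument names the processor and model
        # accept; non-CLIP-style processors fall back to empty lists.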
self.valid_processor_kwargs = []
self.valid_model_kwargs = []
try:
self.valid_processor_kwargs = (
self.processor.image_processor._valid_processor_keys
)
self.valid_model_kwargs = [
"pixel_values",
"return_dict",
"interpolate_pos_encoding",
]
        except Exception:
            logger.warning(
                "Processor or model does not follow the expected CLIP-style "
                "structure; falling back to empty kwarg lists."
            )

        self.dim: Optional[int] = None

def embed_list(
self, objs: List[Union[Image.Image, str]], **kwargs: Any
) -> List[List[float]]:
"""Generates embeddings for the given images or texts.
Args:
objs (List[Image.Image|str]): The list of images or texts for
which to generate the embeddings.
image_processor_kwargs: Extra kwargs passed to the image processor.
tokenizer_kwargs: Extra kwargs passed to the text tokenizer
(processor).
model_kwargs: Extra kwargs passed to the main model.
Returns:
List[List[float]]: A list that represents the generated embedding
as a list of floating-point numbers.
Raises:
ValueError: If the input type is not `Image.Image` or `str`.
"""
if not objs:
raise ValueError("Input objs list is empty.")
image_processor_kwargs: Optional[dict] = kwargs.get(
'image_processor_kwargs', {}
)
tokenizer_kwargs: Optional[dict] = kwargs.get('tokenizer_kwargs', {})
model_kwargs: Optional[dict] = kwargs.get('model_kwargs', {})
result_list = []
for obj in objs:
if isinstance(obj, Image.Image):
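                # Preprocess the PIL image into pixel tensors and encode it
                # with the model's vision encoder (``get_image_features``).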
image_input = self.processor(
images=obj,
return_tensors="pt",
padding=True,
**image_processor_kwargs,
)
image_feature = (
self.model.get_image_features(
**image_input, **model_kwargs
)
.squeeze(dim=0)
.tolist()
)
result_list.append(image_feature)
elif isinstance(obj, str):
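                # Tokenize the text and encode it with the model's text
                # encoder (``get_text_features``).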
text_input = self.processor(
text=obj,
return_tensors="pt",
padding=True,
**tokenizer_kwargs,
)
text_feature = (
self.model.get_text_features(**text_input, **model_kwargs)
.squeeze(dim=0)
.tolist()
)
result_list.append(text_feature)
            else:
                raise ValueError("Input type is neither image nor text.")
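        # Cache the embedding dimensionality and verify that every vector in
        # the batch has the same length.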
self.dim = len(result_list[0])
if any(len(result) != self.dim for result in result_list):
raise ValueError("Dimensionality is not consistent.")
        return result_list

def get_output_dim(self) -> int:
r"""Returns the output dimension of the embeddings.
Returns:
int: The dimensionality of the embedding for the current model.
"""
if self.dim is None:
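            # No embedding has been produced yet, so probe the text encoder
            # with a dummy string to discover the output dimensionality.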
text = 'dimension'
inputs = self.processor(text=[text], return_tensors="pt")
self.dim = self.model.get_text_features(**inputs).shape[1]
return self.dim
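

# A minimal usage sketch, assuming the default CLIP checkpoint can be
# downloaded and that ``torch``, ``transformers``, and Pillow are installed.
if __name__ == "__main__":
    embedding = VisionLanguageEmbedding()
    # Text and images can be mixed freely; each entry yields one vector.
    vectors = embedding.embed_list(
        ["a photo of a cat", Image.new("RGB", (224, 224))]
    )
    print(embedding.get_output_dim(), [len(v) for v in vectors])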