File size: 2,603 Bytes
332190f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import torch
import transformers
from typing import Union, Optional, Tuple
from transformers import AutoConfig, AutoModel
from transformers.models.clip.modeling_clip import CLIPTextModelOutput
class MCLIPConfig(transformers.PretrainedConfig):
model_type = "M-CLIP"
def __init__(self, modelBase='xlm-roberta-large', transformerDimSize=1024, imageDimSize=768, **kwargs):
self.transformerDimensions = transformerDimSize
self.numDims = imageDimSize
self.modelBase = modelBase
super().__init__(**kwargs)
class MultilingualCLIP(transformers.PreTrainedModel):
config_class = MCLIPConfig
def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.transformer = transformers.AutoModel.from_pretrained(config.modelBase)
self.LinearTransformation = torch.nn.Linear(in_features=config.transformerDimensions,
out_features=config.numDims)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CLIPTextModelOutput]:
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
text_outputs = self.transformer(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
pooled_output = text_outputs[1]
text_embeds = self.LinearTransformation(pooled_output)
if not return_dict:
outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
return tuple(output for output in outputs if output is not None)
return CLIPTextModelOutput(
text_embeds=text_embeds,
last_hidden_state=text_outputs.last_hidden_state,
hidden_states=text_outputs.hidden_states,
attentions=text_outputs.attentions,
)
@classmethod
def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True):
model.load_state_dict(state_dict)
return model, [], [], []
AutoConfig.register("M-CLIP", MCLIPConfig)
AutoModel.register(MCLIPConfig, MultilingualCLIP) |