from typing import Optional

import torch
from transformers import SiglipModel


class SigLIP(SiglipModel):
    def forward(self, *args, **kwargs):
        """
        Forward pass through the SigLIP vision or text tower, selected by the inputs provided.

        Args:
        - input_ids (torch.LongTensor): The input tokens tensor (text branch).
        - pixel_values (torch.FloatTensor): The input images tensor (vision branch).
        - attention_mask (torch.LongTensor): The attention mask tensor.

        Returns:
        - torch.Tensor: L2-normalized pooled embeddings of shape (batch_size, hidden_size)
        """
        return self.forward_branch(*args, **kwargs)

    def forward_branch(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ):
        # Use the top-level SigLIP config for these fields (if specified) instead of
        # those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None:
            outputs = self.vision_model(
                pixel_values=pixel_values.to(dtype=self.dtype),
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )
        else:
            outputs = self.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        # outputs[1] is the pooled output of whichever tower ran
        embeds = outputs[1]
        # L2-normalize the pooled features
        embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True)
        return embeds
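
# Usage sketch for SigLIP (illustrative only; the checkpoint name below is an
# assumption, not necessarily the one this Space uses). Because the pooled
# embeddings are L2-normalized, a plain dot product gives cosine similarity:
#   model = SigLIP.from_pretrained("google/siglip-base-patch16-224")
#   img_embeds = model(pixel_values=pixel_values)                            # (batch, hidden_size)
#   txt_embeds = model(input_ids=input_ids, attention_mask=attention_mask)   # (batch, hidden_size)
#   scores = img_embeds @ txt_embeds.T                                       # cosine similarities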


class ColSigLIP(SiglipModel):
    def __init__(self, config):
        super().__init__(config=config)
        # ColBERT-style late-interaction head: project every token/patch embedding
        # down to a small shared dimension instead of pooling to a single vector.
        self.dim = 128
        self.custom_vision_proj = torch.nn.Linear(self.config.vision_config.hidden_size, self.dim)
        self.custom_text_proj = torch.nn.Linear(self.config.text_config.hidden_size, self.dim)
        self.main_input_name = "doc_input_ids"

    def forward(self, *args, **kwargs):
        """
        Forward pass through SigLIP and a linear projection layer for dimensionality reduction.

        Args:
        - input_ids (torch.LongTensor): The input tokens tensor (text branch).
        - pixel_values (torch.FloatTensor): The input images tensor (vision branch).
        - attention_mask (torch.LongTensor): The attention mask tensor.

        Returns:
        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
        """
        return self.forward_branch(*args, **kwargs)

    def forward_branch(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ):
        # Use the top-level SigLIP config for these fields (if specified) instead of
        # those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None:
            outputs = self.vision_model(
                pixel_values=pixel_values.to(dtype=self.dtype),
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )
            # Project every patch embedding, then L2-normalize along the feature dim
            last_hidden_states = outputs.last_hidden_state
            proj = self.custom_vision_proj(last_hidden_states)
            proj = proj / proj.norm(dim=-1, keepdim=True)
        else:
            outputs = self.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            # Project every token embedding, then L2-normalize along the feature dim
            last_hidden_states = outputs.last_hidden_state
            proj = self.custom_text_proj(last_hidden_states)
            proj = proj / proj.norm(dim=-1, keepdim=True)
            # Zero out the embeddings of padding tokens
            if attention_mask is not None:
                proj = proj * attention_mask.unsqueeze(-1)

        # Multi-vector embeddings of shape (batch_size, num_tokens, dim)
        return proj
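

if __name__ == "__main__":
    # Minimal smoke-test sketch. Assumptions: the checkpoint name and the random
    # dummy inputs below are illustrative only, and the custom projection heads
    # are randomly initialized unless trained weights are loaded on top.
    checkpoint = "google/siglip-base-patch16-224"
    model = ColSigLIP.from_pretrained(checkpoint)
    model.eval()

    image_size = model.config.vision_config.image_size
    pixel_values = torch.randn(1, 3, image_size, image_size)
    input_ids = torch.randint(0, model.config.text_config.vocab_size, (1, 16))
    attention_mask = torch.ones(1, 16, dtype=torch.long)

    with torch.no_grad():
        doc_embeds = model(pixel_values=pixel_values)  # (1, num_patches, 128)
        query_embeds = model(input_ids=input_ids, attention_mask=attention_mask)  # (1, 16, 128)

    # ColBERT-style MaxSim scoring: for each query token, take the similarity of
    # its best-matching document patch, then sum over query tokens.
    token_sims = torch.einsum("bnd,bmd->bnm", query_embeds, doc_embeds)  # (1, 16, num_patches)
    score = token_sims.max(dim=-1).values.sum(dim=-1)  # (1,)
    print(score)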