from torch import nn
from transformers import Idefics2Model, Idefics2PreTrainedModel


class BiIdefics(Idefics2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config=config)
        self.model: Idefics2Model = Idefics2Model(config)
        # Sequences are pooled by their last token (see forward)
        self.pooling_strategy = "last"
        self.main_input_name = "doc_input_ids"

    def forward(self, *args, **kwargs):
        """
        Forward pass through Idefics2, followed by last-token pooling and L2
        normalization, producing a single embedding per sequence.

        Args:
        - input_ids (torch.LongTensor): The input tokens tensor.
        - attention_mask (torch.LongTensor): The attention mask tensor.

        Returns:
        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
        """
        outputs = self.model(*args, **kwargs)
        last_hidden_states = outputs[0]  # (batch_size, sequence_length, hidden_size)
        # Pooling: use the last token's hidden state as the sequence embedding
        proj = last_hidden_states[:, -1, :]
        # L2-normalize so that similarity reduces to a dot product
        proj = proj / proj.norm(dim=-1, keepdim=True)
        return proj
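
# Usage sketch (illustrative assumption, not part of the original file): with a
# bi-encoder, relevance scoring reduces to a dot product between the single
# L2-normalized vectors returned by BiIdefics.forward. The checkpoint name and
# input tensors below are placeholders.
#
#   model = BiIdefics.from_pretrained("HuggingFaceM4/idefics2-8b")
#   query_embs = model(input_ids=query_ids, attention_mask=query_mask)  # (Bq, hidden)
#   doc_embs = model(input_ids=doc_ids, attention_mask=doc_mask)        # (Bd, hidden)
#   scores = query_embs @ doc_embs.T  # (Bq, Bd) cosine similarities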


class ColIdefics(Idefics2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config=config)
        self.model: Idefics2Model = Idefics2Model(config)
        # Output dimension of the per-token embeddings
        self.dim = 128
        self.linear = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
        self.main_input_name = "doc_input_ids"

    def forward(self, *args, **kwargs):
        """
        Forward pass through Idefics2 and the linear layer for dimensionality
        reduction, producing one low-dimensional embedding per token.

        Args:
        - input_ids (torch.LongTensor): The input tokens tensor.
        - attention_mask (torch.LongTensor): The attention mask tensor.

        Returns:
        - torch.Tensor: Embeddings of shape (batch_size, sequence_length, dim)
        """
        outputs = self.model(*args, **kwargs)
        last_hidden_states = outputs[0]  # (batch_size, sequence_length, hidden_size)
        proj = self.linear(last_hidden_states)  # (batch_size, sequence_length, dim)
        # L2-normalize each token embedding
        proj = proj / proj.norm(dim=-1, keepdim=True)
        # Zero out the embeddings of padding tokens
        proj = proj * kwargs["attention_mask"].unsqueeze(-1)
        return proj
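

import torch  # needed only for the scoring sketch below


# Scoring sketch (illustrative assumption, not part of the original file):
# ColBERT-style late interaction (MaxSim) over the multi-vector embeddings
# returned by ColIdefics.forward. Because padded positions were zeroed above,
# they contribute a similarity of 0 and cannot dominate the max.
def maxsim_score(query_embs: torch.Tensor, doc_embs: torch.Tensor) -> torch.Tensor:
    """
    query_embs: (num_queries, query_len, dim) L2-normalized token embeddings.
    doc_embs:   (num_docs, doc_len, dim) L2-normalized token embeddings.

    Returns:
    - torch.Tensor: Late-interaction scores of shape (num_queries, num_docs).
    """
    # Pairwise token similarities: (num_queries, num_docs, query_len, doc_len)
    sim = torch.einsum("qnd,pmd->qpnm", query_embs, doc_embs)
    # For each query token, keep its best-matching document token, then sum
    # the maxima over query tokens to obtain the final score.
    return sim.amax(dim=3).sum(dim=2)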