import torch
import torch.nn as nn
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


class IdeficsV3(nn.Module):
    def __init__(self, model_name="HuggingFaceTB/SmolVLM-Instruct"):
        super().__init__()

        # Load the SmolVLM model from Hugging Face
        self.image_processor = AutoProcessor.from_pretrained(model_name).image_processor
        smolvlm = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float32)

        # Keep only the vision encoder from the full model
        self.vision_model = smolvlm.model.vision_model

    def forward(self, pixel_values):

        #################################################################

        # Passing a 5D pixel_values tensor (e.g. [1, 13, 3, 384, 384]) straight into the
        # vision transformer raises "ValueError: too many values to unpack (expected 4)":
        # the ViT expects a 4D input of shape [batch_size, channels, height, width].
        # The extra dimension holds the image patches produced by the processor, so the
        # patch dimension (here 13) must be folded into the batch dimension (here 1)
        # before the tensor reaches the vision transformer.

        # Flatten the patch dimension into the batch dimension
        batch_size, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_patches, channels, height, width)
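        # e.g. pixel_values goes from [1, 13, 3, 384, 384] to [13, 3, 384, 384]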

        #################################################################

        # Run images through the vision transformer
        vision_outputs = self.vision_model(pixel_values)
        x = vision_outputs.last_hidden_state  # shape: [batch_size * num_patches, 729, 1152]

        return x
    
if __name__ == "__main__":

    # Instantiate the truncated model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    truncated_model = IdeficsV3().to(device).eval()

    image1 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
    image2 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")

    # Preprocess the images; cast to the model's dtype (float32) and move to the device
    inputs = truncated_model.image_processor(images=[image1, image2], return_tensors="pt")
    pixel_values = inputs.pixel_values.to(dtype=torch.float32, device=device)

    # Pass pixel_values through the truncated model
    with torch.no_grad():
        outputs = truncated_model(pixel_values)

    print(outputs.shape)  # [batch_size * num_patches, 729, 1152] from the vision encoder
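
    # Optional sketch (assumes the processor packs images as
    # [batch_size, num_patches, 3, H, W]): regroup the flattened patch features
    # back to one row per input image.
    batch_size, num_patches = pixel_values.shape[:2]
    per_image_features = outputs.reshape(batch_size, num_patches, *outputs.shape[1:])
    print(per_image_features.shape)  # e.g. [2, num_patches, 729, 1152]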