import logging
import os
from io import BytesIO
from typing import List, Optional, Union

import torch
from PIL import Image
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
from huggingface_hub import hf_hub_download
from indexify_extractor_sdk import Content, Extractor, Feature
from pydantic_settings import BaseSettings

from .parse_utils import token2json

logger = logging.getLogger(__name__)

token = os.getenv("HF_TOKEN")


class ModelSettings(BaseSettings):
    peft_model_id: str = "nielsr/idefics2-cord-demo"
    hf_token: Optional[str] = token


model_settings = ModelSettings()


class ReceiptExtractor(Extractor):
    name = "tensorlake/idefics2json"
    description = "Finetuned Idefics2 for Image to JSON."
    system_dependencies = []
    input_mime_types = ["image/jpeg", "image/png"]

    def __init__(self):
        super().__init__()
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        logger.info(f"Using device: {self.device.type}")
        # float16 halves GPU memory; fall back to float32 on CPU, which lacks
        # half-precision kernels for most ops
        torch_dtype = torch.float32 if self.device.type == "cpu" else torch.float16
        self.processor = AutoProcessor.from_pretrained(model_settings.peft_model_id)

        # Load the base model with adapters on top
        self.model = Idefics2ForConditionalGeneration.from_pretrained(
            model_settings.peft_model_id,
            torch_dtype=torch_dtype,
        )

        # Get the resized input embeddings (the fine-tune added special tokens
        # to the vocabulary). Cast to the model's dtype so the swapped-in
        # module matches the rest of the weights.
        filepath = hf_hub_download(
            repo_id="nielsr/idefics2-embeddings",
            filename="input_embeddings.pt",
            repo_type="dataset",
        )
        input_embeddings = torch.load(filepath, map_location="cpu").to(torch_dtype)
        input_embeddings_module = torch.nn.Embedding(
            input_embeddings.shape[0], input_embeddings.shape[1], _weight=input_embeddings
        )

        # Get the resized output embeddings (the LM head)
        filepath = hf_hub_download(
            repo_id="nielsr/idefics2-embeddings",
            filename="output_embeddings.pt",
            repo_type="dataset",
        )
        output_embeddings = torch.load(filepath, map_location="cpu").to(torch_dtype)
        # nn.Linear stores its weight as (out_features, in_features), so the
        # constructor arguments are (hidden_size, vocab_size) here; the weight
        # must be wrapped in nn.Parameter before assignment.
        output_embeddings_module = torch.nn.Linear(
            output_embeddings.shape[1], output_embeddings.shape[0], bias=False
        )
        output_embeddings_module.weight = torch.nn.Parameter(output_embeddings)

        # Resize the token embeddings to match the tokenizer, then swap in the
        # fine-tuned embedding and LM-head weights
        self.model.resize_token_embeddings(len(self.processor.tokenizer))
        self.model.set_input_embeddings(input_embeddings_module)
        self.model.set_output_embeddings(output_embeddings_module)

        self.model.to(self.device)

    def extract(self, content: Content, params=None) -> List[Union[Feature, Content]]:
        image = Image.open(BytesIO(content.data))

        # Prepare the image and prompt for the model
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract JSON."},
                    {"type": "image"},
                ],
            },
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt").to(self.device)

        # Generate token IDs
        generated_ids = self.model.generate(**inputs, max_new_tokens=768)
        # Decode back into text
        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert the tagged token sequence into a nested JSON structure
        added_vocab = self.processor.tokenizer.get_added_vocab()
        generated_json = token2json(generated_texts[0], added_vocab)
        return [Content.from_text(str(generated_json))]

    def sample_input(self) -> Content:
        filepath = "sample.jpg"
        with open(filepath, "rb") as f:
            image_data = f.read()
        return Content(content_type="image/jpeg", data=image_data)


if __name__ == "__main__":
    filepath = "sample.jpg"
    with open(filepath, "rb") as f:
        image_data = f.read()
    data = Content(content_type="image/jpeg", data=image_data)
    extractor = ReceiptExtractor()
    results = extractor.extract(data)
    print(results)