Spaces:
Runtime error
Runtime error
| import logging | |
| import torch | |
| import os | |
| from PIL import Image | |
| from io import BytesIO | |
| from transformers import AutoProcessor, Idefics2ForConditionalGeneration | |
| from huggingface_hub import hf_hub_download | |
| from indexify_extractor_sdk import Content, Extractor, Feature | |
| from .parse_utils import token2json | |
| from pydantic import BaseModel | |
| from pydantic_settings import BaseSettings | |
| from typing import Optional, Literal, List, Union | |
# Module-level logger plus the Hugging Face token pulled from the environment.
logger = logging.getLogger(__name__)
token = os.getenv("HF_TOKEN")
class ModelSettings(BaseSettings):
    """Runtime configuration; environment variables override these defaults."""

    # Hugging Face Hub repo id of the fine-tuned PEFT checkpoint.
    peft_model_id: str = "nielsr/idefics2-cord-demo"
    # Optional HF access token (read from the HF_TOKEN environment variable);
    # may be None for public repositories.
    hf_token: Optional[str] = token


model_settings = ModelSettings()
class ReceiptExtractor(Extractor):
    """Indexify extractor that converts a receipt image into structured JSON
    using an Idefics2 model fine-tuned for image-to-JSON extraction."""

    name = "tensorlake/idefics2json"
    description = "Finetuned Idefics2 for Image to JSON."
    system_dependencies = []
    input_mime_types = ["image/jpeg", "image/png"]

    def __init__(self):
        super().__init__()
        # BUG FIX: remember the selected device so the model and inputs can be
        # moved consistently. Previously the code hard-coded .to("cuda") below,
        # crashing on CPU-only hosts despite this very check.
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        logger.info(f"Using device: {self.device.type}")
        # fp16 only makes sense on GPU; stay in fp32 on CPU.
        torch_dtype = torch.float32 if self.device.type == "cpu" else torch.float16
        self.processor = AutoProcessor.from_pretrained(model_settings.peft_model_id)
        # Load the base model with adapters on top
        self.model = Idefics2ForConditionalGeneration.from_pretrained(
            model_settings.peft_model_id,
            torch_dtype=torch_dtype,
        )
        # get the resized input embeddings published alongside the checkpoint
        filepath = hf_hub_download(repo_id="nielsr/idefics2-embeddings", filename="input_embeddings.pt", repo_type="dataset")
        input_embeddings = torch.load(filepath, map_location="cpu")
        input_embeddings_module = torch.nn.Embedding(input_embeddings.shape[0], input_embeddings.shape[1], _weight=input_embeddings)
        # get the resized output embeddings (the LM head weight matrix)
        filepath = hf_hub_download(repo_id="nielsr/idefics2-embeddings", filename="output_embeddings.pt", repo_type="dataset")
        output_embeddings = torch.load(filepath, map_location="cpu")
        # BUG FIX: nn.Linear takes (in_features, out_features) while the weight
        # tensor is laid out (out_features, in_features) — the args were swapped.
        output_embeddings_module = torch.nn.Linear(output_embeddings.shape[1], output_embeddings.shape[0], bias=False)
        # BUG FIX: assigning a raw tensor to .weight raises TypeError at runtime;
        # module parameters must be wrapped in nn.Parameter.
        output_embeddings_module.weight = torch.nn.Parameter(output_embeddings)
        # install the resized embedding matrices on the model
        self.model.resize_token_embeddings(len(self.processor.tokenizer))
        self.model.set_input_embeddings(input_embeddings_module)
        self.model.set_output_embeddings(output_embeddings_module)
        # BUG FIX: move to the configured device, not hard-coded "cuda".
        self.model.to(self.device)

    def extract(self, content: Content, params=None) -> List[Union[Feature, Content]]:
        """Run the model on one receipt image and return its JSON as text content.

        Args:
            content: Indexify Content wrapping raw JPEG/PNG image bytes.
            params: Unused; kept for interface compatibility.

        Returns:
            A single-element list with the parsed JSON (stringified) as Content.
        """
        image = Image.open(BytesIO(content.data))
        # Build a single-turn chat prompt: instruction text plus one image slot.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract JSON."},
                    {"type": "image"},
                ]
            },
        ]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        # BUG FIX: inputs follow the configured device instead of hard-coded "cuda",
        # so inference also works on CPU-only hosts.
        inputs = self.processor(text=prompt, images=[image], return_tensors="pt").to(self.device)
        # Generate token IDs
        generated_ids = self.model.generate(**inputs, max_new_tokens=768)
        # Decode back into text, then parse the task tokens into a JSON-like dict
        generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        added_vocab = self.processor.tokenizer.get_added_vocab()
        generated_json = token2json(generated_texts[0], added_vocab)
        return [Content.from_text(str(generated_json))]

    def sample_input(self) -> Content:
        """Load a local sample image for smoke-testing the extractor."""
        filepath = "sample.jpg"
        with open(filepath, 'rb') as f:
            image_data = f.read()
        # BUG FIX: "image/jpg" is not a registered MIME type; the extractor only
        # declares "image/jpeg" in input_mime_types, so use that.
        return Content(content_type="image/jpeg", data=image_data)
if __name__ == "__main__":
    # Manual smoke test: run the extractor once over a local sample image.
    filepath = "sample.jpg"
    with open(filepath, 'rb') as f:
        image_data = f.read()
    # BUG FIX: "image/jpg" is not a valid MIME type; use "image/jpeg" so the
    # sample matches the extractor's declared input_mime_types.
    data = Content(content_type="image/jpeg", data=image_data)
    extractor = ReceiptExtractor()
    results = extractor.extract(data)
    print(results)