import gradio as gr import pdfplumber import re from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast # Wczytanie modelu LayoutLMv3 model_name = "kryman27/layoutlmv3-finetuned" model = LayoutLMForTokenClassification.from_pretrained(model_name) tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name) # Reguły do wykrywania NIP, kwot, dat nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b') kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b') data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b') payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"] def extract_invoice_data(pdf_file): with pdfplumber.open(pdf_file) as pdf: full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text()) # Tokenizacja danych z uwzględnieniem układu dokumentu tokens = tokenizer(full_text, return_tensors="pt