import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast
# Load the fine-tuned model and its tokenizer from the Hugging Face Hub
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
# Regex rules for detecting NIP (tax ID) numbers, amounts, and dates
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Polish keywords related to the payment date / payment deadline
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
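# Illustrative examples (assumptions, not taken from the original file) of what
# the rules above match in the extracted invoice text:
#   nip_pattern      -> "PL1234567890", "PL 1234567890", or a bare "1234567890"
#   kwota_pattern    -> numbers such as "1234,56", "199.00", or "42"
#   data_pattern     -> dates in DD.MM.YYYY form, e.g. "15.03.2024"
#   payment_keywords -> Polish phrases for "payment date", "payment deadline",
#                       "payment", presumably used later to locate the due-date line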
def extract_invoice_data(pdf_file):
    # Extract the text layer of every page (pages without a text layer return None and are skipped)
    with pdfplumber.open(pdf_file) as pdf:
        full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())

    # Tokenize the extracted document text for the model
    tokens = tokenizer(full_text, return_tensors="pt", truncation=True)