pdf-extractor / app.py
kryman27's picture
Update app.py
3e4d13c verified
raw
history blame
910 Bytes
import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast
# Load the fine-tuned token-classification model and its matching tokenizer
# at module level, both from the same checkpoint identifier.
# NOTE(review): the checkpoint name refers to LayoutLMv3, but the classes used
# below are the LayoutLM ones (not the LayoutLMv3 variants) — confirm that the
# checkpoint is actually compatible with these classes.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
# Rule-based helpers for spotting tax IDs (NIP), amounts and dates in text.

# Polish phrases that mention a payment or a payment date
# ("payment date", "payment deadline", "payment").
payment_keywords = [
    "data płatności",
    "termin płatności",
    "zapłata",
    "płatność",
]

# Date: two digits, dot, two digits, dot, four digits (e.g. 01.02.2024).
data_pattern = re.compile(r"\b\d{2}\.\d{2}\.\d{4}\b")

# Amount: a run of digits, optionally followed by one '.' or ',' and
# further digits. Deliberately loose: any standalone number matches.
kwota_pattern = re.compile(r"\b\d+[\.,]?\d*\b")

# NIP: exactly ten digits, optionally preceded by "PL" and at most one
# whitespace character.
nip_pattern = re.compile(r"\bPL\s?\d{10}\b|\b\d{10}\b")
def extract_invoice_data(pdf_file):
with pdfplumber.open(pdf_file) as pdf:
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
# Tokenizacja danych z uwzględnieniem układu dokumentu
tokens = tokenizer(full_text, return_tensors="pt