File size: 910 Bytes
07d0354
814c19e
061d5cb
3e4d13c
10213d3
3e4d13c
 
 
 
814c19e
bf3bfc2
3e4d13c
 
 
1379608
99ddfcc
 
814c19e
159c760
07d0354
3e4d13c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast

# Load the token-classification model and its matching tokenizer from the
# pretrained checkpoint named below. Both calls run at import time.
# NOTE(review): the checkpoint name says "layoutlmv3", but the classes used here
# are the LayoutLM ones (LayoutLMForTokenClassification / LayoutLMTokenizerFast),
# not LayoutLMv3-specific classes — confirm the checkpoint is compatible.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)

# Patterns for spotting tax IDs (NIP), amounts and dates in text.
# NIP: exactly ten digits, optionally preceded by "PL" and one whitespace char.
nip_pattern = re.compile(r"\bPL\s?\d{10}\b|\b\d{10}\b")
# Amount: a run of digits, optionally followed by "." or "," and more digits.
# Note that this also matches bare integers.
kwota_pattern = re.compile(r"\b\d+[\.,]?\d*\b")
# Date: two digits, two digits and four digits separated by dots.
data_pattern = re.compile(r"\b\d{2}\.\d{2}\.\d{4}\b")
# Polish payment-related phrases (payment date, payment deadline, payment).
payment_keywords = [
    "data płatności",
    "termin płatności",
    "zapłata",
    "płatność",
]

def extract_invoice_data(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())

    # Tokenizacja danych z uwzględnieniem układu dokumentu
    tokens = tokenizer(full_text, return_tensors="pt