Spaces:
Running
Running
File size: 910 Bytes
07d0354 814c19e 061d5cb 3e4d13c 10213d3 3e4d13c 814c19e bf3bfc2 3e4d13c 1379608 99ddfcc 814c19e 159c760 07d0354 3e4d13c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast
# Load the fine-tuned token-classification model and its tokenizer.
# NOTE(review): the checkpoint name (and the original comment) say "LayoutLMv3",
# but the classes imported above are LayoutLMForTokenClassification and
# LayoutLMTokenizerFast — confirm the checkpoint matches these classes.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
# Regex rules for detecting NIP numbers, amounts, and dates.
# NIP: exactly ten consecutive digits, optionally preceded by "PL" and at most
# one whitespace character. Hyphen- or space-separated digit groups do not match.
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
# Amount: one or more digits, optionally followed by a single "." or "," and
# further digits. This also matches any plain integer in the text.
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
# Date: two digits, two digits, four digits, separated by dots (e.g. 31.12.2024).
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Polish payment-related phrases ("payment date", "payment deadline", "payment").
# How they are matched against the text is not visible in this section.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
def extract_invoice_data(pdf_file):
with pdfplumber.open(pdf_file) as pdf:
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
# Tokenizacja danych z uwzględnieniem układu dokumentu
tokens = tokenizer(full_text, return_tensors="pt
|