Spaces:

kryman27
/

pdf-extractor

Running

File size: 910 Bytes

07d0354
814c19e
061d5cb
3e4d13c
10213d3
3e4d13c
 
 
 
814c19e
bf3bfc2
3e4d13c
 
 
1379608
99ddfcc
 
814c19e
159c760
07d0354
3e4d13c

import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast

# Load the fine-tuned token-classification model and its tokenizer.
# These statements run at module import, so both objects are created once and
# shared by everything below in this file.
# NOTE(review): the checkpoint name says "layoutlmv3", but the classes used
# here are the original LayoutLM ones (LayoutLMForTokenClassification /
# LayoutLMTokenizerFast), not the LayoutLMv3 variants — confirm that this
# pairing is intended and that the checkpoint loads correctly with them.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)

# Rule-based helpers for spotting tax IDs (NIP), amounts and dates in text.

# NIP: exactly ten digits on word boundaries, optionally prefixed with "PL"
# and at most one whitespace character.
nip_pattern = re.compile(r"\bPL\s?\d{10}\b|\b\d{10}\b")

# Amount: a run of digits, optionally followed by "." or "," and more digits.
# Note that this also matches plain integers.
kwota_pattern = re.compile(r"\b\d+[\.,]?\d*\b")

# Date: two digits, two digits and four digits separated by dots,
# e.g. "31.12.2024".
data_pattern = re.compile(r"\b\d{2}\.\d{2}\.\d{4}\b")

# Polish phrases related to payment ("payment date", "payment deadline",
# "payment"); how they are applied is not visible in this part of the file.
payment_keywords = [
    "data płatności",
    "termin płatności",
    "zapłata",
    "płatność",
]

def extract_invoice_data(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())

    # Tokenizacja danych z uwzględnieniem układu dokumentu
    tokens = tokenizer(full_text, return_tensors="pt