# pdf-extractor / app.py
# Uploaded by kryman27 -- commit cbec0a2 ("Update app.py", verified), 2.76 kB.
import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, AutoTokenizer
# Load the fine-tuned token-classification model and its tokenizer once,
# at import time, so every request reuses the same instances.
# NOTE(review): the checkpoint name says LayoutLMv3, but it is loaded through
# LayoutLMForTokenClassification -- confirm the checkpoint matches this class.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name) # Corrected version
# Rules for detecting the tax ID (NIP), amounts and dates in the invoice text.
# NIP: ten digits, optionally prefixed with "PL" (and one optional space).
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
# Amount: digits with at most one "." or "," separator followed by more digits.
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
# Date written as two digits, two digits and four digits separated by dots,
# e.g. "31.12.2024".
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Polish phrases marking the line that carries the payment date. They must be
# lower-case because each line is lower-cased before the substring check.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
def _label_words(words):
    """Classify *words* with the model and return (word, label) pairs.

    Only words whose predicted class id is greater than 0 are returned
    (class 0 is treated as background). The tokenizer emits special tokens
    and may split one word into several pieces, so predictions are mapped
    back to words through ``word_ids()`` and only the first piece of each
    word decides its label. ``word_ids()`` is only available on fast
    tokenizers.
    """
    if not words:
        # Nothing to classify (e.g. a scanned PDF without a text layer).
        return []

    encoding = tokenizer(
        words, is_split_into_words=True, return_tensors="pt", truncation=True
    )
    outputs = model(**encoding)
    # Index the single batch item explicitly; squeeze() would collapse a
    # one-token sequence into a scalar.
    token_preds = outputs.logits.argmax(-1)[0].tolist()

    entities = []
    previous_word_id = None
    for word_id, pred in zip(encoding.word_ids(batch_index=0), token_preds):
        # word_id is None for special tokens; repeated ids are continuation
        # pieces of a word that was already handled.
        is_first_piece = word_id is not None and word_id != previous_word_id
        previous_word_id = word_id
        if is_first_piece and pred > 0:
            entities.append((words[word_id], model.config.id2label[pred]))
    return entities


def _find_total_amount(text):
    """Return the largest amount-like number in *text* as a float, or None."""
    matches = kwota_pattern.findall(text)
    # Prefer numbers written with a decimal separator. Bare integers include
    # the 10-digit NIP, years and document numbers, which would otherwise
    # always win the max() below. Fall back to all numbers if none qualify.
    with_separator = [raw for raw in matches if "," in raw or "." in raw]
    amounts = []
    for raw in with_separator or matches:
        normalized = raw.replace(",", ".")
        if normalized.replace(".", "").isdigit():
            amounts.append(float(normalized))
    return max(amounts) if amounts else None


def _find_payment_date(text):
    """Return the first date found on a line containing a payment keyword, or None."""
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in payment_keywords):
            date_match = data_pattern.search(line)
            if date_match:
                return date_match.group()
    return None


def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from a PDF invoice.

    Args:
        pdf_file: The uploaded PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". A field that could not be determined holds the
        string "Nie znaleziono"; the total amount is a float when found.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract every page exactly once; pages without text are skipped.
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "\n".join(text for text in page_texts if text)

    # The model works on whitespace-separated words.
    entities = _label_words(full_text.split())
    seller_name = [word for word, label in entities if "ORG" in label]
    seller_nip = nip_pattern.search(full_text)
    total_amount = _find_total_amount(full_text)
    payment_date = _find_payment_date(full_text)

    not_found = "Nie znaleziono"
    return {
        "Sprzedawca": " ".join(seller_name) if seller_name else not_found,
        "NIP": seller_nip.group() if seller_nip else not_found,
        # Compare against None so that a genuine 0.0 total is still reported.
        "Kwota całkowita": total_amount if total_amount is not None else not_found,
        "Data płatności": payment_date if payment_date else not_found,
    }
# User interface: one PDF upload whose extraction result is rendered as JSON.
iface = gr.Interface(
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności.",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()