Spaces:
Running
Running
File size: 3,046 Bytes
07d0354 814c19e 061d5cb b1d3718 b82e672 10213d3 3e4d13c b82e672 814c19e bf3bfc2 3e4d13c 1379608 99ddfcc 814c19e b82e672 07d0354 b82e672 ce4e81b b82e672 ce4e81b b82e672 ce4e81b b82e672 ce4e81b b82e672 ce4e81b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import re

import gradio as gr
import pdfplumber
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    LayoutLMForTokenClassification,
)
# Load the fine-tuned LayoutLM token-classification model.
# NOTE(review): the checkpoint name says "layoutlmv3", but the original code
# loaded it through the v1 class LayoutLMForTokenClassification. The Auto class
# resolves the correct architecture from the checkpoint's config, so it works
# for either model version.
model_name = "kryman27/layoutlmv3-finetuned"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # auto-detects the matching tokenizer

# Regex rules for spotting a Polish NIP (optionally prefixed with "PL"),
# numeric amounts, and dates in DD.MM.YYYY form.
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Keywords that typically precede the payment date on Polish invoices.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from an invoice PDF.

    Args:
        pdf_file: path-like / file object accepted by ``pdfplumber.open``
            (Gradio passes the uploaded file here).

    Returns:
        dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności"; each value falls back to "Nie znaleziono".
    """
    words, boxes = [], []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # LayoutLM expects bounding boxes as integers on a 0-1000 grid;
            # pdfplumber yields raw page coordinates, so scale per page and
            # clamp to guard against rounding just past the page edge.
            x_scale = 1000.0 / float(page.width)
            y_scale = 1000.0 / float(page.height)
            for word in page.extract_words():
                words.append(word['text'])
                boxes.append([
                    min(1000, max(0, int(word['x0'] * x_scale))),
                    min(1000, max(0, int(word['top'] * y_scale))),
                    min(1000, max(0, int(word['x1'] * x_scale))),
                    min(1000, max(0, int(word['bottom'] * y_scale))),
                ])

    # Guard: a scanned/empty PDF yields no words; the tokenizer would fail.
    if not words:
        return {
            "Sprzedawca": "Nie znaleziono",
            "NIP": "Nie znaleziono",
            "Kwota całkowita": "Nie znaleziono",
            "Data płatności": "Nie znaleziono"
        }

    # Tokenize the words together with their bounding boxes.
    tokens = tokenizer(words, boxes=boxes, is_split_into_words=True,
                       return_tensors="pt", truncation=True)

    # Model inference (no gradients needed).
    with torch.no_grad():
        outputs = model(**tokens)
    predictions = outputs.logits.argmax(-1).squeeze(0).tolist()

    # Map sub-word predictions back to whole words. The original
    # zip(words, predictions) misaligned labels as soon as any word was split
    # into several sub-word tokens (and special tokens shifted things further);
    # word_ids() gives the word index of each token, so take the first token's
    # prediction per word.
    entities = []
    labeled_words = set()
    for token_idx, word_id in enumerate(tokens.word_ids(0)):
        if word_id is None or word_id in labeled_words:
            continue
        labeled_words.add(word_id)
        pred = predictions[token_idx]
        if pred > 0:  # label 0 is assumed to be the "O"/background class — TODO confirm
            entities.append((words[word_id], model.config.id2label[pred]))

    full_text = " ".join(words)

    # Key fields: seller = all ORG-labeled words, NIP/amounts/date via regex.
    seller_name = [tok for tok, label in entities if "ORG" in label]
    seller_nip = nip_pattern.search(full_text)

    kwoty = []
    for raw in kwota_pattern.findall(full_text):
        try:
            kwoty.append(float(raw.replace(",", ".")))
        except ValueError:
            pass  # not a parseable number (e.g. stray punctuation match)
    # Heuristic: the invoice total is assumed to be the largest amount found.
    total_amount = max(kwoty) if kwoty else None

    # Payment date: look for a date in the word right after a payment keyword.
    payment_date = None
    for i, word in enumerate(words):
        if any(keyword in word.lower() for keyword in payment_keywords):
            if i + 1 < len(words):
                date_match = data_pattern.search(words[i + 1])
                if date_match:
                    payment_date = date_match.group()
                    break

    return {
        "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
        "NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
        "Kwota całkowita": total_amount if total_amount else "Nie znaleziono",
        "Data płatności": payment_date if payment_date else "Nie znaleziono"
    }
# User interface: upload a PDF invoice, receive the extracted fields as JSON.
# All user-facing labels are intentionally in Polish.
iface = gr.Interface(
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności."
)

# Launch the Gradio app only when run as a script (Spaces also imports this module).
if __name__ == "__main__":
    iface.launch()
|