# Spaces: Running
import re

import gradio as gr
import pdfplumber
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    LayoutLMForTokenClassification,
)
# Load the fine-tuned token-classification checkpoint and its tokenizer.
model_name = "kryman27/layoutlmv3-finetuned"
# The architecture is resolved from the checkpoint's own config instead of
# being hard-coded: the checkpoint name says LayoutLMv3, and forcing the
# LayoutLM (v1) class onto a v3 checkpoint would leave most weights
# uninitialised. For a genuine v1 checkpoint the same v1 class is selected.
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer class is auto-detected
# Rules for detecting the NIP (tax identification number), amounts and dates.
# Matching is case-insensitive so the rules also work on lowercased text.
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b', re.IGNORECASE)
# Group 1 is the number, group 2 the optional currency marker. Capturing the
# number is essential: with only the currency captured, findall() would return
# currency strings and drop the amounts entirely.
kwota_pattern = re.compile(r'\b(\d+(?:[.,]\d+)?)\s?(PLN|zł|EUR|USD)?\b', re.IGNORECASE)
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')  # DD.MM.YYYY format
# Lowercase phrases that introduce the payment date / the seller name.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "zapłacono", "płatność"]
seller_keywords = ["sprzedawca", "faktura wystawiona przez", "wystawca", "nazwa firmy"]
# Leading numeric part of an amount match, e.g. "123,45" in "123,45 PLN".
_LEADING_NUMBER = re.compile(r"\d+(?:[.,]\d+)?")


def _normalize_bbox(word, page_width, page_height):
    """Scale a pdfplumber word box to the 0-1000 integer grid, clamped to range."""

    def scale(value, extent):
        if not extent:
            return 0
        return max(0, min(1000, int(1000 * float(value) / float(extent))))

    return [
        scale(word["x0"], page_width),
        scale(word["top"], page_height),
        scale(word["x1"], page_width),
        scale(word["bottom"], page_height),
    ]


def _read_pdf(pdf_file):
    """Return (words, normalized boxes, full text) collected from every page."""
    words, boxes, page_texts = [], [], []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            for word in page.extract_words():
                words.append(word["text"])
                boxes.append(_normalize_bbox(word, page.width, page.height))
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return words, boxes, "\n".join(page_texts)


def _predict_entities(words, boxes):
    """Run the model and return (word, label) pairs for non-background words."""
    if not words:
        # Nothing to classify (e.g. a scanned PDF without a text layer).
        return []

    # Tokenize the words together with their bounding boxes.
    # NOTE: truncation=True drops words beyond the model's maximum length.
    encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**encoding)
    # Index the single batch item explicitly; squeeze() would also collapse
    # a length-1 sequence into a scalar.
    predictions = outputs.logits.argmax(-1)[0].tolist()

    if getattr(tokenizer, "is_fast", False):
        # Predictions are per sub-token (including special tokens); word_ids
        # maps each of them back to the index of the source word.
        word_ids = encoding.word_ids(0)
    else:
        # Slow tokenizers expose no alignment; pair positionally (approximate).
        word_ids = range(min(len(words), len(predictions)))

    entities, labelled = [], set()
    for word_id, label_id in zip(word_ids, predictions):
        if word_id is None or word_id in labelled:
            continue  # special token, or a later piece of an already-labelled word
        labelled.add(word_id)
        if label_id > 0:  # label 0 is treated as background
            entities.append((words[word_id], model.config.id2label[label_id]))
    return entities


def _find_seller_in_text(text):
    """Return the text after the last ':' on the first seller-keyword line, or None."""
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in seller_keywords):
            candidate = line.split(":")[-1].strip()
            if candidate:
                return candidate
    return None


def _find_total_amount(text):
    """Return the largest amount that carries a currency marker, or None."""
    amounts = []
    for match in kwota_pattern.finditer(text):
        raw = match.group(0)
        number = _LEADING_NUMBER.match(raw)
        if number is None:
            continue
        # Whatever follows the digits is the optional currency marker; bare
        # numbers (NIP digits, date fragments, quantities) are not amounts.
        if not any(ch.isalpha() for ch in raw[number.end():]):
            continue
        amounts.append(float(number.group(0).replace(",", ".")))
    return max(amounts, default=None)


def _find_payment_date(text):
    """Return the first DD.MM.YYYY date found on a payment-keyword line, or None."""
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in payment_keywords):
            found = data_pattern.search(line)
            if found:
                return found.group()
    return None


def extract_invoice_data(pdf_file):
    """Extract the seller name, NIP, total amount and payment date from a PDF invoice.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open``; ``None`` is
            treated as an empty document.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". Each value is the extracted data, or
        "Nie znaleziono" when nothing was found.
    """
    not_found = "Nie znaleziono"

    if pdf_file is None:
        words, boxes, full_text = [], [], ""
    else:
        words, boxes, full_text = _read_pdf(pdf_file)

    # Seller: prefer the model's ORG words, otherwise fall back to keyword
    # search. Both paths produce a plain string, so the result is never
    # joined character by character.
    entities = _predict_entities(words, boxes)
    seller_words = [token for token, label in entities if "ORG" in label]
    seller = " ".join(seller_words) if seller_words else _find_seller_in_text(full_text)

    # The regex rules run on the original-case text (keywords are compared
    # against a lowercased copy of each line inside the helpers).
    nip_match = nip_pattern.search(full_text)
    total_amount = _find_total_amount(full_text)
    payment_date = _find_payment_date(full_text)

    return {
        "Sprzedawca": seller if seller else not_found,
        "NIP": nip_match.group() if nip_match else not_found,
        # Compare against None so that a legitimate 0.0 is still reported.
        "Kwota całkowita": total_amount if total_amount is not None else not_found,
        "Data płatności": payment_date if payment_date else not_found,
    }
# User interface: a single PDF upload in, the extracted fields out as JSON.
# The description string had mis-encoded Polish characters; they are restored.
iface = gr.Interface(
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności.",
)

if __name__ == "__main__":
    iface.launch()