Spaces:

kryman27
/

pdf-extractor

Running

File size: 2,604 Bytes

07d0354
814c19e
061d5cb
3e4d13c
10213d3
3e4d13c
 
 
 
814c19e
bf3bfc2
3e4d13c
 
 
1379608
99ddfcc
 
814c19e
159c760
07d0354
3e4d13c
ce4e81b

import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast

# Load the fine-tuned token-classification model and its matching tokenizer
# once, at import time, so every request reuses the same instances.
# NOTE(review): the checkpoint name mentions LayoutLMv3, but it is loaded with
# the LayoutLM (v1) classes — confirm the checkpoint is compatible with them.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)

# Rules for detecting the NIP (Polish tax identification number), amounts and
# dates in the extracted invoice text.
#
# A NIP is ten digits, optionally prefixed with "PL". Besides the contiguous
# form it is commonly printed with hyphens, grouped either 3-3-2-2 or 3-2-2-3.
# The contiguous alternatives come first so previously matched inputs still
# produce exactly the same match.
nip_pattern = re.compile(
    r'\bPL\s?\d{10}\b|\b\d{10}\b'
    r'|\b(?:PL\s?)?\d{3}-\d{3}-\d{2}-\d{2}\b'
    r'|\b(?:PL\s?)?\d{3}-\d{2}-\d{2}-\d{3}\b'
)
# Any number, with an optional single "." or "," decimal separator.
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
# Dates written as DD.MM.YYYY.
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Lower-case phrases that mark a line as describing the payment date.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]

# A monetary amount: digits, one decimal separator, exactly two decimals.
_DECIMAL_AMOUNT_RE = re.compile(r'\d+[.,]\d{2}')

# Placeholder reported for every field that could not be extracted.
_NOT_FOUND = "Nie znaleziono"


def _read_pdf_text(pdf_file):
    """Return the text of every page that has any, joined by newlines."""
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page once; pages without text yield None/"" and are skipped.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(text for text in page_texts if text)


def _merge_wordpieces(tokens):
    """Join tokens with spaces, gluing "##" continuation pieces to the previous token."""
    words = []
    for piece in tokens:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return " ".join(words)


def _pick_total_amount(text):
    """Return the largest amount found in *text* as a float, or None."""
    # Remove dates first so "12.03.2024" is not read as the amount 12.03.
    candidates = kwota_pattern.findall(data_pattern.sub(" ", text))
    # Prefer values that look like money (two decimals); otherwise identifiers
    # such as a NIP or an account number would always win the max().
    monetary = [c for c in candidates if _DECIMAL_AMOUNT_RE.fullmatch(c)]
    # Fall back to every number only when no two-decimal amount is present.
    chosen = monetary or candidates
    return max((float(c.replace(",", ".")) for c in chosen), default=None)


def _find_payment_date(text):
    """Return the first DD.MM.YYYY date on a line mentioning payment, or None."""
    for line in text.split("\n"):
        lowered = line.lower()
        if not any(keyword in lowered for keyword in payment_keywords):
            continue
        date_match = data_pattern.search(line)
        if date_match:
            return date_match.group()
    return None


def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from a PDF invoice.

    The seller name comes from the token-classification model (tokens whose
    predicted label contains "ORG"); the remaining fields are found with the
    module-level regular expressions applied to the full extracted text.

    Args:
        pdf_file: The uploaded PDF as accepted by ``pdfplumber.open``, or
            ``None`` when nothing was uploaded.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". A field that could not be determined holds the
        string "Nie znaleziono".
    """
    # Local import: the module itself does not import torch at the top level.
    import torch

    if pdf_file is None:
        # Nothing uploaded: report every field as missing instead of crashing.
        return {
            "Sprzedawca": _NOT_FOUND,
            "NIP": _NOT_FOUND,
            "Kwota całkowita": _NOT_FOUND,
            "Data płatności": _NOT_FOUND,
        }

    full_text = _read_pdf_text(pdf_file)

    # The tokenizer truncates long input, so the model may only see the start
    # of a long document; the regex rules below still scan the full text.
    tokens = tokenizer(full_text, return_tensors="pt", truncation=True)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**tokens)
    # One document per call, so take the first (only) batch row explicitly.
    predictions = outputs.logits.argmax(-1)[0].tolist()

    # Label id 0 is treated as background; keep organisation tokens only.
    seller_tokens = [
        token
        for token, pred in zip(tokens.tokens(), predictions)
        if pred > 0 and "ORG" in model.config.id2label[pred]
    ]
    seller_name = _merge_wordpieces(seller_tokens)

    seller_nip = nip_pattern.search(full_text)
    total_amount = _pick_total_amount(full_text)
    payment_date = _find_payment_date(full_text)

    return {
        "Sprzedawca": seller_name if seller_name else _NOT_FOUND,
        "NIP": seller_nip.group() if seller_nip else _NOT_FOUND,
        # Compare with None so a legitimate total of 0.0 is still reported.
        "Kwota całkowita": total_amount if total_amount is not None else _NOT_FOUND,
        "Data płatności": payment_date if payment_date else _NOT_FOUND,
    }

# User interface: one PDF upload field whose result is rendered as JSON.
_pdf_input = gr.File(label="Wybierz plik PDF")

iface = gr.Interface(
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności.",
    fn=extract_invoice_data,
    inputs=_pdf_input,
    outputs="json",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()