Spaces:

kryman27
/

pdf-extractor

Running

File size: 3,046 Bytes

07d0354
814c19e
061d5cb
b1d3718
b82e672
10213d3
3e4d13c
 
 
b82e672
814c19e
bf3bfc2
3e4d13c
 
 
1379608
99ddfcc
 
814c19e
b82e672
07d0354
b82e672
 
 
 
 
 
 
 
 
ce4e81b
 
b82e672
 
ce4e81b
 
 
 
b82e672
ce4e81b
 
 
 
 
b82e672
 
ce4e81b
 
 
 
 
b82e672
 
 
 
 
 
 
ce4e81b

import re

import gradio as gr
import pdfplumber
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    LayoutLMForTokenClassification,
)

# Load the fine-tuned token-classification checkpoint.
# AutoModelForTokenClassification resolves the architecture from the
# checkpoint's own config (just like AutoTokenizer below), so a LayoutLMv3
# checkpoint is not forced into the older LayoutLM (v1) class, whose parameter
# names differ and would leave the weights uninitialised.
model_name = "kryman27/layoutlmv3-finetuned"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer class is auto-detected

# Rules for detecting the NIP (tax identification number), amounts and dates.

# NIP: ten digits, optionally prefixed with "PL", written either contiguously
# or in the dashed groupings 123-456-78-90 / 123-45-67-890.
nip_pattern = re.compile(
    r'\b(?:PL\s?)?(?:\d{10}|\d{3}-\d{3}-\d{2}-\d{2}|\d{3}-\d{2}-\d{2}-\d{3})\b'
)

# Amount: digits, one decimal separator (dot or comma) and exactly two decimal
# digits. Requiring the decimal part keeps bare integers (NIP, years, invoice
# numbers) out of the candidates, and the look-arounds reject fragments of
# dates such as "12.03" in "12.03.2024". Thousands separators are deliberately
# not matched: the matched text must stay convertible with a single
# "," -> "." replacement.
kwota_pattern = re.compile(r'(?<![\d.,])\d+[.,]\d{2}(?![.,]?\d)')

# Date: day-month-year or year-month-day with ".", "-" or "/" as separator.
data_pattern = re.compile(
    r'\b(?:\d{2}[.\-/]\d{2}[.\-/]\d{4}|\d{4}[.\-/]\d{2}[.\-/]\d{2})\b'
)

# Lower-case phrases that introduce the payment date on an invoice.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]

_NOT_FOUND = "Nie znaleziono"
# How many characters after a payment keyword are scanned for a date.
_PAYMENT_DATE_WINDOW = 40


def _normalize_box(word, page_width, page_height):
    """Scale a pdfplumber word box to the 0-1000 integer grid used by LayoutLM."""

    def scale(value, extent):
        if not extent:
            return 0
        # Clamp so rounding or off-page coordinates never leave the grid.
        return max(0, min(1000, int(1000 * float(value) / float(extent))))

    return [
        scale(word["x0"], page_width),
        scale(word["top"], page_height),
        scale(word["x1"], page_width),
        scale(word["bottom"], page_height),
    ]


def _read_words_and_boxes(pdf_file):
    """Return the text and the normalised bounding box of every word in the PDF."""
    words, boxes = [], []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            for word in page.extract_words():
                words.append(word["text"])
                boxes.append(_normalize_box(word, page.width, page.height))
    return words, boxes


def _predict_word_labels(words, boxes):
    """Return one predicted label id per word (None for words cut off by truncation)."""
    encoding = tokenizer(words, boxes=boxes, is_split_into_words=True, return_tensors="pt", truncation=True)

    with torch.no_grad():
        logits = model(**encoding).logits
    # Index the batch dimension explicitly instead of squeeze(), which would
    # also collapse the sequence dimension for a one-token sequence.
    token_predictions = logits.argmax(-1)[0].tolist()

    # The model predicts per token (special tokens and sub-word pieces
    # included); map back to words and keep the first piece's prediction.
    # word_ids() is only available on fast tokenizers.
    word_predictions = [None] * len(words)
    for token_index, word_index in enumerate(encoding.word_ids(batch_index=0)):
        if word_index is not None and word_predictions[word_index] is None:
            word_predictions[word_index] = token_predictions[token_index]
    return word_predictions


def _parse_amount(raw):
    """Convert a matched amount such as "1234,56" to float; None if not numeric."""
    try:
        return float(raw.replace(",", "."))
    except ValueError:
        return None


def _find_payment_date(text):
    """Return the first date found shortly after a payment keyword, or None."""
    lowered = text.lower()
    # Search the joined text rather than single words: keywords such as
    # "termin płatności" span two words and can never match one word alone.
    keyword_pattern = re.compile("|".join(re.escape(keyword) for keyword in payment_keywords))
    for hit in keyword_pattern.finditer(lowered):
        date_match = data_pattern.search(lowered, hit.end(), hit.end() + _PAYMENT_DATE_WINDOW)
        if date_match:
            return date_match.group()
    return None


def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from an invoice PDF.

    Args:
        pdf_file: Whatever the Gradio file input delivers; it is passed to
            ``pdfplumber.open`` unchanged. ``None`` (nothing uploaded) is
            accepted.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". A field that could not be determined holds
        "Nie znaleziono"; the total amount is a float otherwise.

    Notes:
        The seller name comes from the model's word-level predictions (labels
        containing "ORG"); words beyond the tokenizer's truncation limit get
        no prediction. NIP, amount and date come from the module-level regular
        expressions applied to the text of all pages.
    """
    result = {
        "Sprzedawca": _NOT_FOUND,
        "NIP": _NOT_FOUND,
        "Kwota całkowita": _NOT_FOUND,
        "Data płatności": _NOT_FOUND,
    }
    if pdf_file is None:
        return result

    words, boxes = _read_words_and_boxes(pdf_file)
    if not words:
        # E.g. a scanned PDF without a text layer: nothing to tokenize.
        return result

    # Label id 0 is treated as background, as the original code did
    # (assumes id2label[0] is the "outside" label -- confirm for this checkpoint).
    entities = [
        (word, model.config.id2label[pred])
        for word, pred in zip(words, _predict_word_labels(words, boxes))
        if pred is not None and pred > 0
    ]
    seller_name = [word for word, label in entities if "ORG" in label]
    if seller_name:
        result["Sprzedawca"] = " ".join(seller_name)

    text = " ".join(words)

    seller_nip = nip_pattern.search(text)
    if seller_nip:
        result["NIP"] = seller_nip.group()

    # The invoice total is taken to be the largest amount on the document.
    amounts = [_parse_amount(raw) for raw in kwota_pattern.findall(text)]
    amounts = [amount for amount in amounts if amount is not None]
    if amounts:
        # Compare against "no amounts", not truthiness, so 0.0 is still reported.
        result["Kwota całkowita"] = max(amounts)

    payment_date = _find_payment_date(text)
    if payment_date:
        result["Data płatności"] = payment_date

    return result

# User interface: a single PDF upload goes in, the extracted fields come back
# rendered as JSON.
iface = gr.Interface(
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności.",
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()