File size: 2,755 Bytes
07d0354
814c19e
061d5cb
b1d3718
10213d3
3e4d13c
 
 
cbec0a2
814c19e
bf3bfc2
3e4d13c
 
 
1379608
99ddfcc
 
814c19e
159c760
07d0354
cbec0a2
 
 
ce4e81b
 
 
 
 
 
 
cbec0a2
ce4e81b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
import pdfplumber
import re
from transformers import LayoutLMForTokenClassification, AutoTokenizer

# Load the token-classification model and its tokenizer from `model_name`.
# These are module-level statements, so loading happens at import time.
# NOTE(review): the checkpoint name says LayoutLMv3, but the class used here is
# LayoutLMForTokenClassification (LayoutLM v1) — confirm that the checkpoint's
# architecture really matches this class.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)  # corrected version

# Rules for detecting the NIP (tax identification number), amounts and dates.
# NIP: ten consecutive digits, optionally prefixed with "PL".
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
# Any integer or decimal number (dot or comma as separator) — this is not
# limited to monetary amounts.
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
# Dates written as two digits, two digits and four digits separated by dots
# (e.g. 12.03.2024).
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
# Lower-case phrases that mark a line as describing the payment date. They are
# compared against lower-cased invoice lines, so they must contain the real
# Polish letters; the previous values were mis-decoded and could never match.
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]

# A monetary amount: digits (optionally grouped in thousands by a space, a
# no-break space or a dot), a decimal separator and exactly two decimals.
# The look-arounds keep fragments of longer numbers and of dotted dates
# (e.g. "12.03" inside "12.03.2024") from being taken for amounts.
_AMOUNT_PATTERN = re.compile(
    r"(?<!\d)(?<!\d[.,])"
    r"(?:\d{1,3}(?:[ \u00a0.]\d{3})+|\d+)"
    r"[.,]\d{2}"
    r"(?![.,]?\d)"
)


def _parse_amounts(text):
    """Return every money-formatted number found in *text* as a float."""
    amounts = []
    for raw in _AMOUNT_PATTERN.findall(text):
        # Every match ends with exactly two decimals, so stripping all
        # separators leaves the value expressed in hundredths.
        amounts.append(int(re.sub(r"\D", "", raw)) / 100)
    return amounts


def _tag_words(words):
    """Return ``(word, label)`` pairs for words predicted as non-background.

    The model predicts one label per sub-word token (special tokens included),
    so predictions are mapped back to whole words through the encoding's
    ``word_ids()``; each word takes the label of its first sub-word token.
    ``word_ids()`` is only available on fast tokenizers.
    """
    if not words:
        return []

    import torch  # local import: the file itself does not import torch

    encoding = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**encoding)
    # Index the single batch item explicitly instead of squeeze(), which would
    # also collapse the sequence dimension for a one-token sequence.
    predictions = outputs.logits.argmax(-1)[0].tolist()

    tagged = []
    previous_word_id = None
    for word_id, pred in zip(encoding.word_ids(batch_index=0), predictions):
        # Skip special tokens (None) and continuation pieces of the same word.
        if word_id is None or word_id == previous_word_id:
            continue
        previous_word_id = word_id
        if pred > 0:  # label id 0 is treated as background
            tagged.append((words[word_id], model.config.id2label[pred]))
    return tagged


def extract_invoice_data(pdf_file):
    """Extract seller name, NIP, total amount and payment date from a PDF invoice.

    Args:
        pdf_file: the PDF to read, passed straight to ``pdfplumber.open``.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Kwota całkowita" and
        "Data płatności". A field that could not be determined holds the
        string "Nie znaleziono".
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page once; pages without text yield None or "".
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "\n".join(text for text in page_texts if text)

    # The tokenizer is fed a list of words (is_split_into_words=True).
    entities = _tag_words(full_text.split())

    # Seller name: every word whose predicted label mentions "ORG".
    seller_name = [word for word, label in entities if "ORG" in label]
    # NIP: the first match in the document.
    seller_nip = nip_pattern.search(full_text)

    # Total: the largest money-formatted value. Plain integers such as the NIP
    # or an account number are deliberately not candidates.
    # NOTE: a space is accepted as a thousands separator, so a quantity column
    # directly followed by a price ("2 100,00") can be read as one number.
    amounts = _parse_amounts(full_text)
    total_amount = max(amounts) if amounts else None

    # Payment date: the first date on a line containing a payment keyword.
    payment_date = None
    for line in full_text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in payment_keywords):
            date_match = data_pattern.search(line)
            if date_match:
                payment_date = date_match.group()
                break

    not_found = "Nie znaleziono"
    return {
        "Sprzedawca": " ".join(seller_name) if seller_name else not_found,
        "NIP": seller_nip.group() if seller_nip else not_found,
        # Compare against None so a legitimate 0.0 total is still reported.
        "Kwota całkowita": total_amount if total_amount is not None else not_found,
        "Data płatności": payment_date if payment_date else not_found,
    }

# User interface: the uploaded PDF is passed to extract_invoice_data and the
# returned dict is rendered as JSON. The description text below was
# mis-decoded; it is restored to readable Polish.
iface = gr.Interface(
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności.",
)

# Start the web app only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()