File size: 2,240 Bytes
bb9eeeb
 
 
 
cbb6c69
bb9eeeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367ed1b
9398c57
0463759
03e9573
bb9eeeb
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pdfplumber
import json
import re
from fastapi import UploadFile
from datetime import date

# Path of the JSON file whose entries are used as {label text found in the PDF: output field name}
# by extract_fields_from_pdf. It is relative, so it is resolved against the current working directory.
mapping_file = "field_mapping.json"

def load_field_mapping(mapping_file):
    """Read the JSON document stored at *mapping_file* and return it decoded.

    The file is opened as UTF-8 text. Whatever top-level JSON value it holds
    is returned unchanged.
    """
    with open(mapping_file, encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

def _parse_field_value(field, value, numeric_fields, single_word_fields):
    """Reduce the raw text following a label to the value stored for *field*.

    Args:
        field: Output field name the label maps to.
        value: Stripped text taken from the matching line.
        numeric_fields: Field names whose value is the first number in *value*.
        single_word_fields: Field names whose value is the first word of *value*.

    Returns:
        The extracted string, or *value* unchanged when no pattern applies.
    """
    if field in numeric_fields:
        # The decimal part must be tried before the optional percent sign.
        # With the alternation `\d+%?|\d+(?:[.,]\d+)?` the first branch always
        # matched first, so "12,5%" was truncated to "12".
        match = re.search(r"\d+(?:[.,]\d+)?%?", value)
        return match.group() if match else value

    if field in single_word_fields:
        return value.split()[0] if value else value

    # Cut the value right after the first company-form suffix (KFT/ZRT/BT),
    # dropping whatever trails it on the same line.
    match = re.search(r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', value, re.IGNORECASE)
    return match.group(1).strip() if match else value


def extract_fields_from_pdf(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    The label -> field mapping is loaded from ``mapping_file``. Every text line
    of every page is checked against every label. When a label occurs in a
    line, the text after the first ":" (or the whole line when there is no
    colon) is parsed and stored under the mapped field. A later match for the
    same field overwrites an earlier one.

    Args:
        file: Uploaded file; ``file.file`` is handed to ``pdfplumber.open`` and
            ``file.filename`` is copied into the result.

    Returns:
        A JSON string (indent 2, non-ASCII characters kept) holding one key per
        mapped field (``None`` when not found) plus the metadata keys
        ``STATUS``, ``FILENAME``, ``currency``, ``contractDates`` and
        ``grossprice``.
    """
    mapping = load_field_mapping(mapping_file)

    data = {v: None for v in mapping.values()}

    numeric_fields = {"AGENCY_DISCOUNT", "TAX_NUMBER"}
    single_word_fields = {"CAMPAIGN"}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            for line in text.split('\n'):
                line = line.strip()

                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = _parse_field_value(
                        field, value, numeric_fields, single_word_fields
                    )

    # Metadata is written once, after parsing. Previously these assignments
    # sat inside the per-line loop, so they were repeated for every line and
    # skipped entirely for a PDF without extractable text.
    data["STATUS"] = "OK"
    data["FILENAME"] = file.filename
    # NOTE(review): currency, contract dates and gross price are hard-coded
    # rather than read from the document -- presumably placeholders; confirm.
    data["currency"] = "HUF"
    data["contractDates"] =  f"{date(2025, 10, 3)} | {date(2025, 11, 16)}"
    data["grossprice"] = 33821800

    return json.dumps(data, indent=2, ensure_ascii=False)

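# Usage
#
# NOTE: the commented-out example below does not match the current function:
# extract_fields_from_pdf takes a single UploadFile-like object (it reads
# `.file` and `.filename`) and already returns a JSON string, so it needs
# neither a second argument nor another json.dumps call.

#pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"



#if __name__ == "__main__":
    #extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
    #print(json.dumps(extracted_data, indent=2, ensure_ascii=False))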