File size: 4,182 Bytes
1a7befc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c55a16
 
 
 
7058ae0
1138a8f
 
7058ae0
1138a8f
 
7058ae0
 
1138a8f
 
 
7058ae0
37c6455
 
1138a8f
1a7befc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49822c1
1a7befc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import pdfplumber
import json
import re
from fastapi import UploadFile

# Load the nested field-mapping JSON once, at import time.
# The path is relative to the current working directory, so importing this
# module fails if "field_mapping_contract.json" cannot be opened from there.
# NOTE(review): presumably maps English field names to the Hungarian labels
# printed in the PDF (see flatten_mapping) -- confirm against the JSON file.
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.load(f)

# Helper: flatten mapping (English path => Hungarian label)
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a single ``{path: label}`` dict.

    Dict keys are joined with ``.`` and list positions are written as
    ``[n]``, e.g. ``{"a": {"b": "X"}, "c": ["Y"]}`` becomes
    ``{"a.b": "X", "c[0]": "Y"}``. Dicts that appear inside a list are
    flattened as well (``{"c": [{"k": "Y"}]}`` -> ``{"c[0].k": "Y"}``), so
    every value of the result is a leaf label rather than a nested dict.

    Args:
        d: Nested dict to flatten.
        parent_key: Path prefix for the keys of ``d``; callers normally
            leave it empty, it is filled in by the recursion.

    Returns:
        A new flat dict mapping each path string to its leaf value.
    """
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}.{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_mapping(v, new_key))
        elif isinstance(v, list):
            for idx, label in enumerate(v):
                indexed_key = f"{new_key}[{idx}]"
                if isinstance(label, dict):
                    # Recurse so list-of-dict entries yield "key[0].sub"
                    # paths instead of storing a whole dict as a label.
                    items.update(flatten_mapping(label, indexed_key))
                else:
                    items[indexed_key] = label
        else:
            items[new_key] = v
    return items

# Helper: set nested value from flat key
def set_nested(data, key_path, value):
    """Store ``value`` inside ``data`` at the location named by ``key_path``.

    ``key_path`` uses the flat notation of ``flatten_mapping``: dict keys
    joined with ``.`` and list positions written as ``[n]`` (for example
    ``"a.b"`` or ``"c[1]"``). Missing intermediate containers are created
    while walking the path, and lists are padded so the requested index
    exists.

    Args:
        data: Container (normally a dict) that is modified in place.
        key_path: Flat path string.
        value: Value to store at the last path segment.

    Raises:
        IndexError: If ``key_path`` contains no path segments.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    d = data
    for pos, part in enumerate(parts[:-1]):
        # Look ahead by position, not by value: a key name can occur more
        # than once in a path and list.index() would return the first hit.
        next_is_index = parts[pos + 1].isdigit()
        if part.isdigit():
            idx = int(part)
            while len(d) <= idx:
                d.append([] if next_is_index else {})
            if d[idx] is None:
                # The slot was padded with None by an earlier write to a
                # higher index; replace it so the path can continue.
                d[idx] = [] if next_is_index else {}
            d = d[idx]
        else:
            if part not in d:
                d[part] = [] if next_is_index else {}
            d = d[part]
    last = parts[-1]
    if last.isdigit():
        idx = int(last)
        # Pad with None so the target index exists before assignment.
        while len(d) <= idx:
            d.append(None)
        d[idx] = value
    else:
        d[last] = value

# Helper: extract value using regex
def extract_value(label, text, label_list):
    """Find the value printed after ``label`` in ``text``.

    Special cases:
      * ``"Dátum"``: the value is taken from the line *after* the first line
        containing the label. Two or more ``YYYY-MM-DD`` dates are returned
        as ``"first | second"``, a single date as-is, and ``None`` when that
        next line holds no date (later "Dátum" lines are not examined).
      * ``"Adószám"``: only the first run of digits after the label is
        returned when one is present.

    For every other label the rest of the line after the label (and any
    colons/whitespace) is taken, then cut at the first occurrence of any
    other known label that bled into it.

    Args:
        label: Label text to search for. Empty or ``None`` yields ``None``.
        text: Full text to search.
        label_list: Iterable of all known labels, used for bleed clean-up.
            Empty and non-string entries are ignored.

    Returns:
        The extracted string (possibly empty after clean-up), or ``None``
        when nothing was found.
    """
    if not label:
        # An empty label would match arbitrary text; treat it as "no value".
        return None

    if label == "Dátum":
        lines = text.splitlines()
        for i, line in enumerate(lines):
            if "Dátum" not in line:
                continue
            if i + 1 < len(lines):
                dates = re.findall(r'\d{4}-\d{2}-\d{2}', lines[i + 1])
                if len(dates) >= 2:
                    return f"{dates[0]} | {dates[1]}"
                if len(dates) == 1:
                    return dates[0]  # fallback: only one date found
            # Only the first "Dátum" line is considered.
            return None

    pattern = re.escape(label) + r'[:\s]*([^\n]+)'
    match = re.search(pattern, text)
    if not match:
        return None
    value = match.group(1).strip()

    if label == "Adószám":
        # Extract just the number after the label
        num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
        if num_match:
            return num_match.group(1).strip()

    # Clean up by removing the next label if it bleeds in
    for other in label_list:
        # Skip empty / non-string labels: str.split("") raises ValueError
        # and `non_str in value` raises TypeError.
        if not other or not isinstance(other, str):
            continue
        if other != label and other in value:
            value = value.split(other)[0].strip()

    return value


# Read the PDF
def read_pdf_text(file: UploadFile):
    """Return the text of every page of the uploaded PDF, joined by newlines.

    Args:
        file: Uploaded file; its underlying ``file.file`` object is handed
            to ``pdfplumber.open``.

    Returns:
        One string with the page texts separated by ``"\\n"``. Pages whose
        ``extract_text()`` result is falsy are skipped.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page exactly once; the previous form called
        # extract_text() twice per page (filter + value).
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join inside the with-block: the generator is lazy and needs the
        # PDF to still be open.
        text = "\n".join(t for t in page_texts if t)
    return text

# Main logic
def extract_data(file: UploadFile):
    """Extract all mapped fields from an uploaded PDF and return them as JSON.

    The module-level ``nested_mapping`` is flattened, each label is looked up
    in the PDF text, and every truthy value is written into a nested result
    under its path. ``STATUS``, ``FILENAME`` and ``AGENCY`` are then set on
    the top level (overwriting same-named keys, if any).

    Args:
        file: Uploaded PDF.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is).
    """
    label_by_path = flatten_mapping(nested_mapping)
    pdf_text = read_pdf_text(file)
    known_labels = label_by_path.values()

    extracted = {}
    for path, label in label_by_path.items():
        found = extract_value(label, pdf_text, known_labels)
        if not found:
            # Nothing (or only an empty string) was found for this label.
            continue
        set_nested(extracted, path, found)

    extracted.update({
        "STATUS": "OK",
        "FILENAME": file.filename,
        "AGENCY": "Wavemaker Hungary Kft.",
    })

    return json.dumps(extracted, indent=2, ensure_ascii=False)

# Run the pipeline
#if __name__ == "__main__":
    #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf"  # Change to your actual PDF file path
    #output = extract_data(pdf_path, nested_mapping)

    # Pretty print or save
    #import pprint
    #pprint.pprint(output, sort_dicts=False)

    # Optional: Save to JSON file
    # with open("output.json", "w", encoding="utf-8") as f:
    #     json.dump(output, f, ensure_ascii=False, indent=2)