Spaces:
Sleeping
Sleeping
Update contractapp_forapi.py
Browse files- contractapp_forapi.py +104 -101
contractapp_forapi.py
CHANGED
@@ -1,101 +1,104 @@
|
|
1 |
-
import pdfplumber
|
2 |
-
import json
|
3 |
-
import re
|
4 |
-
from fastapi import UploadFile
|
5 |
-
|
6 |
-
# Load your mapping JSON file
|
7 |
-
# Load the field mapping once, at import time.  The path is relative, so it is
# resolved against the process's current working directory.
# NOTE(review): presumably maps English field paths to Hungarian labels, as the
# comment on flatten_mapping says -- confirm against the JSON file.
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.load(f)
|
9 |
-
|
10 |
-
# Helper: flatten mapping (English path => Hungarian label)
|
11 |
-
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a flat ``{path: leaf}`` dict.

    Dict keys are joined with dots and list positions are written as
    ``[idx]``; for example ``{"a": {"b": ["x", "y"]}}`` becomes
    ``{"a.b[0]": "x", "a.b[1]": "y"}``.  Dicts and lists nested inside a list
    are flattened recursively too, so every value in the result is a leaf
    rather than a container.

    Args:
        d: Nested dict to flatten.
        parent_key: Path prefix applied to the keys of ``d``; empty at the
            top level.

    Returns:
        A new flat dict mapping each path to its leaf value.  Empty dicts and
        empty lists contribute no entries.
    """
    items = {}

    def _walk(value, path):
        # Containers are descended into; anything else is recorded as a leaf.
        if isinstance(value, dict):
            for key, child in value.items():
                _walk(child, f"{path}.{key}" if path else key)
        elif isinstance(value, list):
            for idx, child in enumerate(value):
                _walk(child, f"{path}[{idx}]")
        else:
            items[path] = value

    for k, v in d.items():
        _walk(v, f"{parent_key}.{k}" if parent_key else k)
    return items
|
23 |
-
|
24 |
-
# Helper: set nested value from flat key
|
25 |
-
def set_nested(data, key_path, value):
    """Store ``value`` inside ``data`` at the location named by ``key_path``.

    ``key_path`` uses the notation produced by ``flatten_mapping``: dict keys
    separated by dots and list positions in square brackets, for example
    ``"buyer.contacts[1].name"``.  Missing intermediate containers are
    created on the way down and lists are padded so the requested index
    exists.

    Args:
        data: Dict to modify in place.
        key_path: Dotted / bracketed path of the slot to assign.
        value: Value to store at that slot.

    Raises:
        IndexError: If ``key_path`` contains no path components.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']

    node = data
    for pos, part in enumerate(parts[:-1]):
        # Look at the *following* component by position.  Searching for it
        # with parts.index(part) finds the first occurrence of the name and
        # picks the wrong container type when a key repeats in the path.
        next_is_index = parts[pos + 1].isdigit()
        if part.isdigit() and isinstance(node, list):
            idx = int(part)
            while len(node) <= idx:
                node.append([] if next_is_index else {})
            if node[idx] is None:
                # Slot was padded with None by an earlier leaf assignment.
                node[idx] = [] if next_is_index else {}
            node = node[idx]
        else:
            if part not in node:
                node[part] = [] if next_is_index else {}
            node = node[part]

    last = parts[-1]
    if last.isdigit() and isinstance(node, list):
        idx = int(last)
        while len(node) <= idx:
            node.append(None)
        node[idx] = value
    else:
        node[last] = value
|
47 |
-
|
48 |
-
# Helper: extract value using regex
|
49 |
-
def extract_value(label, text, label_list):
    """Return the text that follows ``label`` in ``text``, or ``None``.

    The value is whatever comes after the first occurrence of ``label``
    (skipping any colons and whitespace) up to the end of that line.  If
    another known label appears inside the captured text, the value is cut
    off just before it.

    Args:
        label: Label to search for.
        text: Text to search in.
        label_list: Iterable of all known labels, used to trim a neighbouring
            label that bleeds into the captured value.

    Returns:
        The stripped value, or ``None`` when ``label`` is empty, is not a
        string, or is not found followed by any text.  For the tax-number
        label only the leading run of digits is returned.
    """
    # An empty label would match at position 0 and return unrelated text.
    if not label or not isinstance(label, str):
        return None

    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
    if match is None:
        return None

    # Hungarian for "tax number", spelled with escapes so the accented
    # letters survive a wrong-encoding round trip of this file.
    tax_number_label = "Ad\u00f3sz\u00e1m"
    if label == tax_number_label:
        num_match = re.search(re.escape(label) + r'[:\s]*([0-9]+)', text)
        if num_match:
            return num_match.group(1).strip()

    value = match.group(1).strip()
    for other in label_list:
        # Skip empty or non-string entries: str.split("") raises ValueError.
        if not other or not isinstance(other, str) or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()
    return value
|
69 |
-
|
70 |
-
|
71 |
-
# Read the PDF
|
72 |
-
def read_pdf_text(file: UploadFile):
    """Return the text of every page of an uploaded PDF, joined by newlines.

    Args:
        file: Upload whose ``file`` attribute is handed to
            ``pdfplumber.open``.

    Returns:
        The page texts joined with ``"\\n"``.  Pages for which
        ``extract_text()`` returns nothing are left out.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page once; the text is needed both for the emptiness
        # check and for the join.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(chunk for chunk in page_texts if chunk)
    return text
|
76 |
-
|
77 |
-
# Main logic
|
78 |
-
def extract_data(file: UploadFile):
|
79 |
-
flat_map = flatten_mapping(nested_mapping)
|
80 |
-
text = read_pdf_text(file)
|
81 |
-
result = {}
|
82 |
-
|
83 |
-
for eng_path, hun_label in flat_map.items():
|
84 |
-
val = extract_value(hun_label, text, flat_map.values())
|
85 |
-
if val:
|
86 |
-
set_nested(result, eng_path, val)
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
#
|
96 |
-
#
|
97 |
-
|
98 |
-
|
99 |
-
#
|
100 |
-
#
|
101 |
-
|
|
|
|
|
|
|
|
1 |
+
import pdfplumber
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from fastapi import UploadFile
|
5 |
+
|
6 |
+
# Load your mapping JSON file
|
7 |
+
# Load the field mapping once, at import time.  The path is relative, so it is
# resolved against the process's current working directory.
# NOTE(review): presumably maps English field paths to Hungarian labels, as the
# comment on flatten_mapping says -- confirm against the JSON file.
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.load(f)
|
9 |
+
|
10 |
+
# Helper: flatten mapping (English path => Hungarian label)
|
11 |
+
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a flat ``{path: leaf}`` dict.

    Dict keys are joined with dots and list positions are written as
    ``[idx]``; for example ``{"a": {"b": ["x", "y"]}}`` becomes
    ``{"a.b[0]": "x", "a.b[1]": "y"}``.  Dicts and lists nested inside a list
    are flattened recursively too, so every value in the result is a leaf
    rather than a container.

    Args:
        d: Nested dict to flatten.
        parent_key: Path prefix applied to the keys of ``d``; empty at the
            top level.

    Returns:
        A new flat dict mapping each path to its leaf value.  Empty dicts and
        empty lists contribute no entries.
    """
    items = {}

    def _walk(value, path):
        # Containers are descended into; anything else is recorded as a leaf.
        if isinstance(value, dict):
            for key, child in value.items():
                _walk(child, f"{path}.{key}" if path else key)
        elif isinstance(value, list):
            for idx, child in enumerate(value):
                _walk(child, f"{path}[{idx}]")
        else:
            items[path] = value

    for k, v in d.items():
        _walk(v, f"{parent_key}.{k}" if parent_key else k)
    return items
|
23 |
+
|
24 |
+
# Helper: set nested value from flat key
|
25 |
+
def set_nested(data, key_path, value):
    """Store ``value`` inside ``data`` at the location named by ``key_path``.

    ``key_path`` uses the notation produced by ``flatten_mapping``: dict keys
    separated by dots and list positions in square brackets, for example
    ``"buyer.contacts[1].name"``.  Missing intermediate containers are
    created on the way down and lists are padded so the requested index
    exists.

    Args:
        data: Dict to modify in place.
        key_path: Dotted / bracketed path of the slot to assign.
        value: Value to store at that slot.

    Raises:
        IndexError: If ``key_path`` contains no path components.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']

    node = data
    for pos, part in enumerate(parts[:-1]):
        # Look at the *following* component by position.  Searching for it
        # with parts.index(part) finds the first occurrence of the name and
        # picks the wrong container type when a key repeats in the path.
        next_is_index = parts[pos + 1].isdigit()
        if part.isdigit() and isinstance(node, list):
            idx = int(part)
            while len(node) <= idx:
                node.append([] if next_is_index else {})
            if node[idx] is None:
                # Slot was padded with None by an earlier leaf assignment.
                node[idx] = [] if next_is_index else {}
            node = node[idx]
        else:
            if part not in node:
                node[part] = [] if next_is_index else {}
            node = node[part]

    last = parts[-1]
    if last.isdigit() and isinstance(node, list):
        idx = int(last)
        while len(node) <= idx:
            node.append(None)
        node[idx] = value
    else:
        node[last] = value
|
47 |
+
|
48 |
+
# Helper: extract value using regex
|
49 |
+
def extract_value(label, text, label_list):
    """Return the text that follows ``label`` in ``text``, or ``None``.

    The value is whatever comes after the first occurrence of ``label``
    (skipping any colons and whitespace) up to the end of that line.  If
    another known label appears inside the captured text, the value is cut
    off just before it.

    Args:
        label: Label to search for.
        text: Text to search in.
        label_list: Iterable of all known labels, used to trim a neighbouring
            label that bleeds into the captured value.

    Returns:
        The stripped value, or ``None`` when ``label`` is empty, is not a
        string, or is not found followed by any text.  For the tax-number
        label only the leading run of digits is returned.
    """
    # An empty label would match at position 0 and return unrelated text.
    if not label or not isinstance(label, str):
        return None

    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
    if match is None:
        return None

    # Hungarian for "tax number", spelled with escapes so the accented
    # letters survive a wrong-encoding round trip of this file.
    tax_number_label = "Ad\u00f3sz\u00e1m"
    if label == tax_number_label:
        num_match = re.search(re.escape(label) + r'[:\s]*([0-9]+)', text)
        if num_match:
            return num_match.group(1).strip()

    value = match.group(1).strip()
    for other in label_list:
        # Skip empty or non-string entries: str.split("") raises ValueError.
        if not other or not isinstance(other, str) or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()
    return value
|
69 |
+
|
70 |
+
|
71 |
+
# Read the PDF
|
72 |
+
def read_pdf_text(file: UploadFile):
    """Return the text of every page of an uploaded PDF, joined by newlines.

    Args:
        file: Upload whose ``file`` attribute is handed to
            ``pdfplumber.open``.

    Returns:
        The page texts joined with ``"\\n"``.  Pages for which
        ``extract_text()`` returns nothing are left out.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page once; the text is needed both for the emptiness
        # check and for the join.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(chunk for chunk in page_texts if chunk)
    return text
|
76 |
+
|
77 |
+
# Main logic
|
78 |
+
def extract_data(file: UploadFile):
    """Extract all mapped fields from an uploaded PDF and return them as JSON.

    The module-level ``nested_mapping`` is flattened into path/label pairs,
    each label is looked up in the PDF text, and every non-empty hit is
    written into a nested result under its path.  ``STATUS`` and
    ``FILENAME`` entries are added last.

    Args:
        file: Uploaded PDF; its ``filename`` is echoed back in the result.

    Returns:
        The result serialized as an indented JSON string with non-ASCII
        characters kept as-is.
    """
    label_by_path = flatten_mapping(nested_mapping)
    document_text = read_pdf_text(file)

    extracted = {}
    for path, label in label_by_path.items():
        found = extract_value(label, document_text, label_by_path.values())
        if not found:
            # Label absent from the document, or nothing usable after it.
            continue
        set_nested(extracted, path, found)

    extracted["STATUS"] = "OK"
    extracted["FILENAME"] = file.filename
    return json.dumps(extracted, indent=2, ensure_ascii=False)
|
92 |
+
|
93 |
+
# Run the pipeline
|
94 |
+
#if __name__ == "__main__":
|
95 |
+
#pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
|
96 |
+
#output = extract_data(pdf_path, nested_mapping)
|
97 |
+
|
98 |
+
# Pretty print or save
|
99 |
+
#import pprint
|
100 |
+
#pprint.pprint(output, sort_dicts=False)
|
101 |
+
|
102 |
+
# Optional: Save to JSON file
|
103 |
+
# with open("output.json", "w", encoding="utf-8") as f:
|
104 |
+
# json.dump(output, f, ensure_ascii=False, indent=2)
|