SoumyaJ committed on
Commit
1a7befc
verified
1 Parent(s): 2c39fc8

Update contractapp_forapi.py

Browse files
Files changed (1) hide show
  1. contractapp_forapi.py +104 -101
contractapp_forapi.py CHANGED
@@ -1,101 +1,104 @@
1
- import pdfplumber
2
- import json
3
- import re
4
- from fastapi import UploadFile
5
-
6
- # Load your mapping JSON file
7
- with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
8
- nested_mapping = json.load(f)
9
-
10
- # Helper: flatten mapping (English path => Hungarian label)
11
- def flatten_mapping(d, parent_key=''):
12
- items = {}
13
- for k, v in d.items():
14
- new_key = f"{parent_key}.{k}" if parent_key else k
15
- if isinstance(v, dict):
16
- items.update(flatten_mapping(v, new_key))
17
- elif isinstance(v, list):
18
- for idx, label in enumerate(v):
19
- items[f"{new_key}[{idx}]"] = label
20
- else:
21
- items[new_key] = v
22
- return items
23
-
24
- # Helper: set nested value from flat key
25
- def set_nested(data, key_path, value):
26
- parts = re.split(r'\.|\[|\]', key_path)
27
- parts = [p for p in parts if p != '']
28
- d = data
29
- for part in parts[:-1]:
30
- if part.isdigit():
31
- part = int(part)
32
- while len(d) <= part:
33
- d.append({})
34
- d = d[part]
35
- else:
36
- if part not in d:
37
- d[part] = [] if parts[parts.index(part)+1].isdigit() else {}
38
- d = d[part]
39
- last = parts[-1]
40
- if last.isdigit():
41
- last = int(last)
42
- while len(d) <= last:
43
- d.append(None)
44
- d[last] = value
45
- else:
46
- d[last] = value
47
-
48
- # Helper: extract value using regex
49
- def extract_value(label, text, label_list):
50
- pattern = re.escape(label) + r'[:\s]*([^\n]+)'
51
- match = re.search(pattern, text)
52
- if match:
53
- value = match.group(1).strip()
54
-
55
- # Remove any prefixing numbers before the actual label
56
- if label == "Ad贸sz谩m":
57
- # Extract just the number after the label
58
- num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
59
- if num_match:
60
- return num_match.group(1).strip()
61
-
62
- # Clean up by removing the next label if it bleeds in
63
- for other in label_list:
64
- if other != label and other in value:
65
- value = value.split(other)[0].strip()
66
-
67
- return value
68
- return None
69
-
70
-
71
- # Read the PDF
72
- def read_pdf_text(file: UploadFile):
73
- with pdfplumber.open(file.file) as pdf:
74
- text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
75
- return text
76
-
77
- # Main logic
78
- def extract_data(file: UploadFile):
79
- flat_map = flatten_mapping(nested_mapping)
80
- text = read_pdf_text(file)
81
- result = {}
82
-
83
- for eng_path, hun_label in flat_map.items():
84
- val = extract_value(hun_label, text, flat_map.values())
85
- if val:
86
- set_nested(result, eng_path, val)
87
-
88
- return json.dumps(result, indent=2, ensure_ascii=False)
89
-
90
- # Run the pipeline
91
- #if __name__ == "__main__":
92
- #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
93
- #output = extract_data(pdf_path, nested_mapping)
94
-
95
- # Pretty print or save
96
- #import pprint
97
- #pprint.pprint(output, sort_dicts=False)
98
-
99
- # Optional: Save to JSON file
100
- # with open("output.json", "w", encoding="utf-8") as f:
101
- # json.dump(output, f, ensure_ascii=False, indent=2)
 
 
 
 
1
+ import pdfplumber
2
+ import json
3
+ import re
4
+ from fastapi import UploadFile
5
+
6
+ # Load your mapping JSON file
7
# Read once at import time; the relative path is resolved against the
# process's current working directory.
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.loads(f.read())
9
+
10
+ # Helper: flatten mapping (English path => Hungarian label)
11
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into ``{path: leaf_value}``.

    Dict keys are joined with ".", list positions are written as "[idx]",
    e.g. ``{"a": {"b": ["X", {"c": "Y"}]}}`` becomes
    ``{"a.b[0]": "X", "a.b[1].c": "Y"}``.

    Args:
        d: Dict to flatten. Values may be dicts, lists, or leaf values.
        parent_key: Path prefix for every produced key; empty means no prefix.

    Returns:
        A new flat dict in traversal order. Empty dicts and lists contribute
        no entries.
    """
    flat = {}

    def visit(path, node):
        # Containers are descended into at any depth, including dicts and
        # lists that sit inside a list, so only leaf values end up in `flat`.
        if isinstance(node, dict):
            for key, child in node.items():
                visit(f"{path}.{key}" if path else key, child)
        elif isinstance(node, list):
            for idx, child in enumerate(node):
                visit(f"{path}[{idx}]", child)
        else:
            flat[path] = node

    for k, v in d.items():
        visit(f"{parent_key}.{k}" if parent_key else k, v)
    return flat
23
+
24
+ # Helper: set nested value from flat key
25
def set_nested(data, key_path, value):
    """Store ``value`` in ``data`` at the location named by ``key_path``.

    ``key_path`` uses "." between dict keys and "[n]" for list positions
    (e.g. ``"a.b[2].c"``). Missing intermediate containers are created: a
    list when the following path component is numeric, otherwise a dict.

    Args:
        data: Container to mutate in place (a dict for paths that start with
            a key).
        key_path: Flat path string.
        value: Value to store at the final path component.

    Returns:
        None. ``data`` is modified in place.

    Raises:
        IndexError: If ``key_path`` contains no path components.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    d = data
    for pos, part in enumerate(parts[:-1]):
        # The child type depends on the component that follows *this
        # position*. Looking the name up with parts.index() would find the
        # first occurrence and pick the wrong type when a key name repeats.
        make_child = list if parts[pos + 1].isdigit() else dict
        if part.isdigit():
            idx = int(part)
            while len(d) <= idx:
                d.append(make_child())
            if d[idx] is None:
                # Slot was padded by an earlier write to a higher index.
                d[idx] = make_child()
            d = d[idx]
        else:
            if part not in d:
                d[part] = make_child()
            d = d[part]

    last = parts[-1]
    if last.isdigit():
        idx = int(last)
        # Pad with None so the value lands at exactly the requested index.
        while len(d) <= idx:
            d.append(None)
        d[idx] = value
    else:
        d[last] = value
47
+
48
+ # Helper: extract value using regex
49
def extract_value(label, text, label_list):
    """Return the text that follows ``label`` in ``text``, or ``None``.

    The value is whatever comes after the label and any run of colons or
    whitespace, up to the end of that line. Whitespace includes newlines, so
    a label that ends its own line picks up the next non-blank line.

    Args:
        label: Label to look for; matched literally (it is regex-escaped).
        text: Text to search.
        label_list: Iterable of all known labels. If another label appears
            inside the captured value, the value is cut off just before it.

    Returns:
        The stripped value (which can be an empty string after trimming);
        only the digit run for the tax-number label; or ``None`` when the
        label is empty, is not a string, or does not occur in ``text``.
    """
    # Hungarian "Adószám" (o-acute, a-acute), spelled with escapes so the
    # comparison cannot be broken by a mis-decoded source file.
    tax_number_label = "Ad\u00f3sz\u00e1m"

    # An empty label would reduce the pattern to "first non-blank line" and a
    # non-string label cannot be escaped; neither identifies a field.
    if not isinstance(label, str) or not label:
        return None

    escaped = re.escape(label)
    match = re.search(escaped + r'[:\s]*([^\n]+)', text)
    if match is None:
        return None
    value = match.group(1).strip()

    if label == tax_number_label:
        # Keep only the digits that directly follow the label.
        num_match = re.search(escaped + r'[:\s]*([0-9]+)', text)
        if num_match:
            return num_match.group(1).strip()

    # Cut the value where a different label bleeds into the same line.
    for other in label_list:
        # Empty or non-string entries are skipped: "" is "in" every string
        # and str.split("") raises ValueError.
        if not isinstance(other, str) or not other or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()

    return value
69
+
70
+
71
+ # Read the PDF
72
def read_pdf_text(file: UploadFile):
    """Return the text of every page of an uploaded PDF, joined by newlines.

    Args:
        file: Upload whose ``file`` attribute is passed to
            ``pdfplumber.open``.

    Returns:
        One string with the page texts separated by a newline. Pages for
        which ``extract_text()`` returns nothing are left out.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page exactly once; the generator is consumed by the
        # join below while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)
76
+
77
+ # Main logic
78
def extract_data(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    Every flattened path of ``nested_mapping`` is looked up by its label in
    the PDF text; paths whose lookup yields a truthy value are written into a
    nested result. The keys "STATUS" (always "OK") and "FILENAME" (taken from
    ``file.filename``) are then added at the top level.

    Args:
        file: The uploaded PDF.

    Returns:
        The result serialised as an indented JSON string with non-ASCII
        characters kept as-is.
    """
    label_by_path = flatten_mapping(nested_mapping)
    pdf_text = read_pdf_text(file)
    known_labels = label_by_path.values()

    extracted = {}
    for path, label in label_by_path.items():
        found = extract_value(label, pdf_text, known_labels)
        if not found:
            # Nothing usable for this field; leave it out of the result.
            continue
        set_nested(extracted, path, found)

    extracted["STATUS"] = "OK"
    extracted["FILENAME"] = file.filename

    return json.dumps(extracted, indent=2, ensure_ascii=False)
92
+
93
+ # Run the pipeline
94
+ #if __name__ == "__main__":
95
+ #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
96
+ #output = extract_data(pdf_path, nested_mapping)
97
+
98
+ # Pretty print or save
99
+ #import pprint
100
+ #pprint.pprint(output, sort_dicts=False)
101
+
102
+ # Optional: Save to JSON file
103
+ # with open("output.json", "w", encoding="utf-8") as f:
104
+ # json.dump(output, f, ensure_ascii=False, indent=2)