SoumyaJ committed on
Commit
bb9eeeb
verified
1 Parent(s): 7417ed0

Update app_forapi.py

Browse files
Files changed (1) hide show
  1. app_forapi.py +58 -55
app_forapi.py CHANGED
@@ -1,55 +1,58 @@
1
- import pdfplumber
2
- import json
3
- import re
4
- from fastapi import UploadFile
5
-
6
- mapping_file = "field_mapping.json"
7
-
8
- def load_field_mapping(mapping_file):
9
- with open(mapping_file, "r", encoding="utf-8") as f:
10
- return json.load(f)
11
-
12
- def extract_fields_from_pdf(file: UploadFile):
13
- mapping = load_field_mapping(mapping_file)
14
-
15
- data = {v: None for v in mapping.values()}
16
-
17
- numeric_fields = {"AGENCY_DISCOUNT", "TAX_NUMBER"}
18
- single_word_fields = {"CAMPAIGN"}
19
-
20
- with pdfplumber.open(file.file) as pdf:
21
- for page in pdf.pages:
22
- text = page.extract_text()
23
- if not text:
24
- continue
25
-
26
- lines = text.split('\n')
27
- for line in lines:
28
- line = line.strip()
29
-
30
- for label, field in mapping.items():
31
- if label in line:
32
- value = line.split(":", 1)[-1].strip()
33
- if field in numeric_fields:
34
- match = re.search(r"\d+%?|\d+(?:[.,]\d+)?", value)
35
- data[field] = match.group() if match else value
36
- elif field in single_word_fields:
37
- data[field] = value.split()[0] if value else value
38
- else:
39
- match = re.search(r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', value, re.IGNORECASE)
40
- if match:
41
- data[field] = match.group(1).strip()
42
- else:
43
- data[field] = value
44
-
45
- return json.dumps(data, indent=2, ensure_ascii=False)
46
-
47
- # Usage
48
-
49
- #pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
50
-
51
-
52
-
53
- #if __name__ == "__main__":
54
- #extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
55
- #print(json.dumps(extracted_data, indent=2, ensure_ascii=False))
 
 
 
 
1
+ import pdfplumber
2
+ import json
3
+ import re
4
+ from fastapi import UploadFile
5
+
6
+ mapping_file = "field_mapping.json"
7
+
8
def load_field_mapping(mapping_file):
    """Load the label-to-field mapping from a JSON file.

    Args:
        mapping_file: Path of the JSON file to read. The file is opened as
            UTF-8 text.

    Returns:
        The decoded JSON content. Callers in this file use it as a dict
        whose keys are labels searched for in PDF text and whose values
        are output field names.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(mapping_file, "r", encoding="utf-8") as f:
        return json.load(f)
11
+
12
# First number in a value, with an optional decimal part and an optional
# trailing percent sign (e.g. "15", "15%", "12,5%"). The decimal part must be
# in the same alternative as the integer part: with "\d+%?|\d+(?:[.,]\d+)?"
# the first alternative always wins and the decimal branch is never tried.
_NUMERIC_VALUE_RE = re.compile(r"\d+(?:[.,]\d+)?%?")

# Captures the text from the start of the value up to the first KFT/ZRT/BT
# token (these look like company-form suffixes), dropping whatever follows.
_COMPANY_NAME_RE = re.compile(
    r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE
)

# Fields whose value is reduced to the first number found in the text.
_NUMERIC_FIELDS = frozenset({"AGENCY_DISCOUNT", "TAX_NUMBER"})

# Fields whose value is reduced to its first whitespace-separated word.
_SINGLE_WORD_FIELDS = frozenset({"CAMPAIGN"})


def _clean_field_value(field, value):
    """Reduce the raw text found after a label to the value kept for *field*."""
    if field in _NUMERIC_FIELDS:
        match = _NUMERIC_VALUE_RE.search(value)
        return match.group() if match else value
    if field in _SINGLE_WORD_FIELDS:
        return value.split()[0] if value else value
    match = _COMPANY_NAME_RE.search(value)
    return match.group(1).strip() if match else value


def extract_fields_from_pdf(file: UploadFile) -> str:
    """Extract mapped fields from an uploaded PDF and return them as JSON.

    Every text line of every page is checked against each label in the
    mapping loaded from ``mapping_file``. When a label occurs in a line, the
    text after the first ``":"`` (or the whole line if there is no colon) is
    cleaned up and stored under the mapped field name. A later match for the
    same field overwrites an earlier one.

    Args:
        file: The uploaded PDF. Its ``file`` attribute is passed to
            ``pdfplumber.open`` and its ``filename`` attribute is echoed
            back in the result.

    Returns:
        A JSON string (indented, non-ASCII characters preserved) with one
        key per mapped field (``None`` when the label was never found),
        plus ``"STATUS"`` (always ``"OK"``) and ``"FILENAME"``.
    """
    mapping = load_field_mapping(mapping_file)

    # Start with every known field present so the output shape is stable
    # even when a label is missing from the document.
    data = {field: None for field in mapping.values()}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()

                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = _clean_field_value(field, value)

    data["STATUS"] = "OK"
    # UploadFile exposes the client-supplied name as `filename`; the previous
    # `file.fileName` raised AttributeError on every request.
    data["FILENAME"] = file.filename

    return json.dumps(data, indent=2, ensure_ascii=False)
49
+
50
+ # Usage
51
+
52
+ #pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
53
+
54
+
55
+
56
+ #if __name__ == "__main__":
57
+ #extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
58
+ #print(json.dumps(extracted_data, indent=2, ensure_ascii=False))