Spaces:
Sleeping
Sleeping
Update app_forapi.py
Browse files- app_forapi.py +58 -55
app_forapi.py
CHANGED
@@ -1,55 +1,58 @@
|
|
1 |
-
import pdfplumber
|
2 |
-
import json
|
3 |
-
import re
|
4 |
-
from fastapi import UploadFile
|
5 |
-
|
6 |
-
mapping_file = "field_mapping.json"
|
7 |
-
|
8 |
-
def load_field_mapping(mapping_file):
    """Load the label-to-field mapping stored as JSON.

    Args:
        mapping_file: Path of the JSON file; it is read as UTF-8.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(mapping_file, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)
|
11 |
-
|
12 |
-
def extract_fields_from_pdf(file: UploadFile):
    """Extract labelled field values from an uploaded PDF and return them as JSON.

    The label -> field-name mapping is loaded from ``mapping_file``. Each text
    line of each page is tested against every label (substring match). On a
    hit, the text after the first ":" of the line is taken as the raw value and
    normalised by field category:

    * numeric fields keep only the first number (optional decimal part and
      optional trailing "%");
    * single-word fields keep only the first whitespace-separated token;
    * all other fields are cut after the first KFT/ZRT/BT token
      (case-insensitive) when one is present, otherwise kept whole.

    A later occurrence of a label overwrites an earlier one.

    Args:
        file: Uploaded PDF; ``file.file`` is passed to ``pdfplumber.open``.

    Returns:
        A JSON string (indented, non-ASCII preserved) with one key per mapped
        field; fields whose label never appeared are ``null``.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    numeric_fields = {"AGENCY_DISCOUNT", "TAX_NUMBER"}
    single_word_fields = {"CAMPAIGN"}

    # Compiled once instead of on every matching line.
    # The decimal part has to be consumed before the optional "%": the earlier
    # form r"\d+%?|\d+(?:[.,]\d+)?" always stopped at the first digit run, so
    # "12,5%" was reduced to "12".
    number_re = re.compile(r"\d+(?:[.,]\d+)?%?")
    company_re = re.compile(r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE)

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()

                for label, field in mapping.items():
                    if label not in line:
                        continue

                    # Without a ":" the whole line (label included) is the value.
                    value = line.split(":", 1)[-1].strip()
                    if field in numeric_fields:
                        match = number_re.search(value)
                        data[field] = match.group() if match else value
                    elif field in single_word_fields:
                        data[field] = value.split()[0] if value else value
                    else:
                        match = company_re.search(value)
                        data[field] = match.group(1).strip() if match else value

    # Previously the collected values were discarded and callers received None.
    return json.dumps(data, indent=2, ensure_ascii=False)
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
1 |
+
import pdfplumber
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from fastapi import UploadFile
|
5 |
+
|
6 |
+
mapping_file = "field_mapping.json"
|
7 |
+
|
8 |
+
def load_field_mapping(mapping_file):
    """Read the JSON document at *mapping_file* (UTF-8) and return it decoded."""
    with open(mapping_file, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return json.loads(contents)
|
11 |
+
|
12 |
+
def extract_fields_from_pdf(file: UploadFile):
    """Extract labelled field values from an uploaded PDF and return them as JSON.

    The label -> field-name mapping is loaded from ``mapping_file``. Each text
    line of each page is tested against every label (substring match). On a
    hit, the text after the first ":" of the line is taken as the raw value and
    normalised by field category:

    * numeric fields keep only the first number (optional decimal part and
      optional trailing "%");
    * single-word fields keep only the first whitespace-separated token;
    * all other fields are cut after the first KFT/ZRT/BT token
      (case-insensitive) when one is present, otherwise kept whole.

    A later occurrence of a label overwrites an earlier one.

    Args:
        file: Uploaded PDF; ``file.file`` is passed to ``pdfplumber.open`` and
            ``file.filename`` is echoed back in the result.

    Returns:
        A JSON string (indented, non-ASCII preserved) with one key per mapped
        field (``null`` when its label never appeared) plus ``STATUS`` and
        ``FILENAME``.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    numeric_fields = {"AGENCY_DISCOUNT", "TAX_NUMBER"}
    single_word_fields = {"CAMPAIGN"}

    # Compiled once instead of on every matching line.
    # The decimal part has to be consumed before the optional "%": the earlier
    # form r"\d+%?|\d+(?:[.,]\d+)?" always stopped at the first digit run, so
    # "12,5%" was reduced to "12".
    number_re = re.compile(r"\d+(?:[.,]\d+)?%?")
    company_re = re.compile(r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE)

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()

                for label, field in mapping.items():
                    if label not in line:
                        continue

                    # Without a ":" the whole line (label included) is the value.
                    value = line.split(":", 1)[-1].strip()
                    if field in numeric_fields:
                        match = number_re.search(value)
                        data[field] = match.group() if match else value
                    elif field in single_word_fields:
                        data[field] = value.split()[0] if value else value
                    else:
                        match = company_re.search(value)
                        data[field] = match.group(1).strip() if match else value

    data["STATUS"] = "OK"
    # UploadFile exposes the client-supplied name as ``filename``; the former
    # ``file.fileName`` raised AttributeError on every request.
    data["FILENAME"] = file.filename

    return json.dumps(data, indent=2, ensure_ascii=False)
|
49 |
+
|
50 |
+
# Usage
|
51 |
+
|
52 |
+
#pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
#if __name__ == "__main__":
|
57 |
+
#extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
|
58 |
+
#print(json.dumps(extracted_data, indent=2, ensure_ascii=False))
|