ATMOrdersExtraction / app_forapi.py
SoumyaJ's picture
Update app_forapi.py
0463759 verified
import pdfplumber
import json
import re
from fastapi import UploadFile
from datetime import date
mapping_file = "field_mapping.json"
def load_field_mapping(mapping_file):
    """Read the label-to-field mapping from a UTF-8 encoded JSON file.

    Args:
        mapping_file: Path of the JSON file to read.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.
    """
    with open(mapping_file, mode="r", encoding="utf-8") as handle:
        field_mapping = json.load(handle)
    return field_mapping
# Fields whose stored value is reduced to the first number found in the text.
_NUMERIC_FIELDS = frozenset({"AGENCY_DISCOUNT", "TAX_NUMBER"})
# Fields whose stored value is reduced to the first whitespace-separated word.
_SINGLE_WORD_FIELDS = frozenset({"CAMPAIGN"})

# First integer or decimal number (dot or comma separator), optionally
# followed by a percent sign. The decimal part must be tried together with
# the integer part: with "\d+%?" as a separate first alternative the decimal
# alternative could never win and "12,5%" was cut down to "12".
_NUMERIC_VALUE_RE = re.compile(r"\d+(?:[.,]\d+)?%?")
# Everything from the start of the value up to and including the first
# whole-word KFT, ZRT or BT suffix (case-insensitive).
_COMPANY_NAME_RE = re.compile(
    r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE
)


def _clean_field_value(field, value):
    """Reduce the raw text found after a label to the value stored for *field*."""
    if field in _NUMERIC_FIELDS:
        found = _NUMERIC_VALUE_RE.search(value)
        return found.group() if found else value
    if field in _SINGLE_WORD_FIELDS:
        return value.split()[0] if value else value
    # Any other field: cut the value after a KFT/ZRT/BT suffix when one is
    # present, otherwise keep the text unchanged.
    found = _COMPANY_NAME_RE.search(value)
    return found.group(1).strip() if found else value


def extract_fields_from_pdf(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    Every text line of every page is checked against the labels of the
    mapping loaded from ``mapping_file``. When a label occurs in a line, the
    text after the first ``":"`` (or the whole line when there is no colon)
    is cleaned and stored under the mapped field name. A later occurrence of
    a label overwrites an earlier one.

    Args:
        file: The uploaded PDF. ``file.file`` is handed to ``pdfplumber.open``
            and ``file.filename`` is reported under ``"FILENAME"``.

    Returns:
        A JSON string (indented by 2, non-ASCII characters kept as-is) with
        one key per mapped field (``None`` when the label was never found)
        plus ``STATUS``, ``FILENAME``, ``currency``, ``contractDates`` and
        ``grossprice``.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Page without extractable text: nothing to scan.
                continue
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = _clean_field_value(field, value)

    data["STATUS"] = "OK"
    data["FILENAME"] = file.filename
    # NOTE(review): the three values below are hard-coded literals, not read
    # from the PDF - presumably placeholders; confirm before relying on them.
    data["currency"] = "HUF"
    data["contractDates"] = f"{date(2025, 10, 3)} | {date(2025, 11, 16)}"
    data["grossprice"] = 33821800
    return json.dumps(data, indent=2, ensure_ascii=False)
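# Usage (local smoke test). extract_fields_from_pdf takes a FastAPI UploadFile
# and already returns a JSON string, so the result is printed as-is.
# NOTE(review): the UploadFile keyword arguments below should be checked
# against the installed FastAPI/Starlette version.
#pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
#if __name__ == "__main__":
#    with open(pdf_path, "rb") as pdf_file:
#        extracted_data = extract_fields_from_pdf(UploadFile(file=pdf_file, filename=pdf_path))
#    print(extracted_data)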