import json
import re
from datetime import date

import pdfplumber
from fastapi import UploadFile

# Path of the JSON file mapping PDF label text -> output field name.
mapping_file = "field_mapping.json"

# Fields whose value is reduced to the first numeric token found in it.
_NUMERIC_FIELDS = frozenset({"AGENCY_DISCOUNT", "TAX_NUMBER"})
# Fields whose value is reduced to its first whitespace-separated word.
_SINGLE_WORD_FIELDS = frozenset({"CAMPAIGN"})

# Integer or decimal (dot or comma separator) with an optional trailing "%".
# The decimal part must be tried as part of the same branch: with the former
# pattern r"\d+%?|\d+(?:[.,]\d+)?" the first alternative always won, so
# "12,5%" was truncated to "12".
_NUMERIC_RE = re.compile(r"\d+(?:[.,]\d+)?%?")

# Shortest prefix of the value that ends in a whole-word KFT / ZRT / BT
# (case-insensitive) -- presumably company-form suffixes. Because of the
# trailing \b, a dot after the suffix is kept only when it is directly
# followed by a word character.
_COMPANY_RE = re.compile(
    r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE
)


def load_field_mapping(mapping_file):
    """Load the label -> field-name mapping from a UTF-8 JSON file.

    Args:
        mapping_file: Path of the JSON file to read.

    Returns:
        The decoded JSON content (used by the caller as a dict whose keys
        are label strings and whose values are output field names).
    """
    with open(mapping_file, "r", encoding="utf-8") as f:
        return json.load(f)


def _parse_value(field, value):
    """Normalise the raw text *value* captured for *field*."""
    if field in _NUMERIC_FIELDS:
        match = _NUMERIC_RE.search(value)
        # Fall back to the raw text when no digits are present.
        return match.group() if match else value

    if field in _SINGLE_WORD_FIELDS:
        # An empty value is passed through unchanged.
        return value.split()[0] if value else value

    match = _COMPANY_RE.search(value)
    return match.group(1).strip() if match else value


def extract_fields_from_pdf(file: UploadFile) -> str:
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    Every text line of every page is scanned; whenever a label from the
    mapping occurs in a line, the text after the first ":" of that line (the
    whole line when it has no ":") is normalised and stored under the mapped
    field name. A later matching line overwrites an earlier one.

    Args:
        file: Uploaded PDF; ``file.file`` is handed to ``pdfplumber.open``
            and ``file.filename`` is echoed in the result.

    Returns:
        A JSON string (indent 2, non-ASCII characters kept) holding every
        mapped field (``None`` when not found) plus the ``STATUS``,
        ``FILENAME``, ``currency``, ``contractDates`` and ``grossprice``
        keys.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue

            for raw_line in text.split('\n'):
                line = raw_line.strip()
                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = _parse_value(field, value)

    data["STATUS"] = "OK"
    data["FILENAME"] = file.filename
    # NOTE(review): the three values below are hard-coded constants, not read
    # from the PDF -- confirm whether they are placeholders.
    data["currency"] = "HUF"
    data["contractDates"] = f"{date(2025, 10, 3)} | {date(2025, 11, 16)}"
    data["grossprice"] = 33821800

    return json.dumps(data, indent=2, ensure_ascii=False)


# Usage
#pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
#if __name__ == "__main__":
    #extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
    #print(json.dumps(extracted_data, indent=2, ensure_ascii=False))