# (Page-scrape residue, not program text) Spaces: Running / Running
import json
import re
from datetime import date

import pdfplumber
from fastapi import UploadFile
# Path of the JSON label-to-field mapping file. It is a relative path, so it
# is resolved against the working directory of the running process.
mapping_file = "field_mapping.json"
def load_field_mapping(mapping_file: str) -> dict:
    """Load a field mapping from a UTF-8 encoded JSON file.

    Args:
        mapping_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document, returned exactly as ``json.load``
        produces it.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit UTF-8 so non-ASCII labels decode the same on every platform.
    with open(mapping_file, "r", encoding="utf-8") as f:
        return json.load(f)
# Fields whose value is reduced to the first number found in the text.
_NUMERIC_FIELDS = frozenset({"AGENCY_DISCOUNT", "TAX_NUMBER"})

# Fields whose value is reduced to its first whitespace-separated token.
_SINGLE_WORD_FIELDS = frozenset({"CAMPAIGN"})

# An integer or decimal ("." or "," separator) with an optional trailing "%".
# The decimal part must be tried before the "%" so "12,5%" is not cut to "12".
_NUMBER_RE = re.compile(r"\d+(?:[.,]\d+)?%?")

# Everything up to and including the first company-form suffix (KFT / ZRT /
# BT, any case), plus the dot directly after it when there is one.
_COMPANY_RE = re.compile(r"^(.*?\b(?:KFT|ZRT|BT)\b\.?)", re.IGNORECASE)


def normalize_field_value(field, value):
    """Reduce the raw text found for *field* to the part that is stored.

    Args:
        field: Output field name the value belongs to.
        value: Text taken from the PDF line, already stripped.

    Returns:
        * numeric fields: the first number in *value* (with its ``%`` sign
          if present), or *value* unchanged when it contains no digit;
        * single-word fields: the first whitespace-separated token;
        * any other field: the text up to and including the first
          KFT/ZRT/BT suffix, or *value* unchanged when no suffix occurs.
    """
    if field in _NUMERIC_FIELDS:
        match = _NUMBER_RE.search(value)
        return match.group() if match else value

    if field in _SINGLE_WORD_FIELDS:
        tokens = value.split()
        return tokens[0] if tokens else value

    match = _COMPANY_RE.match(value)
    return match.group(1).strip() if match else value


def extract_fields_from_pdf(file: "UploadFile") -> str:
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    The label-to-field mapping is loaded from ``mapping_file``. Every text
    line of every page is checked against every label; when a label occurs
    in a line, the text after the first ``:`` (or the whole line when it
    has no colon) is normalised with ``normalize_field_value`` and stored
    under the mapped field. A later match for the same field overwrites an
    earlier one.

    Args:
        file: Uploaded PDF; its ``.file`` stream is handed to pdfplumber
            and its ``.filename`` is copied into the result.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is)
        holding one key per mapped field (``None`` when the label was never
        found) plus the ``STATUS``, ``FILENAME``, ``currency``,
        ``contractDates`` and ``grossprice`` keys.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Page without extractable text: nothing to scan.
                continue
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = normalize_field_value(field, value)

    data["STATUS"] = "OK"
    data["FILENAME"] = file.filename
    # NOTE(review): the three values below are fixed constants, not read from
    # the PDF; they look like placeholders - confirm before relying on them.
    data["currency"] = "HUF"
    data["contractDates"] = f"{date(2025, 10, 3)} | {date(2025, 11, 16)}"
    data["grossprice"] = 33821800
    return json.dumps(data, indent=2, ensure_ascii=False)
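# Usage (stale example, kept for reference).
# NOTE(review): extract_fields_from_pdf takes a single upload object exposing
# `.file` and `.filename`, and it already returns a JSON string, so the call
# and the json.dumps below must be updated before this is re-enabled.
# pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
# if __name__ == "__main__":
#     extracted_data = extract_fields_from_pdf(pdf_path, field_mapping)
#     print(json.dumps(extracted_data, indent=2, ensure_ascii=False))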