Spaces:
Sleeping
Sleeping
File size: 2,240 Bytes
bb9eeeb cbb6c69 bb9eeeb 367ed1b 9398c57 0463759 03e9573 bb9eeeb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import pdfplumber
import json
import re
from fastapi import UploadFile
from datetime import date
# Path of the JSON file that maps label text searched for in the PDF to the
# output field name stored in the result. The path is relative, so it is
# resolved against the working directory of the running process.
mapping_file = "field_mapping.json"
def load_field_mapping(mapping_file):
    """Read the label-to-field mapping from a UTF-8 encoded JSON file.

    Args:
        mapping_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document, whatever top-level value the file holds.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(mapping_file, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# Fields whose value is reduced to the first number found in it.
_NUMERIC_FIELDS = frozenset({"AGENCY_DISCOUNT", "TAX_NUMBER"})
# Fields whose value is reduced to its first whitespace-separated token.
_SINGLE_WORD_FIELDS = frozenset({"CAMPAIGN"})

# The optional decimal part has to be tried before the optional "%": in an
# alternation such as r"\d+%?|\d+(?:[.,]\d+)?" the first branch always wins,
# so a value like "12,5%" would be cut down to "12".
_NUMBER_RE = re.compile(r"\d+(?:[.,]\d+)?%?")
# Shortest prefix of the value that ends in a KFT / ZRT / BT suffix
# (case-insensitive). Because of the trailing \b, a dot after the suffix is
# only kept when a word character follows it directly.
_COMPANY_RE = re.compile(
    r'^(.*?\b(?:KFT\.|KFT|ZRT\.|ZRT|BT\.|BT))\b', re.IGNORECASE
)


def _clean_value(field, value):
    """Normalise the raw text captured for *field* according to its kind."""
    if field in _NUMERIC_FIELDS:
        found = _NUMBER_RE.search(value)
        return found.group() if found else value
    if field in _SINGLE_WORD_FIELDS:
        return value.split()[0] if value else value
    # Every other field is trimmed at a company-form suffix when one occurs.
    found = _COMPANY_RE.search(value)
    return found.group(1).strip() if found else value


def extract_fields_from_pdf(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    Every text line of every page is checked against each label in the
    mapping loaded from ``mapping_file``. When a label occurs in a line, the
    text after the first ":" (or the whole line when there is no colon) is
    cleaned and stored under the mapped field name. If a label occurs on
    several lines, the last occurrence wins.

    Args:
        file: The uploaded PDF. Only ``file.file`` (passed to
            ``pdfplumber.open``) and ``file.filename`` are used.

    Returns:
        A JSON string (2-space indent, non-ASCII characters preserved) with
        one key per mapped field (``None`` when its label was never found)
        plus ``STATUS``, ``FILENAME``, ``currency``, ``contractDates`` and
        ``grossprice``.
    """
    mapping = load_field_mapping(mapping_file)
    data = {field: None for field in mapping.values()}

    with pdfplumber.open(file.file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Pages without extractable text contribute nothing.
                continue
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                for label, field in mapping.items():
                    if label not in line:
                        continue
                    value = line.split(":", 1)[-1].strip()
                    data[field] = _clean_value(field, value)

    data["STATUS"] = "OK"
    data["FILENAME"] = file.filename
    # NOTE(review): the three values below are hard-coded rather than read
    # from the PDF -- confirm whether they are placeholders.
    data["currency"] = "HUF"
    data["contractDates"] = f"{date(2025, 10, 3)} | {date(2025, 11, 16)}"
    data["grossprice"] = 33821800
    return json.dumps(data, indent=2, ensure_ascii=False)
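# Usage (the function takes a single UploadFile-like object exposing `.file`
# and `.filename`, and already returns a JSON string):
#pdf_path = "163900_Suzuki_Rádió Március_megrendelő_R1.pdf"
#if __name__ == "__main__":
#    with open(pdf_path, "rb") as fh:
#        upload = UploadFile(file=fh, filename=pdf_path)
#        print(extract_fields_from_pdf(upload))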
|