# ATMOrdersExtraction / contractapp_forapi.py
# SoumyaJ's picture
# Update contractapp_forapi.py
# 49822c1 verified
import pdfplumber
import json
import re
from fastapi import UploadFile
# Load the field-mapping JSON once, at import time. The path is relative, so
# it is resolved against the process's current working directory; importing
# this module fails if the file is missing or is not valid JSON.
# `nested_mapping` is later flattened by extract_data() via flatten_mapping().
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.load(f)
# Helper: flatten mapping (English path => Hungarian label)
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a single-level ``{path: leaf}`` dict.

    Paths are built by joining dict keys with ``.`` and appending ``[idx]``
    for list positions, e.g. ``{"a": {"b": ["X", "Y"]}}`` becomes
    ``{"a.b[0]": "X", "a.b[1]": "Y"}``. Dicts that appear as list elements
    are flattened too (``{"a": [{"b": "X"}]}`` -> ``{"a[0].b": "X"}``), which
    is the path shape ``set_nested`` is written to consume.

    Args:
        d: Dict to flatten; values may be dicts, lists, or leaf values.
        parent_key: Path prefix prepended to every key of ``d`` (empty at
            the top level).

    Returns:
        A new flat dict mapping each path to its leaf value.
    """
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}.{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_mapping(v, new_key))
        elif isinstance(v, list):
            for idx, element in enumerate(v):
                indexed_key = f"{new_key}[{idx}]"
                if isinstance(element, dict):
                    # A dict inside a list is structure, not a label: keep
                    # descending so every stored value is a real leaf.
                    items.update(flatten_mapping(element, indexed_key))
                else:
                    items[indexed_key] = element
        else:
            items[new_key] = v
    return items
# Helper: set nested value from flat key
def set_nested(data, key_path, value):
    """Store ``value`` inside ``data`` at the location named by ``key_path``.

    ``key_path`` uses ``.`` between dict keys and ``[n]`` for list indices
    (e.g. ``"client.contacts[1].name"``). Missing intermediate dicts and
    lists are created on the way; lists are extended as needed to reach the
    requested index. Any purely numeric component is treated as a list index.

    Args:
        data: Dict that is modified in place.
        key_path: Path string in the format described above.
        value: Value to store at the final path component.

    Returns:
        None. ``data`` is mutated.

    Raises:
        IndexError: If ``key_path`` contains no path components.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    d = data
    for i, part in enumerate(parts[:-1]):
        # The container to create is decided by the component that follows
        # this one *at this position*. Looking the component up by value
        # (parts.index) picks the wrong neighbour when a name repeats in the
        # path, e.g. "a.b.a[0]".
        make_child = list if parts[i + 1].isdigit() else dict
        if part.isdigit():
            idx = int(part)
            while len(d) <= idx:
                d.append(make_child())
            if d[idx] is None:
                # Slot was padded with None by an earlier leaf write.
                d[idx] = make_child()
            d = d[idx]
        else:
            if part not in d:
                d[part] = make_child()
            d = d[part]
    last = parts[-1]
    if last.isdigit():
        last = int(last)
        # Pad with None so the target index exists before assignment.
        while len(d) <= last:
            d.append(None)
    d[last] = value
# Helper: extract value using regex
def extract_value(label, text, label_list):
    """Find the value that follows ``label`` in ``text``.

    Three strategies are used, depending on the label:

    * ``"Dátum"``: for each line containing the label, the *next* line is
      searched for ``YYYY-MM-DD`` dates. Two or more dates are returned as
      ``"first | second"``; a single date is returned as-is.
    * ``"Adószám"``: the run of digits that follows the label is returned.
      If no digits follow, the generic strategy below is used.
    * Anything else: the remainder of the line after the label (optional
      colons/whitespace skipped) is taken, then cut at the first occurrence
      of any other known label that bled into it.

    Args:
        label: Label text to look for.
        text: Full text to search.
        label_list: Iterable of all known labels, used to trim a value that
            runs into the next label. Empty or non-string entries are ignored.

    Returns:
        The extracted string (possibly empty), or ``None`` when the label is
        empty/not a string, the text is empty, or nothing was found.
    """
    # An empty label would match the start of the text and a non-string one
    # cannot be turned into a pattern, so neither can yield a real value.
    if not isinstance(label, str) or not label or not text:
        return None

    if label == "Dátum":
        lines = text.splitlines()
        # Pair each line with its successor; the last line has none, so a
        # label on the final line is skipped automatically.
        for line, next_line in zip(lines, lines[1:]):
            if "Dátum" not in line:
                continue
            dates = re.findall(r'\d{4}-\d{2}-\d{2}', next_line)
            if len(dates) >= 2:
                return f"{dates[0]} | {dates[1]}"
            if dates:
                return dates[0]  # fallback: only one date found
        return None

    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
    if match is None:
        return None

    if label == "Adószám":
        # Keep only the digits directly after the label.
        num_match = re.search(re.escape(label) + r'[:\s]*([0-9]+)', text)
        if num_match:
            return num_match.group(1).strip()

    value = match.group(1).strip()
    # Clean up by removing the next label if it bleeds into the value.
    for other in label_list or ():
        # str.split("") raises ValueError and `in` needs a str operand, so
        # unusable entries are skipped instead of aborting the extraction.
        if not isinstance(other, str) or not other or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()
    return value
# Read the PDF
def read_pdf_text(file: UploadFile):
    """Return the text of every page of an uploaded PDF, joined by newlines.

    Args:
        file: Uploaded PDF; its ``file`` attribute is handed to
            ``pdfplumber.open``.

    Returns:
        One string with the text of each page separated by ``"\\n"``. Pages
        for which ``extract_text()`` returns nothing are left out.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page once; the result is used both to filter out
        # empty pages and as the joined value.
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join inside the `with` so the lazy generator is consumed while the
        # PDF is still open.
        return "\n".join(page_text for page_text in page_texts if page_text)
# Main logic
def extract_data(file: UploadFile):
    """Extract the mapped contract fields from an uploaded PDF as JSON text.

    The module-level ``nested_mapping`` is flattened into ``path -> label``
    pairs; each label is searched for in the PDF text and every non-empty
    hit is written into the result under its path. The fixed keys
    ``STATUS``, ``FILENAME`` and ``AGENCY`` are added last.

    Args:
        file: Uploaded PDF; ``file.filename`` is echoed in the output.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is).
    """
    label_by_path = flatten_mapping(nested_mapping)
    pdf_text = read_pdf_text(file)
    known_labels = label_by_path.values()

    extracted = {}
    for eng_path in label_by_path:
        found = extract_value(label_by_path[eng_path], pdf_text, known_labels)
        if not found:
            # Nothing (or an empty string) was found for this label.
            continue
        set_nested(extracted, eng_path, found)

    extracted.update({
        "STATUS": "OK",
        "FILENAME": file.filename,
        "AGENCY": "Wavemaker Hungary Kft.",
    })
    return json.dumps(extracted, indent=2, ensure_ascii=False)
# Run the pipeline
#if __name__ == "__main__":
#pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
#output = extract_data(pdf_path, nested_mapping)
# Pretty print or save
#import pprint
#pprint.pprint(output, sort_dicts=False)
# Optional: Save to JSON file
# with open("output.json", "w", encoding="utf-8") as f:
# json.dump(output, f, ensure_ascii=False, indent=2)