# Spaces: Running (page-header residue from the web export; not part of the program)
import pdfplumber | |
import json | |
import re | |
from fastapi import UploadFile | |
# Read the nested field mapping once, at import time; the extraction
# functions below use it through the module-level name ``nested_mapping``.
with open("field_mapping_contract.json", mode="r", encoding="utf-8") as f:
    nested_mapping = json.loads(f.read())
# Helper: flatten mapping (English path => Hungarian label)
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a single-level ``{path: leaf}`` dict.

    Dict keys are joined with ``.`` and list positions are written as
    ``[idx]``. Dicts found inside lists are flattened as well, so
    ``{"a": {"b": ["x", {"c": "y"}]}}`` becomes
    ``{"a.b[0]": "x", "a.b[1].c": "y"}``.

    Args:
        d: The (possibly nested) dict to flatten.
        parent_key: Path prefix for every produced key; empty at the top level.

    Returns:
        A flat dict mapping each path string to its leaf value.
    """
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}.{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_mapping(v, new_key))
        elif isinstance(v, list):
            for idx, element in enumerate(v):
                indexed_key = f"{new_key}[{idx}]"
                if isinstance(element, dict):
                    # A dict inside a list is a nested record, not a label:
                    # recurse so its leaves get paths like "items[0].name".
                    items.update(flatten_mapping(element, indexed_key))
                else:
                    items[indexed_key] = element
        else:
            items[new_key] = v
    return items
# Helper: set nested value from flat key
def set_nested(data, key_path, value):
    """Store ``value`` in ``data`` at the location described by ``key_path``.

    ``key_path`` uses ``.`` between dict keys and ``[idx]`` for list
    positions (e.g. ``"client.contacts[1].name"``). Missing intermediate
    containers are created on the way down; a segment made only of digits is
    treated as a list index.

    Args:
        data: The dict to modify in place.
        key_path: Flat path string; must contain at least one segment.
        value: The value to store at the final segment.

    Returns:
        None. ``data`` is mutated in place.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    d = data
    # Pair every segment with the one that really follows it. Looking the
    # successor up by value (parts.index) would pick the wrong neighbour
    # whenever a segment name occurs more than once in the path.
    for part, next_part in zip(parts, parts[1:]):
        make_container = list if next_part.isdigit() else dict
        if part.isdigit():
            index = int(part)
            # Grow the list so that position `index` exists.
            while len(d) <= index:
                d.append(make_container())
            d = d[index]
        else:
            if part not in d:
                d[part] = make_container()
            d = d[part]
    last = parts[-1]
    if last.isdigit():
        index = int(last)
        # Pad skipped positions with None so the value lands at its index.
        while len(d) <= index:
            d.append(None)
        d[index] = value
    else:
        d[last] = value
# Helper: extract value using regex
def extract_value(label, text, label_list):
    """Find the value printed after ``label`` in ``text``.

    Three cases are handled:

    * ``"Dátum"``: the dates are read from the line *below* a line that
      contains the label. Two or more ``YYYY-MM-DD`` dates are returned as
      ``"first | second"``, a single date is returned as is. Every line
      containing the label is tried until one yields a date.
    * ``"Adószám"``: only the run of digits following the label is returned.
    * Any other label: the rest of the line after the label (and an optional
      colon/whitespace) is returned, cut off at the first other known label
      that appears in it.

    Args:
        label: The label to look for.
        text: The full text to search.
        label_list: Iterable of all known labels, used to trim a value when
            the next label shares its line.

    Returns:
        The extracted string, or ``None`` when nothing was found or when
        ``label`` is empty or not a string.
    """
    # An empty label would match arbitrary text and a non-string label cannot
    # be escaped into a pattern, so treat both as "nothing to extract".
    if not isinstance(label, str) or not label:
        return None

    if label == "Dátum":
        lines = text.splitlines()
        # Pair each line with its successor; the last line has none.
        for heading, next_line in zip(lines, lines[1:]):
            if "Dátum" not in heading:
                continue
            dates = re.findall(r'\d{4}-\d{2}-\d{2}', next_line)
            if len(dates) >= 2:
                return f"{dates[0]} | {dates[1]}"
            if dates:
                return dates[0]  # fallback: only one date found
        return None

    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
    if match is None:
        return None
    value = match.group(1).strip()

    if label == "Adószám":
        # Keep just the number that directly follows the label.
        num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
        if num_match:
            return num_match.group(1).strip()

    # Clean up by removing the next label if it bleeds into the value.
    for other in label_list:
        # Skip unusable entries: str.split("") raises ValueError and a
        # non-string cannot be searched for inside a string.
        if not isinstance(other, str) or not other or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()
    return value
# Read the PDF
def read_pdf_text(file: UploadFile) -> str:
    """Return the text of every page of the uploaded PDF, joined by newlines.

    Pages for which ``extract_text()`` returns nothing are left out.

    Args:
        file: The uploaded PDF; its underlying file object (``file.file``)
            is handed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, or an empty string when no page
        produced any text.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page exactly once; calling extract_text() in both the
        # filter and the value position would do the work twice per page.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)
# Main logic
def extract_data(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    The module-level ``nested_mapping`` is flattened into path/label pairs,
    each label is looked up in the PDF text, and every non-empty hit is
    written back into a nested result under its path. ``STATUS``,
    ``FILENAME`` and ``AGENCY`` entries are added before serialising.

    Args:
        file: The uploaded PDF; ``file.filename`` is echoed in the result.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as is).
    """
    label_by_path = flatten_mapping(nested_mapping)
    known_labels = label_by_path.values()
    pdf_text = read_pdf_text(file)

    payload = {}
    for path in label_by_path:
        found = extract_value(label_by_path[path], pdf_text, known_labels)
        if not found:
            # Nothing (or only an empty string) was extracted for this field.
            continue
        set_nested(payload, path, found)

    payload.update(
        {
            "STATUS": "OK",
            "FILENAME": file.filename,
            "AGENCY": "Wavemaker Hungary Kft.",
        }
    )
    return json.dumps(payload, indent=2, ensure_ascii=False)
# Run the pipeline (example, intentionally left commented out)
# NOTE: the sample call below does not match the current signature:
# extract_data() takes a single upload object exposing .file and .filename,
# and it already returns a JSON string rather than a dict.
#if __name__ == "__main__":
#pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
#output = extract_data(pdf_path, nested_mapping)
# Pretty print or save
#import pprint
#pprint.pprint(output, sort_dicts=False)
# Optional: Save to JSON file
# with open("output.json", "w", encoding="utf-8") as f:
# json.dump(output, f, ensure_ascii=False, indent=2)