"""Extract labelled fields from an uploaded PDF into a nested JSON document.

The mapping file ``field_mapping_contract.json`` describes the output
structure: its (nested) keys are the English output paths and its leaf values
are the Hungarian labels searched for in the PDF text.
"""

import json
import re

import pdfplumber
from fastapi import UploadFile

# Load the mapping JSON file once at import time.
with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
    nested_mapping = json.load(f)


def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into ``{path: label}``.

    Nested dict keys are joined with ``.``; elements of a list are addressed
    as ``key[idx]`` and stored as-is (list elements are not recursed into).

    Args:
        d: Nested dict (English keys => Hungarian labels).
        parent_key: Path prefix accumulated during recursion.

    Returns:
        A flat dict mapping each full path to its label.
    """
    items = {}
    for k, v in d.items():
        new_key = f"{parent_key}.{k}" if parent_key else k
        if isinstance(v, dict):
            items.update(flatten_mapping(v, new_key))
        elif isinstance(v, list):
            for idx, label in enumerate(v):
                items[f"{new_key}[{idx}]"] = label
        else:
            items[new_key] = v
    return items


def set_nested(data, key_path, value):
    """Store ``value`` in ``data`` at the location described by ``key_path``.

    ``key_path`` uses the syntax produced by ``flatten_mapping``
    (``a.b[0].c``). Missing intermediate containers are created: a list when
    the following segment is numeric, a dict otherwise. Lists are padded so
    that the requested index exists.

    Args:
        data: Root container, mutated in place.
        key_path: Dotted / indexed path string.
        value: Value to store at the final segment.

    Raises:
        IndexError: If ``key_path`` contains no segments.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    node = data
    for pos, part in enumerate(parts[:-1]):
        # The child's container type is decided by the segment that follows
        # THIS position. Looking the segment up by value (parts.index) picks
        # the wrong neighbour when a name occurs more than once in the path.
        make_child = list if parts[pos + 1].isdigit() else dict
        if part.isdigit():
            index = int(part)
            while len(node) <= index:
                node.append(make_child())
            if node[index] is None:
                # Slot was only padded by an earlier leaf write.
                node[index] = make_child()
            node = node[index]
        else:
            if part not in node:
                node[part] = make_child()
            node = node[part]

    last = parts[-1]
    if last.isdigit():
        index = int(last)
        while len(node) <= index:
            node.append(None)
        node[index] = value
    else:
        node[last] = value


def extract_value(label, text, label_list):
    """Find the value that follows ``label`` in ``text``.

    Special cases:
      * ``"Dátum"``: the ISO dates (``YYYY-MM-DD``) are read from the line
        after a line containing the label; two dates are joined with
        ``" | "``.
      * ``"Adószám"``: only the run of digits following the label is returned
        when one is present.

    For every other label the rest of the line after the label (and an
    optional colon / whitespace) is returned, truncated at the first
    occurrence of any other known label that bled into it.

    Args:
        label: Label to search for.
        text: Full text to search in.
        label_list: Iterable of all known labels, used for truncation.

    Returns:
        The extracted string, or ``None`` when nothing was found.
    """
    # Empty or non-string labels cannot be searched for meaningfully.
    if not isinstance(label, str) or not label:
        return None

    if label == "Dátum":
        lines = text.splitlines()
        for line, next_line in zip(lines, lines[1:]):
            if "Dátum" not in line:
                continue
            dates = re.findall(r'\d{4}-\d{2}-\d{2}', next_line)
            if len(dates) >= 2:
                return f"{dates[0]} | {dates[1]}"
            if len(dates) == 1:
                # Fallback: only one date found.
                return dates[0]
        return None

    pattern = re.escape(label) + r'[:\s]*([^\n]+)'
    match = re.search(pattern, text)
    if not match:
        return None

    if label == "Adószám":
        # Extract just the number after the label.
        num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
        if num_match:
            return num_match.group(1).strip()

    value = match.group(1).strip()
    # Clean up by removing the next label if it bleeds in. Empty labels are
    # skipped: "" is contained in every string and str.split("") raises.
    for other in label_list:
        if not isinstance(other, str) or not other or other == label:
            continue
        if other in value:
            value = value.split(other)[0].strip()
    return value


def read_pdf_text(file: UploadFile):
    """Return the text of all pages of the uploaded PDF, joined by newlines.

    Pages for which no text is extracted are skipped.

    Args:
        file: Uploaded file; its ``file`` attribute is passed to
            ``pdfplumber.open``.

    Returns:
        The concatenated page text.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page once; the generator is consumed inside the
        # ``with`` block, while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)


def extract_data(file: UploadFile):
    """Extract all mapped fields from the uploaded PDF.

    Args:
        file: Uploaded PDF file.

    Returns:
        A JSON string (indented, non-ASCII preserved) holding the extracted
        values in the nested structure of the mapping, plus the ``STATUS``,
        ``FILENAME`` and ``AGENCY`` keys.
    """
    flat_map = flatten_mapping(nested_mapping)
    text = read_pdf_text(file)
    labels = list(flat_map.values())

    result = {}
    for eng_path, hun_label in flat_map.items():
        val = extract_value(hun_label, text, labels)
        if val:
            set_nested(result, eng_path, val)

    result["STATUS"] = "OK"
    result["FILENAME"] = file.filename
    result["AGENCY"] = "Wavemaker Hungary Kft."
    return json.dumps(result, indent=2, ensure_ascii=False)


# Run the pipeline
# NOTE: the sample below is stale - extract_data() takes a single upload
# object (it reads ``file.file`` and ``file.filename``), not a path plus the
# mapping.
#if __name__ == "__main__":
    #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf"  # Change to your actual PDF file path
    #output = extract_data(pdf_path, nested_mapping)

    # Pretty print or save
    #import pprint
    #pprint.pprint(output, sort_dicts=False)

    # Optional: Save to JSON file
    # with open("output.json", "w", encoding="utf-8") as f:
    #     json.dump(output, f, ensure_ascii=False, indent=2)