Spaces:

SoumyaJ
/

ATMOrdersExtraction

Running

App Files Files Community

ATMOrdersExtraction / contractapp_forapi.py

SoumyaJ

Update contractapp_forapi.py

49822c1 verified about 1 month ago

raw

history blame contribute delete

4.18 kB

	import pdfplumber
	import json
	import re
	from fastapi import UploadFile

	# Load your mapping JSON file
	# Parse the field-mapping file once at import time; the result is kept in
	# the module-level ``nested_mapping`` and read later by extract_data().
	with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
	    nested_mapping = json.loads(f.read())

	# Helper: flatten mapping (English path => Hungarian label)
	def flatten_mapping(d, parent_key=''):
	    """Flatten a nested mapping into ``{path: leaf}`` pairs.

	    Dict keys are joined with ``.`` and list elements get an ``[index]``
	    suffix, e.g. ``{"a": {"b": ["x"]}}`` becomes ``{"a.b[0]": "x"}``.
	    Dicts and lists that appear *inside* a list are flattened as well, so
	    every value of the result is a leaf and never a container.

	    Args:
	        d: Dict to flatten.
	        parent_key: Path prefix prepended to every produced key.

	    Returns:
	        A new flat dict; ``d`` itself is not modified.
	    """
	    items = {}

	    def _walk(value, key):
	        # Recurse through containers; anything else is stored as a leaf.
	        if isinstance(value, dict):
	            for child_key, child in value.items():
	                _walk(child, f"{key}.{child_key}" if key else child_key)
	        elif isinstance(value, list):
	            for idx, element in enumerate(value):
	                _walk(element, f"{key}[{idx}]")
	        else:
	            items[key] = value

	    for k, v in d.items():
	        _walk(v, f"{parent_key}.{k}" if parent_key else k)
	    return items

	# Helper: set nested value from flat key
	def set_nested(data, key_path, value):
	    """Store ``value`` in ``data`` at the location named by ``key_path``.

	    ``key_path`` uses ``.`` between dict keys and ``[n]`` for list indices
	    (for example ``"client.contacts[0].name"``). Path segments made only of
	    digits are treated as list indices. Missing intermediate dicts and lists
	    are created on the way down.

	    Args:
	        data: Dict (or list) that is modified in place.
	        key_path: Path string as described above.
	        value: Value to store at the final path segment.

	    Raises:
	        IndexError: If ``key_path`` contains no segments.
	    """
	    # Split on any one of '.', '[' or ']' and drop the empty pieces that
	    # adjacent separators (e.g. "]." ) leave behind.
	    parts = [
	        int(p) if p.isdigit() else p
	        for p in re.split(r'[.\[\]]', key_path)
	        if p != ''
	    ]

	    node = data
	    for pos, part in enumerate(parts[:-1]):
	        # The container to create depends on the segment that follows this
	        # one, looked up by position so repeated names cannot confuse it.
	        new_child = list if isinstance(parts[pos + 1], int) else dict
	        if isinstance(part, int):
	            # Slots skipped over are padded with empty dicts.
	            while len(node) < part:
	                node.append({})
	            if len(node) == part:
	                node.append(new_child())
	            elif node[part] is None:
	                # A None placeholder left by an earlier leaf assignment.
	                node[part] = new_child()
	        elif part not in node:
	            node[part] = new_child()
	        node = node[part]

	    last = parts[-1]
	    if isinstance(last, int):
	        # Pad with None so the target index exists before assignment.
	        while len(node) <= last:
	            node.append(None)
	    node[last] = value

	# Helper: extract value using regex
	def extract_value(label, text, label_list):
	    """Return the value that follows ``label`` in ``text``.

	    Three cases are handled:

	    * ``"Dátum"``: the dates are read from the line *after* a line that
	      contains the label. Two or more ``YYYY-MM-DD`` dates are returned as
	      ``"first | second"``; a single date is returned on its own. Lines are
	      scanned until one yields a date.
	    * ``"Adószám"``: only the run of digits directly after the label is
	      returned when there is one.
	    * Any other label: the rest of the line after the label (an optional
	      colon and whitespace are skipped), cut off at the first other known
	      label that appears in it.

	    Args:
	        label: Label to look for.
	        text: Text to search.
	        label_list: Iterable of all known labels, used to trim a value when
	            the next label shares its line.

	    Returns:
	        The extracted string, or ``None`` when the label is empty, not a
	        string, or no value is found.
	    """
	    # An empty label would match the start of the text; a non-string label
	    # cannot be searched for at all.
	    if not isinstance(label, str) or not label:
	        return None

	    if label == "Dátum":
	        lines = text.splitlines()
	        for line, next_line in zip(lines, lines[1:]):
	            if "Dátum" not in line:
	                continue
	            dates = re.findall(r'\d{4}-\d{2}-\d{2}', next_line)
	            if len(dates) >= 2:
	                return f"{dates[0]} | {dates[1]}"
	            if dates:
	                return dates[0]  # fallback: only one date found
	        return None

	    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
	    if not match:
	        return None

	    if label == "Adószám":
	        # Keep just the number that directly follows the label.
	        num_match = re.search(re.escape(label) + r'[:\s]*([0-9]+)', text)
	        if num_match:
	            return num_match.group(1).strip()

	    value = match.group(1).strip()
	    # Cut the value where another label bleeds into the same line. Empty or
	    # non-string entries are skipped: str.split('') raises ValueError.
	    for other in label_list:
	        if not isinstance(other, str) or not other or other == label:
	            continue
	        if other in value:
	            value = value.split(other)[0].strip()
	    return value


	# Read the PDF
	def read_pdf_text(file: UploadFile):
	    """Return the text of every page of an uploaded PDF, joined by newlines.

	    Args:
	        file: Upload whose ``file`` attribute is handed to
	            ``pdfplumber.open``.

	    Returns:
	        The page texts joined with ``"\\n"``. Pages for which
	        ``extract_text()`` returns a falsy value are left out.
	    """
	    with pdfplumber.open(file.file) as pdf:
	        # Extract each page once; the result is reused for both the filter
	        # and the join instead of extracting every page twice.
	        page_texts = (page.extract_text() for page in pdf.pages)
	        return "\n".join(page_text for page_text in page_texts if page_text)

	# Main logic
	def extract_data(file: UploadFile):
	    """Extract the mapped fields from an uploaded PDF and return them as JSON.

	    The module-level ``nested_mapping`` is flattened, each label is looked up
	    in the PDF text with ``extract_value``, and every truthy hit is written
	    into the result with ``set_nested``. The keys ``STATUS``, ``FILENAME``
	    and ``AGENCY`` are then set on the result.

	    Args:
	        file: Uploaded PDF; ``file.filename`` is copied into the result.

	    Returns:
	        A JSON string with 2-space indentation and non-ASCII characters left
	        unescaped.
	    """
	    label_by_path = flatten_mapping(nested_mapping)
	    document_text = read_pdf_text(file)
	    known_labels = label_by_path.values()

	    extracted = {}
	    for path, label in label_by_path.items():
	        found = extract_value(label, document_text, known_labels)
	        if not found:
	            continue
	        set_nested(extracted, path, found)

	    # Fixed metadata is written last, in this order.
	    extracted.update({
	        "STATUS": "OK",
	        "FILENAME": file.filename,
	        "AGENCY": "Wavemaker Hungary Kft.",
	    })
	    return json.dumps(extracted, ensure_ascii=False, indent=2)

	# Run the pipeline
	#if __name__ == "__main__":
	#pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf" # Change to your actual PDF file path
	#output = extract_data(pdf_path, nested_mapping)

	# Pretty print or save
	#import pprint
	#pprint.pprint(output, sort_dicts=False)

	# Optional: Save to JSON file
	# with open("output.json", "w", encoding="utf-8") as f:
	# json.dump(output, f, ensure_ascii=False, indent=2)