Spaces:

SoumyaJ
/

ATMOrdersExtraction

Sleeping

App Files Files Community

SoumyaJ committed on May 30

Commit

1a7befc

verified ·

1 Parent(s): 2c39fc8

Update contractapp_forapi.py

Browse files

Files changed (1) hide show

contractapp_forapi.py +104 -101

contractapp_forapi.py CHANGED Viewed

@@ -1,101 +1,104 @@
-import pdfplumber
-import json
-import re
-from fastapi import UploadFile
-# Load your mapping JSON file
-with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
-    nested_mapping = json.load(f)
-# Helper: flatten mapping (English path => Hungarian label)
-def flatten_mapping(d, parent_key=''):
-    items = {}
-    for k, v in d.items():
-        new_key = f"{parent_key}.{k}" if parent_key else k
-        if isinstance(v, dict):
-            items.update(flatten_mapping(v, new_key))
-        elif isinstance(v, list):
-            for idx, label in enumerate(v):
-                items[f"{new_key}[{idx}]"] = label
-        else:
-            items[new_key] = v
-    return items
-# Helper: set nested value from flat key
-def set_nested(data, key_path, value):
-    parts = re.split(r'\.|\[|\]', key_path)
-    parts = [p for p in parts if p != '']
-    d = data
-    for part in parts[:-1]:
-        if part.isdigit():
-            part = int(part)
-            while len(d) <= part:
-                d.append({})
-            d = d[part]
-        else:
-            if part not in d:
-                d[part] = [] if parts[parts.index(part)+1].isdigit() else {}
-            d = d[part]
-    last = parts[-1]
-    if last.isdigit():
-        last = int(last)
-        while len(d) <= last:
-            d.append(None)
-        d[last] = value
-    else:
-        d[last] = value
-# Helper: extract value using regex
-def extract_value(label, text, label_list):
-    pattern = re.escape(label) + r'[:\s]*([^\n]+)'
-    match = re.search(pattern, text)
-    if match:
-        value = match.group(1).strip()
-        # Remove any prefixing numbers before the actual label
-        if label == "Adószám":
-            # Extract just the number after the label
-            num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
-            if num_match:
-                return num_match.group(1).strip()
-        # Clean up by removing the next label if it bleeds in
-        for other in label_list:
-            if other != label and other in value:
-                value = value.split(other)[0].strip()
-        return value
-    return None
-# Read the PDF
-def read_pdf_text(file: UploadFile):
-    with pdfplumber.open(file.file) as pdf:
-        text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
-    return text
-# Main logic
-def extract_data(file: UploadFile):
-    flat_map = flatten_mapping(nested_mapping)
-    text = read_pdf_text(file)
-    result = {}
-    for eng_path, hun_label in flat_map.items():
-        val = extract_value(hun_label, text, flat_map.values())
-        if val:
-            set_nested(result, eng_path, val)
-    return json.dumps(result, indent=2, ensure_ascii=False)
-# Run the pipeline
-#if __name__ == "__main__":
-    #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf"  # Change to your actual PDF file path
-    #output = extract_data(pdf_path, nested_mapping)
-    # Pretty print or save
-    #import pprint
-    #pprint.pprint(output, sort_dicts=False)
-    # Optional: Save to JSON file
-    # with open("output.json", "w", encoding="utf-8") as f:
-    #     json.dump(output, f, ensure_ascii=False, indent=2)

+import pdfplumber
+import json
+import re
+from fastapi import UploadFile
+# Load your mapping JSON file
+with open("field_mapping_contract.json", "r", encoding="utf-8") as f:
+    nested_mapping = json.load(f)
+# Helper: flatten mapping (English path => Hungarian label)
def flatten_mapping(d, parent_key=''):
    """Flatten a nested mapping into a single-level ``{path: label}`` dict.

    Dict keys are joined with ``.`` and list positions are written as
    ``[idx]``.  Containers nested inside lists are walked too, so
    ``{"a": [{"b": "x"}]}`` becomes ``{"a[0].b": "x"}`` instead of storing
    the inner dict itself as if it were a label.

    Args:
        d: Mapping to flatten (its ``.items()`` is iterated).
        parent_key: Path prefix prepended to every produced key.

    Returns:
        A new dict mapping each leaf path to its leaf value.
    """
    items = {}

    def _walk(node, path):
        # Recurse through dicts and lists; anything else is a leaf label.
        if isinstance(node, dict):
            for key, child in node.items():
                _walk(child, f"{path}.{key}" if path else key)
        elif isinstance(node, list):
            for idx, child in enumerate(node):
                _walk(child, f"{path}[{idx}]")
        else:
            items[path] = node

    for k, v in d.items():
        _walk(v, f"{parent_key}.{k}" if parent_key else k)
    return items
+# Helper: set nested value from flat key
def set_nested(data, key_path, value):
    """Store ``value`` inside ``data`` at the location named by ``key_path``.

    ``key_path`` uses ``.`` between dict keys and ``[idx]`` for list
    positions (e.g. ``"client.contacts[1].name"``).  Missing intermediate
    containers are created on the way down: a list when the following path
    part is an index, a dict otherwise.

    Args:
        data: Root dict that is modified in place.
        key_path: Flat path string.
        value: Value to store at the final path part.

    Returns:
        None; ``data`` is mutated.
    """
    parts = [p for p in re.split(r'\.|\[|\]', key_path) if p != '']
    d = data
    for pos, part in enumerate(parts[:-1]):
        # Decide the child container from the part that follows *this
        # position*; looking the part up by value breaks when a key name
        # occurs more than once in the path.
        make_child = list if parts[pos + 1].isdigit() else dict
        if part.isdigit():
            idx = int(part)
            is_new = len(d) <= idx
            while len(d) <= idx:
                d.append({})
            # A fresh slot, or a None placeholder left by an earlier leaf
            # write at a higher index, must become the right container.
            if is_new or d[idx] is None:
                d[idx] = make_child()
            d = d[idx]
        else:
            if part not in d:
                d[part] = make_child()
            d = d[part]
    last = parts[-1]
    if last.isdigit():
        idx = int(last)
        # Pad with None so the value lands at exactly the requested index.
        while len(d) <= idx:
            d.append(None)
        d[idx] = value
    else:
        d[last] = value
+# Helper: extract value using regex
def extract_value(label, text, label_list):
    """Return the text that follows ``label`` in ``text``, or None.

    The value is whatever comes after the label (skipping any ``:`` and
    whitespace) up to the end of that line.  If another known label appears
    inside the captured value, the value is cut off just before it.  For the
    label ``"Adószám"`` only the run of digits following the label is
    returned.

    Args:
        label: Label to look for; a non-string or empty label yields None.
        text: Text to search.
        label_list: Iterable of all known labels, used to trim a neighbouring
            label that bled into the captured value.  Entries that are not
            non-empty strings are ignored.

    Returns:
        The extracted string, or None when the label is not found.
    """
    # An empty label would match arbitrary text and a non-string one cannot
    # be escaped, so neither can produce a meaningful value.
    if not isinstance(label, str) or not label:
        return None

    match = re.search(re.escape(label) + r'[:\s]*([^\n]+)', text)
    if not match:
        return None

    if label == "Adószám":
        # Keep just the digits that directly follow the label.
        num_match = re.search(rf"{re.escape(label)}[:\s]*([0-9]+)", text)
        if num_match:
            return num_match.group(1).strip()

    value = match.group(1).strip()
    for other in label_list:
        # "" is contained in every string and str.split("") raises, while
        # a non-string makes the `in` test raise; skip both.
        if not isinstance(other, str) or not other:
            continue
        if other != label and other in value:
            value = value.split(other)[0].strip()
    return value
+# Read the PDF
def read_pdf_text(file: UploadFile):
    """Return the text of every page of the uploaded PDF, joined by newlines.

    Pages for which ``extract_text()`` returns nothing are left out.

    Args:
        file: Uploaded file whose ``.file`` object is handed to pdfplumber.

    Returns:
        A single string with one page's text per newline-separated chunk.
    """
    with pdfplumber.open(file.file) as pdf:
        # Extract each page exactly once; the original ran extract_text()
        # twice per page (once to filter, once to join).
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)
+# Main logic
def extract_data(file: UploadFile):
    """Extract the mapped fields from an uploaded PDF and return them as JSON.

    The module-level ``nested_mapping`` is flattened to ``path -> label``
    pairs, each label is searched for in the PDF text, and every value found
    is stored under its path in a nested result.  ``STATUS`` and ``FILENAME``
    keys are added before serialising.

    Args:
        file: Uploaded PDF.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is).
    """
    flat_map = flatten_mapping(nested_mapping)
    text = read_pdf_text(file)

    # Only non-empty strings can be searched for; other mapping values
    # (numbers, null, "") would make the regex helper raise.
    labels = [lbl for lbl in flat_map.values() if isinstance(lbl, str) and lbl]

    result = {}
    for eng_path, hun_label in flat_map.items():
        if not isinstance(hun_label, str) or not hun_label:
            continue
        val = extract_value(hun_label, text, labels)
        if val:
            set_nested(result, eng_path, val)

    result["STATUS"] = "OK"
    result["FILENAME"] = file.filename
    return json.dumps(result, indent=2, ensure_ascii=False)
+# Run the pipeline
+#if __name__ == "__main__":
+    #pdf_path = "163193_WAVEMAKER_HUNGARY_Order_WMH0245_487-2025_Outlet_Center_Parndo_Parndorf_Radio_Station_Game_atmedia_Kft.pdf"  # Change to your actual PDF file path
+    #output = extract_data(pdf_path, nested_mapping)
+    # Pretty print or save
+    #import pprint
+    #pprint.pprint(output, sort_dicts=False)
+    # Optional: Save to JSON file
+    # with open("output.json", "w", encoding="utf-8") as f:
+    #     json.dump(output, f, ensure_ascii=False, indent=2)