Spaces:

resolverkatla
/

Midterm_Task_5

Sleeping

resolverkatla committed on Apr 8

Commit

bcf1f17

1 Parent(s): 93fb443

Update

Files changed (1) hide show

processor.py CHANGED Viewed

@@ -8,16 +8,6 @@ def load_images(uploaded_file):
         return convert_from_path(uploaded_file)
     else:
         return [Image.open(uploaded_file)]
-from processor import (
-    load_images,
-    analyze_layout,
-    extract_text_from_blocks,
-    rule_based_kv_extraction
-)
-def convert_pdf_to_images(pdf_path):
-    return convert_from_path(pdf_path)
 def analyze_layout(image):
     model = lp.EfficientDetLayoutModel(
@@ -26,3 +16,27 @@ def analyze_layout(image):
         label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
     )
     return model.detect(image)

         return convert_from_path(uploaded_file)
     else:
         return [Image.open(uploaded_file)]
 def analyze_layout(image):
     model = lp.EfficientDetLayoutModel(
         label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
     )
     return model.detect(image)
+def extract_text_from_blocks(image, layout):
+    """Run OCR on each block of ``layout`` and collect the results.
+
+    Args:
+        image: The image handed to every block's ``crop_image`` call.
+        layout: Iterable of layout blocks. Each block must provide
+            ``crop_image(image)``, ``type`` and ``coordinates``
+            (presumably the result of ``analyze_layout`` -- TODO confirm
+            against the caller).
+
+    Returns:
+        A list with one dict per block, in iteration order, holding the
+        keys ``"type"``, ``"text"`` (OCR output with leading/trailing
+        whitespace stripped) and ``"coordinates"``.
+    """
+    blocks = []
+    for block in layout:
+        # OCR is run on the block's own crop rather than on the full image.
+        segment_image = block.crop_image(image)
+        text = pytesseract.image_to_string(segment_image)
+        blocks.append({
+            "type": block.type,
+            "text": text.strip(),
+            "coordinates": block.coordinates
+        })
+    return blocks
+def rule_based_kv_extraction(blocks):
+    """Assign block texts to invoice fields by keyword matching.
+
+    Each block's text is lower-cased and searched for the substrings
+    ``"invoice"``, ``"total"`` and ``"customer"``, in that order; the first
+    keyword found decides the field for that block.
+
+    Args:
+        blocks: Iterable of dicts that each have a ``"text"`` string entry.
+
+    Returns:
+        A dict containing any of the keys ``"Invoice Number"``,
+        ``"Total Amount"`` and ``"Customer Name"``. Each value is the full,
+        original-case text of the matching block (no value is parsed out
+        of it). Fields with no matching block are absent.
+    """
+    data = {}
+    for block in blocks:
+        # Match case-insensitively, but store the text with its original casing.
+        text = block["text"].lower()
+        # elif chain: a block fills at most one field, with priority
+        # invoice > total > customer. A later block matching the same
+        # keyword overwrites the value stored by an earlier one.
+        if "invoice" in text:
+            data["Invoice Number"] = block["text"]
+        elif "total" in text:
+            data["Total Amount"] = block["text"]
+        elif "customer" in text:
+            data["Customer Name"] = block["text"]
+    return data