Spaces:
Sleeping
Sleeping
Commit
·
bcf1f17
1
Parent(s):
93fb443
Update
Browse files- processor.py +24 -10
processor.py
CHANGED
@@ -8,16 +8,6 @@ def load_images(uploaded_file):
|
|
8 |
return convert_from_path(uploaded_file)
|
9 |
else:
|
10 |
return [Image.open(uploaded_file)]
|
11 |
-
|
12 |
-
from processor import (
|
13 |
-
load_images,
|
14 |
-
analyze_layout,
|
15 |
-
extract_text_from_blocks,
|
16 |
-
rule_based_kv_extraction
|
17 |
-
)
|
18 |
-
|
19 |
-
def convert_pdf_to_images(pdf_path):
|
20 |
-
return convert_from_path(pdf_path)
|
21 |
|
22 |
def analyze_layout(image):
|
23 |
model = lp.EfficientDetLayoutModel(
|
@@ -26,3 +16,27 @@ def analyze_layout(image):
|
|
26 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
27 |
)
|
28 |
return model.detect(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
return convert_from_path(uploaded_file)
|
9 |
else:
|
10 |
return [Image.open(uploaded_file)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def analyze_layout(image):
|
13 |
model = lp.EfficientDetLayoutModel(
|
|
|
16 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
17 |
)
|
18 |
return model.detect(image)
|
19 |
+
|
20 |
+
def extract_text_from_blocks(image, layout):
    """Run OCR over every detected layout block and collect the results.

    For each block in ``layout`` the block's own ``crop_image`` method is
    used to cut its region out of ``image``; the crop is handed to
    ``pytesseract.image_to_string`` and the recognised text is stripped of
    surrounding whitespace.

    Returns:
        A list with one dict per block, in ``layout`` order, holding the
        block's ``type`` under ``"type"``, the stripped OCR text under
        ``"text"`` and the block's ``coordinates`` under ``"coordinates"``.
    """

    def _describe(region):
        # Crop first, then OCR the crop, then read the block's metadata.
        cropped = region.crop_image(image)
        recognized = pytesseract.image_to_string(cropped)
        return {
            "type": region.type,
            "text": recognized.strip(),
            "coordinates": region.coordinates,
        }

    return [_describe(region) for region in layout]
|
31 |
+
|
32 |
+
def rule_based_kv_extraction(blocks):
    """Pull invoice fields out of OCR'd blocks using simple keyword rules.

    Each block is a dict carrying its recognised text under ``"text"``. The
    text is searched case-insensitively for the keywords ``invoice``,
    ``total`` and ``customer``; on a hit the block's original (un-lowered)
    text is stored under ``"Invoice Number"``, ``"Total Amount"`` or
    ``"Customer Name"`` respectively. When several blocks match the same
    keyword, the last one wins.

    Every keyword is tested independently, so a single block that mentions
    more than one keyword fills every matching field instead of only the
    first. Blocks whose ``"text"`` is missing, ``None`` or empty are skipped.

    Args:
        blocks: Iterable of dicts with a ``"text"`` entry; ``None`` is
            treated as no blocks.

    Returns:
        A dict mapping field label to the text of the matching block. Fields
        for which no block matched are absent.
    """
    # (keyword searched in the lowered text, label used in the result)
    rules = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    data = {}
    for block in blocks or ():
        text = block.get("text")
        if not text:
            # Nothing was recognised for this block; no rule can match.
            continue
        lowered = text.lower()
        for keyword, label in rules:
            if keyword in lowered:
                data[label] = text
    return data
|