Spaces:
Sleeping
Sleeping
Commit
·
d813a84
1
Parent(s):
4a27fa8
Update
Browse files
- processor.py +8 -38
processor.py
CHANGED
@@ -2,44 +2,14 @@ import layoutparser as lp
|
|
2 |
import pytesseract
|
3 |
from pdf2image import convert_from_path
|
4 |
from PIL import Image
|
5 |
-
import json
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
|
10 |
-
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
|
11 |
-
)
|
12 |
-
|
13 |
-
def load_images(uploaded_file):
|
14 |
-
if uploaded_file.name.endswith(".pdf"):
|
15 |
-
return convert_from_path(uploaded_file)
|
16 |
-
else:
|
17 |
-
return [Image.open(uploaded_file)]
|
18 |
|
19 |
def analyze_layout(image):
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
cropped = block.crop_image(image)
|
27 |
-
text = pytesseract.image_to_string(cropped)
|
28 |
-
blocks.append({
|
29 |
-
"type": block.type,
|
30 |
-
"text": text.strip(),
|
31 |
-
"coords": block.coordinates
|
32 |
-
})
|
33 |
-
return blocks
|
34 |
-
|
35 |
-
def rule_based_kv_extraction(blocks):
|
36 |
-
data = {}
|
37 |
-
for b in blocks:
|
38 |
-
t = b["text"].lower()
|
39 |
-
if "invoice" in t and "number" in t:
|
40 |
-
data["Invoice Number"] = b["text"]
|
41 |
-
elif "total" in t:
|
42 |
-
data["Total Amount"] = b["text"]
|
43 |
-
elif "customer" in t:
|
44 |
-
data["Customer Name"] = b["text"]
|
45 |
-
return data
|
|
|
2 |
import pytesseract
|
3 |
from pdf2image import convert_from_path
|
4 |
from PIL import Image
|
|
|
5 |
|
6 |
+
def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` with pdf2image and return the result.

    This is a thin wrapper: the argument is handed to
    ``pdf2image.convert_from_path`` unchanged and whatever that call
    returns is passed straight back to the caller.
    """
    pages = convert_from_path(pdf_path)
    return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# Lazily-built detector shared by every analyze_layout() call, so the model
# weights are loaded once instead of once per page image.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared PubLayNet EfficientDet detector, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet expects a dict of options here. The Detectron2-style
            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not a valid
            # EfficientDet config; the equivalent score cut-off is
            # "output_confidence_threshold".
            extra_config={"output_confidence_threshold": 0.6},
            # The EfficientDet PubLayNet weights emit 1-based class ids
            # (the 0-based map belongs to the Detectron2 models).
            label_map={1: "Text", 2: "Title", 3: "List", 4: "Table", 5: "Figure"},
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions in ``image`` with the PubLayNet EfficientDet model.

    Args:
        image: The page image to analyse; it is passed unchanged to the
            model's ``detect`` method.

    Returns:
        Whatever ``EfficientDetLayoutModel.detect`` returns for the image,
        with detections below a 0.6 confidence score filtered out and class
        ids mapped to Text / Title / List / Table / Figure.
    """
    return _get_layout_model().detect(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|