Spaces:
Sleeping
Sleeping
Commit
·
4a27fa8
1
Parent(s):
44b4160
Update
Browse files
- layout_extractor.py +11 -11
- requirements.txt +1 -3
layout_extractor.py
CHANGED
@@ -1,16 +1,16 @@
|
|
1 |
import layoutparser as lp
|
2 |
import pytesseract
|
3 |
-
import json
|
4 |
from pdf2image import convert_from_path
|
5 |
from PIL import Image
|
6 |
|
7 |
def convert_pdf_to_images(pdf_path):
|
8 |
return convert_from_path(pdf_path)
|
9 |
|
|
|
10 |
def analyze_layout(image):
|
11 |
-
model = lp.
|
12 |
-
|
13 |
-
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.
|
14 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
15 |
)
|
16 |
layout = model.detect(image)
|
@@ -31,11 +31,11 @@ def extract_text_from_blocks(image, layout):
|
|
31 |
def extract_key_values(blocks):
|
32 |
data = {}
|
33 |
for block in blocks:
|
34 |
-
text = block["text"]
|
35 |
-
if "invoice" in text
|
36 |
-
data["
|
37 |
-
elif "total" in text
|
38 |
-
data["
|
39 |
-
elif "customer" in text
|
40 |
-
data["
|
41 |
return data
|
|
|
1 |
import layoutparser as lp
|
2 |
import pytesseract
|
|
|
3 |
from pdf2image import convert_from_path
|
4 |
from PIL import Image
|
5 |
|
6 |
def convert_pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* into images.

    Thin wrapper around ``pdf2image.convert_from_path``; the return value
    is whatever that call produces for the given path.
    """
    return convert_from_path(pdf_path)
|
8 |
|
9 |
+
# ✅ Use EfficientDet instead of Detectron2 for better compatibility
|
10 |
def analyze_layout(image):
|
11 |
+
model = lp.EfficientDetLayoutModel(
|
12 |
+
"lp://efficientdet/PubLayNet",
|
13 |
+
extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6],
|
14 |
label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
|
15 |
)
|
16 |
layout = model.detect(image)
|
|
|
31 |
def extract_key_values(blocks):
    """Collect invoice-related fields from text blocks.

    Each block is indexed with ``block["text"]``. The text is matched
    case-insensitively against the keywords "invoice", "total" and
    "customer", checked in that order; only the first matching keyword
    applies to a given block. The block's original (non-lowercased) text
    is stored under "Invoice Number", "Total Amount" or "Customer Name"
    respectively. If several blocks match the same keyword, the last one
    wins.

    Blocks whose text is missing, ``None`` or not a string are skipped
    instead of raising.

    Returns:
        A dict containing only the fields for which a matching block
        was found.
    """
    # Order matters: a block mentioning both "invoice" and "total" is
    # recorded as the invoice number only.
    keyword_fields = (
        ("invoice", "Invoice Number"),
        ("total", "Total Amount"),
        ("customer", "Customer Name"),
    )

    data = {}
    for block in blocks:
        try:
            raw_text = block["text"]
        except (KeyError, TypeError):
            continue
        if not isinstance(raw_text, str):
            continue

        text = raw_text.lower()
        for keyword, field in keyword_fields:
            if keyword in text:
                data[field] = raw_text
                break
    return data
|
requirements.txt
CHANGED
@@ -1,10 +1,8 @@
|
|
1 |
-
layoutparser[layoutmodels_detectron2] @ git+https://github.com/Layout-Parser/layout-parser.git
|
2 |
-
git+https://github.com/facebookresearch/detectron2.git
|
3 |
streamlit
|
|
|
4 |
pdf2image
|
5 |
pytesseract
|
6 |
transformers
|
7 |
torch
|
8 |
Pillow
|
9 |
opencv-python
|
10 |
-
|
|
|
|
|
|
|
1 |
streamlit
|
2 |
+
layoutparser
|
3 |
pdf2image
|
4 |
pytesseract
|
5 |
transformers
|
6 |
torch
|
7 |
Pillow
|
8 |
opencv-python
|
|