resolverkatla committed on
Commit
d813a84
·
1 Parent(s): 4a27fa8
Files changed (1) hide show
  1. processor.py +8 -38
processor.py CHANGED
@@ -2,44 +2,14 @@ import layoutparser as lp
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
  from PIL import Image
5
- import json
6
 
7
- model = lp.Detectron2LayoutModel(
8
- config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
9
- extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
10
- label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
11
- )
12
-
13
- def load_images(uploaded_file):
14
- if uploaded_file.name.endswith(".pdf"):
15
- return convert_from_path(uploaded_file)
16
- else:
17
- return [Image.open(uploaded_file)]
18
 
19
  def analyze_layout(image):
20
- layout = model.detect(image)
21
- return layout
22
-
23
- def extract_text_from_blocks(image, layout):
24
- blocks = []
25
- for block in layout:
26
- cropped = block.crop_image(image)
27
- text = pytesseract.image_to_string(cropped)
28
- blocks.append({
29
- "type": block.type,
30
- "text": text.strip(),
31
- "coords": block.coordinates
32
- })
33
- return blocks
34
-
35
- def rule_based_kv_extraction(blocks):
36
- data = {}
37
- for b in blocks:
38
- t = b["text"].lower()
39
- if "invoice" in t and "number" in t:
40
- data["Invoice Number"] = b["text"]
41
- elif "total" in t:
42
- data["Total Amount"] = b["text"]
43
- elif "customer" in t:
44
- data["Customer Name"] = b["text"]
45
- return data
 
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
  from PIL import Image
 
5
 
def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` into images.

    Thin wrapper around ``pdf2image.convert_from_path`` that uses the
    library defaults for every rendering option.

    Args:
        pdf_path: Location of the PDF, forwarded unchanged to
            ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns (presumably one PIL image
        per page, in page order -- confirm against the pdf2image docs).
    """
    pages = convert_from_path(pdf_path)
    return pages
 
 
 
 
 
 
 
 
 
8
 
# Detector shared by every analyze_layout() call. It is built lazily so that
# importing this module stays cheap and the weights are loaded only once.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared EfficientDet PubLayNet detector, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.EfficientDetLayoutModel(
            "lp://efficientdet/PubLayNet",
            # EfficientDet expects a dict here. The Detectron2-style
            # ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.6] list is not a valid
            # option for this backend, so the threshold is expressed with
            # the key EfficientDet actually reads.
            extra_config={"output_confidence_threshold": 0.6},
            # The EfficientDet PubLayNet weights use 1-based class ids
            # (unlike the 0-based Detectron2 PubLayNet models).
            label_map={1: "Text", 2: "Title", 3: "List", 4: "Table", 5: "Figure"},
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions in a single page image.

    Runs the shared EfficientDet PubLayNet detector on ``image``, keeping
    detections whose confidence is at least 0.6.

    Args:
        image: The page to analyse, passed unchanged to the model's
            ``detect`` method (presumably a PIL image or numpy array --
            confirm against the layoutparser docs).

    Returns:
        The result of ``model.detect(image)``; detected blocks are typed
        as "Text", "Title", "List", "Table" or "Figure".
    """
    return _get_layout_model().detect(image)