resolverkatla committed on
Commit
bcf1f17
·
1 Parent(s): 93fb443
Files changed (1) hide show
  1. processor.py +24 -10
processor.py CHANGED
@@ -8,16 +8,6 @@ def load_images(uploaded_file):
8
  return convert_from_path(uploaded_file)
9
  else:
10
  return [Image.open(uploaded_file)]
11
-
12
- from processor import (
13
- load_images,
14
- analyze_layout,
15
- extract_text_from_blocks,
16
- rule_based_kv_extraction
17
- )
18
-
19
- def convert_pdf_to_images(pdf_path):
20
- return convert_from_path(pdf_path)
21
 
22
  def analyze_layout(image):
23
  model = lp.EfficientDetLayoutModel(
@@ -26,3 +16,27 @@ def analyze_layout(image):
26
  label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
27
  )
28
  return model.detect(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  return convert_from_path(uploaded_file)
9
  else:
10
  return [Image.open(uploaded_file)]
 
 
 
 
 
 
 
 
 
 
11
 
12
  def analyze_layout(image):
13
  model = lp.EfficientDetLayoutModel(
 
16
  label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
17
  )
18
  return model.detect(image)
19
+
20
def extract_text_from_blocks(image, layout):
    """Run OCR on every block of a detected page layout.

    Args:
        image: The page image; it is handed unchanged to each block's
            ``crop_image`` method to obtain that block's region.
        layout: Iterable of layout blocks. Each block must expose
            ``crop_image(image)``, ``type`` and ``coordinates``
            (presumably the result of ``analyze_layout`` -- confirm
            against the caller).

    Returns:
        A list with one dict per block, in iteration order, holding:
        ``"type"`` (the block's ``type``), ``"text"`` (the OCR output with
        leading/trailing whitespace stripped) and ``"coordinates"`` (the
        block's ``coordinates``). An empty layout yields an empty list.
    """
    blocks = []
    for block in layout:
        # OCR only the cropped region so each text stays tied to its block.
        segment_image = block.crop_image(image)
        text = pytesseract.image_to_string(segment_image)
        blocks.append(
            {
                "type": block.type,
                "text": text.strip(),
                "coordinates": block.coordinates,
            }
        )
    return blocks
31
+
32
def rule_based_kv_extraction(blocks):
    """Pick key/value fields out of OCR blocks using keyword rules.

    Each block's text is lower-cased and tested for the substrings
    "invoice", "total" and "customer", in that order; only the first
    matching rule applies to a given block. The block's original
    (non-lower-cased) text is stored as the field value.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` string entry.

    Returns:
        A dict containing any of the keys ``"Invoice Number"``,
        ``"Total Amount"`` and ``"Customer Name"`` that were matched.
        When several blocks match the same rule, the last one wins.
        Matching is by plain substring, so e.g. "subtotal" also
        satisfies the "total" rule.
    """
    data = {}
    for block in blocks:
        text = block["text"].lower()
        # Rule order matters: a block mentioning both "invoice" and
        # "total" is recorded only as the invoice number.
        if "invoice" in text:
            data["Invoice Number"] = block["text"]
        elif "total" in text:
            data["Total Amount"] = block["text"]
        elif "customer" in text:
            data["Customer Name"] = block["text"]
    return data