resolverkatla committed on
Commit
79fc11d
·
1 Parent(s): a36f637
Files changed (4) hide show
  1. app.py +61 -2
  2. layout_extractor.py +41 -0
  3. processor.py +45 -0
  4. requirements.txt +8 -0
app.py CHANGED
@@ -1,2 +1,61 @@
1
- x = st.slider('Select a value')
2
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front end: upload a document, detect layout, OCR it, show fields.

The script contains two upload-and-analyze flows back to back. Every widget
gets an explicit ``key`` so the flows (and the pages inside each flow) never
collide on Streamlit's auto-generated widget IDs.
"""
import json

import streamlit as st

# NOTE: both local modules export ``analyze_layout`` and
# ``extract_text_from_blocks``. The ``processor`` import comes last, so its
# versions are the ones bound to those names for the whole script.
from layout_extractor import convert_pdf_to_images, analyze_layout, extract_text_from_blocks, extract_key_values
from processor import load_images, analyze_layout, extract_text_from_blocks, rule_based_kv_extraction

st.set_page_config(page_title="Document AI", layout="wide")

# --------------------------------------------------------------------------
# Flow 1: extraction with editable fields (processor helpers).
# --------------------------------------------------------------------------
st.title("🧠 AI-Driven Document Layout & Info Extractor")

# Explicit key: the second flow below creates an uploader with the same label
# and arguments, which would otherwise produce a duplicate widget ID.
uploaded_file = st.file_uploader(
    "Upload a PDF or Image",
    type=["pdf", "png", "jpg", "jpeg"],
    key="extractor_upload",
)

if uploaded_file:
    images = load_images(uploaded_file)
    for i, image in enumerate(images):
        st.subheader(f"Page {i+1}")
        st.image(image, use_column_width=True)

        with st.spinner("Analyzing layout..."):
            layout = analyze_layout(image)
            blocks = extract_text_from_blocks(image, layout)
            kv_data = rule_based_kv_extraction(blocks)

        st.success("Done! Here's what we found:")
        st.json(kv_data)

        st.subheader("✏️ Edit Extracted Fields")
        edited_data = {}
        for key, value in kv_data.items():
            # Page-scoped key: the same field name can appear on several pages.
            edited_data[key] = st.text_input(
                f"{key}", value, key=f"extractor_field_{i}_{key}"
            )

        st.download_button(
            "⬇️ Download JSON",
            data=json.dumps(edited_data, indent=2),
            file_name="extracted_data.json",
            mime="application/json",
            key=f"extractor_download_{i}",
        )

        with st.expander("🔍 All Detected Segments"):
            for b in blocks:
                st.markdown(f"**{b['type']}**: {b['text'][:150]}...")

# --------------------------------------------------------------------------
# Flow 2: read-only analyzer (layout_extractor helpers for PDF conversion and
# key/value extraction).
# NOTE(review): this looks like an earlier version of flow 1 that was left in
# the file; confirm whether both are meant to be shown.
# --------------------------------------------------------------------------
st.title("📄 AI-Driven Document Layout Analyzer")

uploaded_file = st.file_uploader(
    "Upload a PDF or Image",
    type=["pdf", "png", "jpg", "jpeg"],
    key="analyzer_upload",
)

if uploaded_file:
    # Case-insensitive so that e.g. "SCAN.PDF" is not opened as an image.
    if uploaded_file.name.lower().endswith(".pdf"):
        images = convert_pdf_to_images(uploaded_file)
    else:
        from PIL import Image
        images = [Image.open(uploaded_file)]

    for i, image in enumerate(images):
        st.image(image, caption=f"Page {i+1}", use_column_width=True)
        layout = analyze_layout(image)
        blocks = extract_text_from_blocks(image, layout)
        key_values = extract_key_values(blocks)

        st.subheader("Extracted Key Data")
        st.json(key_values)

        st.subheader("All Segments")
        for block in blocks:
            st.markdown(f"**{block['type']}**: {block['text'][:200]}...")

        st.download_button(
            "Download JSON",
            data=json.dumps(key_values, indent=2),
            file_name="extracted_data.json",
            key=f"analyzer_download_{i}",
        )
layout_extractor.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import layoutparser as lp
2
+ import pytesseract
3
+ import json
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image
6
+
7
def convert_pdf_to_images(pdf_path):
    """Render the pages of a PDF as images via pdf2image.

    Args:
        pdf_path: A filesystem path to the PDF, the raw PDF bytes, or a
            binary file-like object (anything exposing ``getvalue()`` or
            ``read()``), such as an in-memory upload buffer.

    Returns:
        The result of pdf2image's conversion for the document.
    """
    if isinstance(pdf_path, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_path)
    elif hasattr(pdf_path, "getvalue"):
        # Prefer getvalue(): it returns the whole buffer regardless of the
        # current read position.
        pdf_bytes = pdf_path.getvalue()
    elif hasattr(pdf_path, "read"):
        pdf_bytes = pdf_path.read()
    else:
        # Plain path (str / os.PathLike): original behaviour.
        return convert_from_path(pdf_path)

    # Local import: only this branch needs it.
    from pdf2image import convert_from_bytes

    return convert_from_bytes(pdf_bytes)
9
+
10
# Lazily-built detector shared by all analyze_layout() calls.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared layout detector, constructing it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
        )
    return _LAYOUT_MODEL


def analyze_layout(image):
    """Detect layout regions in a page image.

    The detector is created once and reused, instead of being rebuilt for
    every page.

    Args:
        image: The page image handed to the model's ``detect`` method.

    Returns:
        The layout object returned by ``model.detect(image)``.
    """
    return _get_layout_model().detect(image)
18
+
19
def extract_text_from_blocks(image, layout):
    """OCR each detected region of a page and describe it as a dict.

    Args:
        image: The page image the layout was detected on.
        layout: Iterable of layout regions; each must provide
            ``crop_image(image)``, ``type`` and ``coordinates``.

    Returns:
        A list with one dict per region, in layout order, holding the
        region's ``"type"``, its stripped OCR ``"text"`` and its
        ``"coordinates"``.
    """
    def describe(region):
        # Crop first, then OCR only the cropped segment.
        ocr_text = pytesseract.image_to_string(region.crop_image(image))
        return {
            "type": region.type,
            "text": ocr_text.strip(),
            "coordinates": region.coordinates,
        }

    return [describe(region) for region in layout]
30
+
31
def extract_key_values(blocks):
    """Pick out invoice, total and customer fields by keyword.

    Each block's text is matched case-insensitively against the keywords
    below, in order; the first keyword found decides the field, and the
    block's original text becomes the value. A later block that maps to the
    same field overwrites the earlier one.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` string.

    Returns:
        A dict with any of the keys ``"invoice_number"``, ``"total_amount"``
        and ``"customer_name"``.
    """
    # (keyword, output field) pairs, in matching priority order.
    rules = (
        ("invoice", "invoice_number"),
        ("total", "total_amount"),
        ("customer", "customer_name"),
    )
    found = {}
    for entry in blocks:
        raw = entry["text"]
        lowered = raw.lower()
        field = next((name for word, name in rules if word in lowered), None)
        if field is not None:
            found[field] = raw
    return found
processor.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import layoutparser as lp
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ from PIL import Image
5
+ import json
6
+
7
# Settings for the layout detector that is built once, at import time, and
# shared by analyze_layout().
_LAYOUT_CONFIG_PATH = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
_LAYOUT_EXTRA_CONFIG = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
_LAYOUT_LABEL_MAP = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

model = lp.Detectron2LayoutModel(
    config_path=_LAYOUT_CONFIG_PATH,
    extra_config=_LAYOUT_EXTRA_CONFIG,
    label_map=_LAYOUT_LABEL_MAP,
)
12
+
13
def load_images(uploaded_file):
    """Turn an uploaded PDF or image file into a list of page images.

    Args:
        uploaded_file: Binary file-like object with a ``name`` attribute
            (for example a Streamlit upload). PDFs are recognised by a
            case-insensitive ``.pdf`` suffix on ``name``.

    Returns:
        For a PDF, the result of pdf2image's conversion of its bytes;
        otherwise a one-element list holding the opened PIL image.
    """
    if uploaded_file.name.lower().endswith(".pdf"):
        # convert_from_path() needs a filesystem path; an upload is an
        # in-memory buffer, so convert from its bytes instead.
        from pdf2image import convert_from_bytes

        if hasattr(uploaded_file, "getvalue"):
            # getvalue() ignores the current read position.
            pdf_bytes = uploaded_file.getvalue()
        else:
            pdf_bytes = uploaded_file.read()
        return convert_from_bytes(pdf_bytes)

    return [Image.open(uploaded_file)]
18
+
19
def analyze_layout(image):
    """Run the module-level layout model on *image* and return its result."""
    return model.detect(image)
22
+
23
def extract_text_from_blocks(image, layout):
    """OCR every layout region of a page.

    Args:
        image: The page image the layout belongs to.
        layout: Iterable of regions exposing ``crop_image(image)``, ``type``
            and ``coordinates``.

    Returns:
        A list of dicts, one per region in layout order, with the keys
        ``"type"``, ``"text"`` (OCR output, stripped) and ``"coords"``.
    """
    def _records():
        for region in layout:
            snippet = region.crop_image(image)
            recognised = pytesseract.image_to_string(snippet)
            yield {
                "type": region.type,
                "text": recognised.strip(),
                "coords": region.coordinates,
            }

    return list(_records())
34
+
35
def rule_based_kv_extraction(blocks):
    """Extract invoice, total and customer fields with keyword rules.

    A block whose lower-cased text contains both "invoice" and "number" is
    stored as "Invoice Number"; otherwise one containing "total" is stored
    as "Total Amount"; otherwise one containing "customer" is stored as
    "Customer Name". The stored value is the block's original text, and a
    later match for the same field replaces an earlier one.

    Args:
        blocks: Iterable of dicts, each with a ``"text"`` string.

    Returns:
        A dict mapping the matched field labels to block text.
    """
    extracted = {}
    for segment in blocks:
        original = segment["text"]
        lowered = original.lower()
        if all(word in lowered for word in ("invoice", "number")):
            label = "Invoice Number"
        elif "total" in lowered:
            label = "Total Amount"
        elif "customer" in lowered:
            label = "Customer Name"
        else:
            continue
        extracted[label] = original
    return extracted
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ layoutparser
3
+ pdf2image
4
+ pytesseract
5
+ transformers
6
+ torch
7
+ Pillow
8
+ opencv-python