Spaces:
Running
Running
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pytesseract
|
3 |
+
import cv2
|
4 |
+
import pandas as pd
|
5 |
+
import re
|
6 |
+
from PIL import Image
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
_NOT_FOUND = "Not found"

# Number of values extract_fields returns; it has to equal the number of
# output components the UI wires to the function, on the error path too.
_FIELD_COUNT = 8

# Tesseract numbers lines per paragraph and paragraphs per block, so a line is
# only identified by all three columns together.
_LINE_KEYS = ["block_num", "par_num", "line_num"]


def _binarize_for_ocr(image):
    """Return a black-text-on-white PIL image produced by adaptive thresholding."""
    gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
    )
    return Image.fromarray(binary)


def _ocr_words(image):
    """Run word-level OCR and return a DataFrame holding only non-blank words."""
    words = pytesseract.image_to_data(
        image, output_type=pytesseract.Output.DATAFRAME
    )
    words = words.dropna(subset=["text"])
    # Guard against the text column being inferred as a numeric dtype, which
    # would make the .str accessor raise.
    words = words.assign(text=words["text"].astype(str))
    return words[words["text"].str.strip() != ""]


def _same_line(words, anchor):
    """Return the rows of *words* on the same OCR line as the row *anchor*."""
    mask = (
        (words["block_num"] == anchor["block_num"])
        & (words["par_num"] == anchor["par_num"])
        & (words["line_num"] == anchor["line_num"])
    )
    return words[mask]


def _clean_name(words):
    """Strip leading/trailing non-letters from each word and join the rest."""
    stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', w) for w in words)
    return ' '.join(w for w in stripped if w)


def _extract_name(words):
    """Return up to two words following the first "tit..." label on its line."""
    hits = words[words["text"].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return _NOT_FOUND

    line = (
        _same_line(words, hits.iloc[0])
        .sort_values(by="left")
        .reset_index(drop=True)
    )
    label_positions = line.index[
        line["text"].str.lower().str.contains("tit", na=False)
    ]
    if label_positions.empty:
        return _NOT_FOUND

    start = label_positions[0] + 1
    neighbours = line["text"].iloc[start:start + 2].tolist()
    return _clean_name(neighbours) or _NOT_FOUND


def _value_right_of(words, keyword, max_dist=200):
    """Return the nearest word right of *keyword* on the same line, else None.

    Only words whose left edge lies within *max_dist* pixels of the keyword's
    left edge are considered. The keyword comparison is case-insensitive and
    must match the whole OCR token.
    """
    hits = words[words["text"].str.lower() == keyword.lower()]
    if hits.empty:
        return None

    anchor = hits.iloc[0]
    x = anchor["left"]
    line = _same_line(words, anchor)
    candidates = line[
        (line["left"] > x) & (line["left"] < x + max_dist)
    ].sort_values("left")
    if candidates.empty:
        return None
    return candidates["text"].iloc[0]


def _extract_postcode(raw_text):
    """Return a 4-digit postcode, preferring one that follows a "CH" token."""
    for line in raw_text.splitlines():
        # (?<!\d) stops the match from grabbing the tail of a longer number.
        hit = re.search(r'\bCH\b.*?(?<!\d)(\d{4})(?![\d/])', line)
        if hit:
            return hit.group(1)

    # Blank out dd.mm.yyyy dates first so their year is not taken for a
    # postcode by the generic 4-digit fallback.
    without_dates = re.sub(r'\d{2}\.\d{2}\.\d{4}', ' ', raw_text)
    hit = re.search(r'(?<!\w)[0-9]{4}(?!\w)', without_dates)
    return hit.group(0) if hit else _NOT_FOUND


def _extract_functions(words):
    """Return cleaned text lines found 10-120 px below the "function" label."""
    headers = words[words["text"].str.lower().str.contains("function", na=False)]
    if headers.empty:
        return []

    base_y = headers.iloc[0]["top"]
    below = words[(words["top"] > base_y + 10) & (words["top"] < base_y + 120)]
    if below.empty:
        return []

    # Sort and group by the full line key so lines that merely share a
    # line_num in different blocks are not merged together.
    below = below.sort_values(by=_LINE_KEYS + ["left"])
    lines = below.groupby(_LINE_KEYS)["text"].apply(" ".join).tolist()

    cleaned_lines = (re.sub(r'[^a-zA-Z0-9\s]', '', ln).strip() for ln in lines)
    return [ln for ln in cleaned_lines if len(ln) > 1]


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: A PIL image of the document.

    Returns:
        A list of eight strings, in this order: name, email, phone, date of
        birth, postcode, the value next to "CurBase", the value next to
        "hourly" (or "rate"), and the newline-joined function list. A field
        that cannot be located is reported as "Not found". If anything raises,
        the first element is "Error: <message>" and the remaining seven are
        "Not found", so the list length is the same on every path.
    """
    try:
        # The name is read from a binarized copy of the image; every other
        # field is read from the image as supplied.
        name = _extract_name(_ocr_words(_binarize_for_ocr(image)))

        words = _ocr_words(image)
        text = " ".join(words["text"])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(
            r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text
        )

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _extract_functions(words)

        return [
            name,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob_match.group(0) if dob_match else _NOT_FOUND,
            _extract_postcode(raw_text),
            _value_right_of(words, "CurBase") or _NOT_FOUND,
            _value_right_of(words, "hourly")
            or _value_right_of(words, "rate")
            or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        # UI boundary: report the failure in the first field instead of
        # raising, keeping the same number of values as the success path.
        return [f"Error: {e}"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)
|
159 |
+
|
160 |
+
|
161 |
+
# -------------------- Gradio Interface --------------------
# One uploaded PIL image in; one text component out per value returned by
# extract_fields, listed in the same order as that function's return list.
demo = gr.Interface(
    title="Image OCR Field Extractor",
    description="Upload a document image to extract structured data fields.",
    fn=extract_fields,
    inputs=gr.Image(type="pil"),
    outputs=[
        *(
            gr.Text(label=caption)
            for caption in (
                "Name",
                "Email",
                "Phone",
                "DOB",
                "Postcode",
                "Prem (CurBase)",
                "Temp (Hourly Rate)",
            )
        ),
        # The function list can span several lines, so it gets a taller box.
        gr.Textbox(label="Functions", lines=4),
    ],
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|