# Ayesha-Majeed's picture
# Update app.py
# c03d9b5 verified
import gradio as gr
import pytesseract
import cv2
import pandas as pd
import re
from PIL import Image
import numpy as np
_NOT_FOUND = "Not found"
# Number of values extract_fields() returns; it must equal the number of
# output components wired to the function in the Gradio interface.
_FIELD_COUNT = 8


def _ocr_words(img):
    """Run Tesseract word-level OCR on *img* and drop rows without text.

    Returns the ``image_to_data`` DataFrame restricted to rows whose
    ``text`` cell is neither missing nor whitespace-only.
    """
    words = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    words = words.dropna(subset=["text"])
    # When no text (or only numeric text) is recognised, the column may not
    # be string-typed and the ``.str`` accessor would raise; normalise it.
    # NOTE(review): purely numeric pages may then show tokens like "12.0".
    words = words.assign(text=words["text"].astype(str))
    return words[words["text"].str.strip() != ""]


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: PIL image of the document (as delivered by ``gr.Image`` with
            ``type="pil"``), or ``None`` when nothing was uploaded.

    Returns:
        A list of eight strings, in this order: name, email, phone, date of
        birth, postcode, the value to the right of "CurBase", the value to
        the right of "hourly"/"rate", and the newline-joined function list.
        Fields that cannot be located are reported as "Not found".  On any
        failure the first element is an "Error: ..." message and the
        remaining seven are "Not found", so the list length never changes.
    """
    if image is None:
        return ["Error: no image provided"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)

    try:
        # -------------------- Image Preparation --------------------
        img = np.array(image.convert("RGB"))[:, :, ::-1]  # PIL RGB -> OpenCV BGR
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # THRESH_BINARY gives the same pixels as the previous
        # THRESH_BINARY_INV followed by bitwise_not, in a single step.
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
        )
        pil_img = Image.fromarray(binarized)

        # -------------------- OCR Pass 1: Name by "Title" --------------------
        # The name is taken as the (up to) two words that follow the first
        # token containing "tit" on the same OCR line of the binarized image.
        ocr_df = _ocr_words(pil_img)
        neighbors = []
        title_matches = ocr_df[ocr_df["text"].str.lower().str.contains("tit", na=False)]
        if not title_matches.empty:
            title_info = title_matches.iloc[0]
            if "line_num" in title_info and "block_num" in title_info:
                same_line = (
                    ocr_df[
                        (ocr_df["line_num"] == title_info["line_num"])
                        & (ocr_df["block_num"] == title_info["block_num"])
                    ]
                    .copy()
                    .sort_values(by="left")
                    .reset_index(drop=True)
                )
                tit_indices = same_line[
                    same_line["text"].str.lower().str.contains("tit")
                ].index
                if not tit_indices.empty:
                    idx = tit_indices[0]
                    # Slicing silently stops at the end of the line.
                    neighbors = same_line["text"].iloc[idx + 1: idx + 3].tolist()

        def clean_name(words):
            """Strip leading/trailing non-letters from each word and join them."""
            stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', w) for w in words)
            return ' '.join(w for w in stripped if w)

        name = clean_name(neighbors) if neighbors else _NOT_FOUND

        # -------------------- OCR Pass 2: For Other Fields --------------------
        # A single word-level pass over the unprocessed image serves every
        # remaining lookup.
        ocr_df2 = _ocr_words(image)

        def get_value_next_to(keyword, direction="right", max_dist=200):
            """Return the first word within *max_dist* px right of *keyword*.

            Only ``direction="right"`` is implemented; anything else, or a
            keyword that is not found, yields ``None``.
            """
            match = ocr_df2[ocr_df2['text'].str.lower() == keyword.lower()]
            if match.empty:
                return None
            row = match.iloc[0]
            if 'line_num' not in row or 'left' not in row:
                return None
            if direction != "right":
                return None
            x = row['left']
            # NOTE(review): only line_num is compared here, not block_num as
            # in pass 1 - confirm that this is intended.
            candidates = ocr_df2[
                (ocr_df2['line_num'] == row['line_num'])
                & (ocr_df2['left'] > x)
                & (ocr_df2['left'] < x + max_dist)
            ].sort_values('left')
            return candidates['text'].tolist()[0] if not candidates.empty else None

        text = " ".join(ocr_df2['text'])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

        # A separate pass with page-segmentation mode 6 supplies line-oriented
        # text for the date and postcode patterns.
        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)
        dob = dob_match.group(0) if dob_match else _NOT_FOUND

        # Prefer a 4-digit number that follows a standalone "CH" on the same
        # line; otherwise fall back to the first standalone 4-digit number.
        postcode = None
        for line in raw_text.splitlines():
            ch_hit = re.search(r'\bCH\b.*?(\d{4})(?![\d/])', line)
            if ch_hit:
                postcode = ch_hit.group(1)
                break
        if not postcode:
            matches = re.findall(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)', raw_text)
            postcode = matches[0] if matches else _NOT_FOUND

        # -------------------- Function List Extraction --------------------
        def extract_functions_block():
            """Collect the text lines 10-120 px below the first "function" token."""
            func_match = ocr_df2[ocr_df2['text'].str.lower().str.contains("function")]
            if func_match.empty:
                return []
            base_y = func_match.iloc[0]['top']
            func_words = ocr_df2[
                (ocr_df2['top'] > base_y + 10) & (ocr_df2['top'] < base_y + 120)
            ]
            # Sort by line_num and left to maintain correct reading order.
            # NOTE(review): grouping uses line_num alone - confirm that lines
            # from different OCR blocks cannot collide inside this window.
            func_words = func_words.sort_values(by=["line_num", "left"])
            grouped_lines = func_words.groupby('line_num')['text'].apply(' '.join).tolist()
            cleaned_lines = (
                re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in grouped_lines
            )
            return [line for line in cleaned_lines if len(line) > 1]

        functions = extract_functions_block()

        # -------------------- Final Output --------------------
        return [
            name if name else _NOT_FOUND,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob,
            postcode,
            get_value_next_to("CurBase") or _NOT_FOUND,
            get_value_next_to("hourly") or get_value_next_to("rate") or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        # Top-level boundary for the UI callback: report the problem in the
        # first field and keep the row exactly _FIELD_COUNT values long.
        return [f"Error: {e}"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)
# -------------------- Gradio Interface --------------------
# Two-column layout: upload controls on the left, extracted fields on the right.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Image OCR Field Extractor")
    gr.Markdown("Upload a document image to extract structured data fields.")

    with gr.Row():
        # Left column: image upload, trigger button and a clickable example.
        with gr.Column():
            image_input = gr.Image(label=" Upload Your Document", type="pil")
            submit_btn = gr.Button(" Run Extraction")
            gr.Examples(
                label=" Example Image (Click to load into uploader)",
                inputs=[image_input],
                examples=["example_doc.jpeg"],
            )

        # Right column: one text widget per value returned by extract_fields,
        # created in the same order as the returned list.
        with gr.Column():
            name = gr.Text(label="Name")
            email = gr.Text(label="Email")
            phone = gr.Text(label="Phone")
            dob = gr.Text(label="DOB")
            postcode = gr.Text(label="Postcode")
            prem = gr.Text(label="Prem (CurBase)")
            rate = gr.Text(label="Temp (Hourly Rate)")
            functions = gr.Textbox(lines=4, label="Functions")

    # Clicking the button runs the extraction on the uploaded image and
    # spreads the result across the eight output widgets.
    submit_btn.click(
        fn=extract_fields,
        inputs=image_input,
        outputs=[
            name,
            email,
            phone,
            dob,
            postcode,
            prem,
            rate,
            functions,
        ],
    )

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()