Ayesha-Majeed's picture
Update app.py
f4bdaf1 verified
raw
history blame
7.75 kB
import gradio as gr
import pytesseract
import cv2
import pandas as pd
import re
from PIL import Image
import numpy as np
_NOT_FOUND = "Not found"
# extract_fields always returns exactly this many values (one per output
# component of the UI), on the success path and on the error path alike.
_FIELD_COUNT = 8


def _ocr_words(img):
    """Run word-level Tesseract OCR on *img* and return the non-blank word rows.

    The result is the DataFrame produced by ``pytesseract.image_to_data`` with
    rows whose ``text`` is missing or whitespace-only removed.
    """
    words = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    words = words.dropna(subset=["text"]).copy()
    # Force a string column: if every recognised token happens to be numeric,
    # pandas may infer a numeric dtype and the `.str` accessor would raise.
    words["text"] = words["text"].astype(str)
    return words[words["text"].str.strip() != ""]


def _clean_name(words):
    """Strip leading/trailing non-letters from each word and join what is left."""
    stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', w) for w in words)
    return ' '.join(w for w in stripped if w)


def _name_after_title(words):
    """Return the (up to) two words that follow the first "tit..." token on its line.

    Returns an empty string when no such token or no following word exists.
    """
    hits = words[words['text'].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return ""
    anchor = hits.iloc[0]
    if 'line_num' not in anchor or 'block_num' not in anchor:
        return ""

    # All words on the anchor's line, in left-to-right reading order.
    same_line = words[
        (words['line_num'] == anchor['line_num']) &
        (words['block_num'] == anchor['block_num'])
    ].sort_values(by='left').reset_index(drop=True)

    tit_positions = same_line[same_line['text'].str.lower().str.contains("tit")].index
    if tit_positions.empty:
        return ""
    start = tit_positions[0] + 1
    return _clean_name(same_line['text'].iloc[start:start + 2].tolist())


def _value_right_of(words, keyword, max_dist=200):
    """Return the first word to the right of *keyword* on the same line, or None.

    *keyword* is matched case-insensitively against whole tokens; only words
    whose left edge lies within *max_dist* pixels of the keyword's left edge
    are considered.
    """
    hits = words[words['text'].str.lower() == keyword.lower()]
    if hits.empty:
        return None
    anchor = hits.iloc[0]
    if 'line_num' not in anchor or 'left' not in anchor:
        return None

    x = anchor['left']
    # NOTE(review): rows are matched on line_num only (not block_num), as in
    # the original code - confirm this is intended for multi-block pages.
    candidates = words[
        (words['line_num'] == anchor['line_num']) &
        (words['left'] > x) &
        (words['left'] < x + max_dist)
    ].sort_values('left')
    return candidates['text'].iloc[0] if not candidates.empty else None


def _find_postcode(raw_text):
    """Return a 4-digit postcode found in *raw_text*, or "Not found".

    A 4-digit number following a standalone "CH" on the same line wins;
    otherwise the first standalone 4-digit number in the text is used.
    """
    for line in raw_text.splitlines():
        after_ch = re.search(r'\bCH\b.*?(\d{4})(?![\d/])', line)
        if after_ch:
            return after_ch.group(1)
    standalone = re.search(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)', raw_text)
    return standalone.group(0) if standalone else _NOT_FOUND


def _functions_below_heading(words, min_offset=10, max_offset=120):
    """Return the cleaned text lines located just below the first "function" token.

    Words whose ``top`` lies strictly between ``min_offset`` and ``max_offset``
    pixels below the heading are grouped by ``line_num``; each line is reduced
    to letters, digits and whitespace, and lines of one character or fewer are
    dropped.
    """
    hits = words[words['text'].str.lower().str.contains("function")]
    if hits.empty:
        return []
    base_y = hits.iloc[0]['top']
    below = words[
        (words['top'] > base_y + min_offset) & (words['top'] < base_y + max_offset)
    ]
    # Sort by line, then x position, so each joined line reads left to right.
    below = below.sort_values(by=["line_num", "left"])
    lines = below.groupby('line_num')['text'].apply(lambda col: ' '.join(col)).tolist()
    cleaned = (re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in lines)
    return [entry for entry in cleaned if len(entry) > 1]


def extract_fields(image):
    """Extract structured fields from a document image via Tesseract OCR.

    Args:
        image: A PIL image (anything exposing ``convert("RGB")``), or None.

    Returns:
        A list of exactly eight strings, in this order: name, email, phone,
        date of birth, postcode, value next to "CurBase", value next to
        "hourly"/"rate", and the newline-joined function list. Fields that
        cannot be located are reported as "Not found". On failure the first
        element is an "Error: ..." message and the rest are "Not found".
    """
    if image is None:
        # Nothing to process; report it instead of failing on `.convert`.
        return ["Error: no image provided"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)

    try:
        # -------------------- Image Preparation --------------------
        img = np.array(image.convert("RGB"))[:, :, ::-1]  # PIL RGB -> BGR (OpenCV)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 25, 15)
        # Inverting the inverse-thresholded image yields dark text on white.
        binarized = Image.fromarray(cv2.bitwise_not(bw))

        # -------------------- OCR Pass 1: Name by "Title" --------------------
        name = _name_after_title(_ocr_words(binarized))

        # -------------------- OCR Pass 2: For Other Fields --------------------
        # Run once on the original image and reuse for every remaining field.
        words = _ocr_words(image)
        text = " ".join(words['text'])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _functions_below_heading(words)

        # -------------------- Final Output --------------------
        return [
            name if name else _NOT_FOUND,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob_match.group(0) if dob_match else _NOT_FOUND,
            _find_postcode(raw_text),
            _value_right_of(words, "CurBase") or _NOT_FOUND,
            _value_right_of(words, "hourly") or _value_right_of(words, "rate") or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        # Top-level boundary: surface the failure in the first field while
        # keeping the result the same length as the success path.
        return [f"Error: {e}"] + [_NOT_FOUND] * (_FIELD_COUNT - 1)


# -------------------- Gradio Interface --------------------
# Two-column layout: image upload and trigger on the left, one text box per
# extracted field on the right.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Image OCR Field Extractor")
    gr.Markdown("Upload a document image to extract structured data fields.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📤 Upload Your Document")
            submit_btn = gr.Button("🔍 Run Detection")

            # One example image that fills the input when clicked.
            gr.Examples(
                examples=["example_doc.jpeg"],  # Just one image
                inputs=[image_input],
                label="📌 Example Image (Click to load)",
            )

        with gr.Column():
            name = gr.Text(label="Name")
            email = gr.Text(label="Email")
            phone = gr.Text(label="Phone")
            dob = gr.Text(label="DOB")
            postcode = gr.Text(label="Postcode")
            prem = gr.Text(label="Prem (CurBase)")
            rate = gr.Text(label="Temp (Hourly Rate)")
            functions = gr.Textbox(label="Functions", lines=4)

    # Link the button to the extraction function; the eight output components
    # receive the values returned by extract_fields, in this order.
    submit_btn.click(
        fn=extract_fields,
        inputs=image_input,
        outputs=[name, email, phone, dob, postcode, prem, rate, functions],
    )

if __name__ == "__main__":
    demo.launch()