Ayesha-Majeed's picture
Upload app.py
08912f8 verified
raw
history blame
6.89 kB
import gradio as gr
import pytesseract
import cv2
import pandas as pd
import re
from PIL import Image
import numpy as np
_NOT_FOUND = "Not found"

# Number of output components the Gradio interface binds to extract_fields;
# every return path must produce exactly this many values.
_NUM_OUTPUTS = 8

# A four-digit group that follows a standalone "CH" on the same line and is
# not itself followed by another digit or a slash.
_CH_POSTCODE_RE = re.compile(r'\bCH\b.*?(\d{4})(?![\d/])')
# Fallback: any standalone four-digit token.
_BARE_POSTCODE_RE = re.compile(r'(?<!\w)[0-9]{4}(?!\w)')


def _ocr_words(img):
    """Run Tesseract on *img* and return one DataFrame row per non-blank word."""
    words = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    words = words.dropna(subset=["text"])
    # pandas may infer a numeric dtype for the text column (e.g. when every
    # token is a number), which would make the .str accessor raise.
    words = words.assign(text=words["text"].astype(str))
    return words[words["text"].str.strip() != ""]


def _same_line(words, anchor):
    """Return the rows of *words* on the same text line as the row *anchor*.

    ``line_num`` alone is not unique: Tesseract restarts it inside every
    block/paragraph, so all three levels are compared.
    """
    return words[
        (words["block_num"] == anchor["block_num"])
        & (words["par_num"] == anchor["par_num"])
        & (words["line_num"] == anchor["line_num"])
    ]


def _clean_name(tokens):
    """Strip leading/trailing non-letters from each token and join the rest."""
    stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', tok) for tok in tokens)
    return ' '.join(tok for tok in stripped if tok)


def _find_name(words):
    """Return up to two words that follow the first "tit..." token on its line."""
    hits = words[words["text"].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return _NOT_FOUND

    line = _same_line(words, hits.iloc[0]).sort_values(by="left").reset_index(drop=True)
    is_title = line["text"].str.lower().str.contains("tit", na=False).to_numpy()
    if not is_title.any():
        return _NOT_FOUND

    # argmax on a boolean array gives the position of the first True,
    # i.e. the left-most "tit..." token on the line.
    pos = int(is_title.argmax())
    following = line["text"].iloc[pos + 1:pos + 3].tolist()
    return _clean_name(following) or _NOT_FOUND


def _value_right_of(words, keyword, max_dist=200):
    """Return the nearest word right of *keyword* on the same line, or None.

    *keyword* is matched case-insensitively against whole tokens; only words
    whose left edge lies within *max_dist* pixels of the keyword's left edge
    are considered.
    """
    hits = words[words["text"].str.lower() == keyword.lower()]
    if hits.empty:
        return None

    anchor = hits.iloc[0]
    x = anchor["left"]
    line = _same_line(words, anchor)
    candidates = line[(line["left"] > x) & (line["left"] < x + max_dist)]
    if candidates.empty:
        return None
    return candidates.sort_values("left")["text"].iloc[0]


def _find_postcode(raw_text):
    """Return a four-digit postcode found in *raw_text*, or "Not found".

    A number following "CH" on the same line is preferred; otherwise the
    first standalone four-digit token anywhere in the text is used.
    """
    for line in raw_text.splitlines():
        after_ch = _CH_POSTCODE_RE.search(line)
        if after_ch:
            return after_ch.group(1)

    standalone = _BARE_POSTCODE_RE.findall(raw_text)
    return standalone[0] if standalone else _NOT_FOUND


def _extract_functions(words):
    """Return the cleaned text lines located just below the first "function" token."""
    header = words[words["text"].str.lower().str.contains("function", na=False)]
    if header.empty:
        return []

    base_y = header.iloc[0]["top"]
    # Only words whose top edge is 10-120 px below the header are considered.
    below = words[(words["top"] > base_y + 10) & (words["top"] < base_y + 120)]
    if below.empty:
        return []

    # Group by the full block/paragraph/line key so that unrelated lines that
    # happen to share a line_num are not merged; sort by "left" first so the
    # words of each line are joined in reading order.
    line_keys = ["block_num", "par_num", "line_num"]
    below = below.sort_values(by=line_keys + ["left"])
    joined = below.groupby(line_keys)["text"].apply(' '.join).tolist()

    cleaned = (re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in joined)
    return [line for line in cleaned if len(line) > 1]


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: A PIL image (as delivered by ``gr.Image(type="pil")``), or
            ``None`` when nothing was uploaded.

    Returns:
        A list of eight strings, in this order: name, email, phone, date of
        birth, postcode, the value next to "CurBase", the value next to
        "hourly"/"rate", and the newline-joined function list. Fields that
        cannot be located are reported as "Not found". On failure the first
        entry carries an "Error: ..." message and the remaining seven are
        "Not found", so the list length always matches the interface outputs.
    """
    if image is None:
        return ["Error: no image provided"] + [_NOT_FOUND] * (_NUM_OUTPUTS - 1)

    # Broad catch is deliberate: this is the UI boundary, and any failure is
    # reported in the first output field rather than crashing the request.
    try:
        # -------------------- Image Preparation --------------------
        gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)
        # THRESH_BINARY is what THRESH_BINARY_INV followed by bitwise_not yields.
        binarised = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
        )

        # -------------------- OCR Pass 1: Name by "Title" --------------------
        name = _find_name(_ocr_words(Image.fromarray(binarised)))

        # -------------------- OCR Pass 2: For Other Fields --------------------
        words = _ocr_words(image)
        text = " ".join(words["text"])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _extract_functions(words)

        # -------------------- Final Output --------------------
        return [
            name,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob_match.group(0) if dob_match else _NOT_FOUND,
            _find_postcode(raw_text),
            _value_right_of(words, "CurBase") or _NOT_FOUND,
            _value_right_of(words, "hourly") or _value_right_of(words, "rate") or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        return [f"Error: {e}"] + [_NOT_FOUND] * (_NUM_OUTPUTS - 1)
# -------------------- Gradio Interface --------------------
# The output components are listed in the same order as the values in the
# list returned by extract_fields: seven single-line text fields followed by
# a four-line box for the function list.
demo = gr.Interface(
    fn=extract_fields,
    inputs=gr.Image(type="pil"),
    outputs=[
        *(
            gr.Text(label=field_label)
            for field_label in (
                "Name",
                "Email",
                "Phone",
                "DOB",
                "Postcode",
                "Prem (CurBase)",
                "Temp (Hourly Rate)",
            )
        ),
        gr.Textbox(label="Functions", lines=4),
    ],
    title="Image OCR Field Extractor",
    description="Upload a document image to extract structured data fields.",
)


# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()