"""Gradio app that OCRs a document image and extracts structured fields.

The extraction entry point is :func:`extract_fields`; the Gradio UI at the
bottom of the module wires it to one image input and eight text outputs.
"""

import re

import cv2
import gradio as gr
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image

_NOT_FOUND = "Not found"

# Number of output components bound to extract_fields() in the UI below.
# Gradio requires the handler to return exactly this many values, on the
# success path AND on the error path.
_N_OUTPUTS = 8

_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_PHONE_RE = re.compile(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}')
_DOB_RE = re.compile(r'\d{2}\.\d{2}\.\d{4}')
_CH_POSTCODE_RE = re.compile(r'\bCH\b.*?(\d{4})(?![\d/])')

# NOTE(review): the original fallback postcode pattern was truncated in the
# source (only "r'(?" survived). This pattern is a reconstruction that mirrors
# the look-ahead of the "CH" pattern above -- confirm against the intended one.
_FALLBACK_POSTCODE_RE = re.compile(r'(?<!\d)(\d{4})(?![\d/])')

# NOTE(review): the anchor used to locate the "functions" block was lost
# together with the start of extract_functions_block(); "function" is an
# assumption based on the output label -- confirm.
_FUNCTIONS_ANCHOR = "function"


def _error_row(message):
    """Return an output row carrying *message* in the first field."""
    return [f"Error: {message}"] + [_NOT_FOUND] * (_N_OUTPUTS - 1)


def _binarize(image):
    """Return an adaptively thresholded (black text on white) copy of *image*."""
    gray = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2GRAY)
    # THRESH_BINARY is exactly THRESH_BINARY_INV followed by bitwise_not,
    # which is what the original two-step code produced.
    bw = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
    )
    return Image.fromarray(bw)


def _ocr_words(img):
    """Run word-level OCR on *img* and return the rows with non-blank text."""
    words = pytesseract.image_to_data(
        img, output_type=pytesseract.Output.DATAFRAME
    )
    words = words.dropna(subset=["text"])
    # The frame is parsed from TSV, so a page of purely numeric tokens can
    # yield a numeric column; force str so `.str` and str.join always work.
    words = words.assign(text=words["text"].astype(str))
    return words[words["text"].str.strip() != ""]


def _clean_name(tokens):
    """Strip leading/trailing non-letters from each token and join the rest."""
    stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', t) for t in tokens)
    return ' '.join(t for t in stripped if t)


def _extract_name(words):
    """Return up to two words that follow the first "tit…" token on its line."""
    hits = words[words['text'].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return _NOT_FOUND

    anchor = hits.iloc[0]
    same_line = (
        words[
            (words['line_num'] == anchor['line_num'])
            & (words['block_num'] == anchor['block_num'])
        ]
        .sort_values(by='left')
        .reset_index(drop=True)
    )
    positions = same_line.index[
        same_line['text'].str.lower().str.contains("tit")
    ]
    if positions.empty:
        return _NOT_FOUND

    start = positions[0] + 1
    neighbors = same_line['text'].iloc[start:start + 2].tolist()
    return _clean_name(neighbors) or _NOT_FOUND


def _get_value_next_to(words, keyword, max_dist=200):
    """Return the first word right of *keyword* on the same line, else None.

    *keyword* is matched case-insensitively against whole OCR tokens; the
    candidate must start less than *max_dist* pixels to the right of it.
    """
    if 'line_num' not in words.columns or 'left' not in words.columns:
        return None
    match = words[words['text'].str.lower() == keyword.lower()]
    if match.empty:
        return None

    row = match.iloc[0]
    x = row['left']
    # NOTE(review): as in the original, only line_num is compared (not
    # block_num), so equal line numbers in different blocks can match.
    candidates = words[
        (words['line_num'] == row['line_num'])
        & (words['left'] > x)
        & (words['left'] < x + max_dist)
    ].sort_values('left')
    return None if candidates.empty else candidates['text'].tolist()[0]


def _extract_postcode(raw_text):
    """Return a 4-digit postcode from *raw_text*, or "Not found".

    A number following a standalone "CH" on the same line wins; otherwise the
    first standalone 4-digit number in the text is used.
    """
    # The original also computed `ch_exists = bool(re.search(r'\bCH\b', ...))`;
    # its consumer (if any) was in the lost span, so it is not reproduced.
    for line in raw_text.splitlines():
        found = _CH_POSTCODE_RE.search(line)
        if found:
            return found.group(1)

    # NOTE(review): reconstructed fallback -- see _FALLBACK_POSTCODE_RE.
    fallback = _FALLBACK_POSTCODE_RE.search(raw_text)
    return fallback.group(1) if fallback else _NOT_FOUND


def _extract_functions_block(words):
    """Return the cleaned text lines found just below the functions anchor."""
    # NOTE(review): anchor lookup reconstructed; only the vertical window
    # (base_y + 10 .. base_y + 120) and everything after it is original.
    anchors = words[
        words['text'].str.lower().str.contains(_FUNCTIONS_ANCHOR, na=False)
    ]
    if anchors.empty:
        return []
    base_y = anchors.iloc[0]['top']

    func_words = words[
        (words['top'] > base_y + 10) & (words['top'] < base_y + 120)
    ]
    # Sort by line_num and left to maintain correct reading order.
    func_words = func_words.sort_values(by=["line_num", "left"])
    grouped_lines = (
        func_words.groupby('line_num')['text'].apply(' '.join).tolist()
    )

    clean_funcs = []
    for line in grouped_lines:
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', line).strip()
        if len(cleaned) > 1:
            clean_funcs.append(cleaned)
    return clean_funcs


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: PIL image supplied by the Gradio ``Image`` component, or
            ``None`` when nothing was uploaded.

    Returns:
        A list of eight strings, in UI order: name, email, phone, date of
        birth, postcode, CurBase value, hourly rate, functions (newline
        separated). Missing fields are ``"Not found"``. On failure the first
        element is ``"Error: <message>"`` and the rest are ``"Not found"``.
    """
    if image is None:
        return _error_row("no image provided")

    try:
        # Pass 1: the name is read from a binarized copy of the image.
        name = _extract_name(_ocr_words(_binarize(image)))

        # Pass 2: all other fields are read from the unmodified image.
        words = _ocr_words(image)
        text = " ".join(words['text'])
        email_match = _EMAIL_RE.search(text)
        phone_match = _PHONE_RE.search(text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = _DOB_RE.search(raw_text)
        functions = _extract_functions_block(words)

        return [
            name,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob_match.group(0) if dob_match else _NOT_FOUND,
            _extract_postcode(raw_text),
            _get_value_next_to(words, "CurBase") or _NOT_FOUND,
            _get_value_next_to(words, "hourly")
            or _get_value_next_to(words, "rate")
            or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        # UI boundary: report the failure in the first field instead of
        # crashing the request. The row must still have _N_OUTPUTS entries.
        return _error_row(str(e))


# -------------------- Gradio Interface --------------------
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Image OCR Field Extractor")
    gr.Markdown("Upload a document image to extract structured data fields.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label=" Upload Your Document")
            submit_btn = gr.Button(" Run Extraction")
            gr.Examples(
                examples=["example_doc.jpeg"],
                inputs=[image_input],
                label=" Example Image (Click to load into uploader)",
            )

        with gr.Column():
            name = gr.Text(label="Name")
            email = gr.Text(label="Email")
            phone = gr.Text(label="Phone")
            dob = gr.Text(label="DOB")
            postcode = gr.Text(label="Postcode")
            prem = gr.Text(label="Prem (CurBase)")
            rate = gr.Text(label="Temp (Hourly Rate)")
            functions = gr.Textbox(label="Functions", lines=4)

    # The outputs list length must stay equal to _N_OUTPUTS.
    submit_btn.click(
        fn=extract_fields,
        inputs=image_input,
        outputs=[name, email, phone, dob, postcode, prem, rate, functions],
    )

if __name__ == "__main__":
    demo.launch()