Cvs_text_Extraction

Running

App Files Files Community

Ayesha-Majeed committed 4 days ago

Commit

08912f8

verified ·

1 Parent(s): 36b682d

Upload app.py

Browse files

Files changed (1) hide show

app.py +180 -0

app.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import gradio as gr
+import pytesseract
+import cv2
+import pandas as pd
+import re
+from PIL import Image
+import numpy as np
# Number of values extract_fields returns on every path (success and error):
# name, email, phone, DOB, postcode, CurBase value, hourly/rate value, functions.
_FIELD_COUNT = 8


def _ocr_words(image):
    """Run Tesseract word-level OCR on *image* and return the non-blank rows.

    The result is the DataFrame produced by ``pytesseract.image_to_data`` with
    rows whose ``text`` is missing or whitespace-only removed.
    """
    words = pytesseract.image_to_data(
        image, output_type=pytesseract.Output.DATAFRAME
    )
    words = words.dropna(subset=["text"]).copy()
    # pandas infers a numeric dtype when every recognised token looks like a
    # number; force strings so the `.str` accessor and " ".join() below
    # cannot fail on such images.
    words["text"] = words["text"].astype(str)
    return words[words["text"].str.strip() != ""]


def _binarize_for_ocr(image):
    """Return a black-on-white adaptive-threshold copy of a PIL *image*."""
    bgr = np.array(image.convert("RGB"))[:, :, ::-1]  # PIL RGB -> OpenCV BGR
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # THRESH_BINARY is exactly THRESH_BINARY_INV followed by bitwise_not,
    # which is what this step previously did in two calls.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
    )
    return Image.fromarray(binary)


def _name_after_title(words):
    """Return the (up to) two words right of the first "tit..." token.

    The first row whose text contains "tit" (case-insensitive) selects a
    block/line; within that line, ordered left to right, the two words after
    the first "tit" token are stripped of leading/trailing non-letters and
    joined with a space. Returns "" when nothing usable is found.
    """
    hits = words[words["text"].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return ""
    if "line_num" not in words.columns or "block_num" not in words.columns:
        return ""

    anchor = hits.iloc[0]
    on_same_line = (words["line_num"] == anchor["line_num"]) & (
        words["block_num"] == anchor["block_num"]
    )
    line_words = (
        words[on_same_line].sort_values(by="left").reset_index(drop=True)
    )

    is_title = line_words["text"].str.lower().str.contains("tit", na=False)
    positions = line_words.index[is_title]
    if positions.empty:
        return ""

    start = positions[0] + 1
    neighbors = line_words["text"].iloc[start:start + 2]
    trimmed = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', w) for w in neighbors)
    return " ".join(w for w in trimmed if w)


def _value_right_of(words, keyword, max_dist=200):
    """Return the nearest word to the right of *keyword*, or None.

    *keyword* must equal a token exactly (case-insensitive). Candidates share
    the keyword's ``line_num`` and start strictly right of it, less than
    *max_dist* pixels away.
    """
    hits = words[words["text"].str.lower() == keyword.lower()]
    if hits.empty:
        return None
    anchor = hits.iloc[0]
    if "line_num" not in anchor or "left" not in anchor:
        return None

    x = anchor["left"]
    # NOTE(review): only line_num is compared (not block_num/par_num), so
    # same-numbered lines of other blocks can match too - confirm intended.
    candidates = words[
        (words["line_num"] == anchor["line_num"])
        & (words["left"] > x)
        & (words["left"] < x + max_dist)
    ].sort_values("left")
    return None if candidates.empty else candidates["text"].tolist()[0]


def _find_postcode(raw_text):
    """Return a four-digit postcode found in *raw_text*, or "Not found".

    Preference goes to the first line holding a standalone "CH" token that is
    followed by four digits; otherwise the first standalone four-digit number
    anywhere in the text is used.
    """
    for line in raw_text.splitlines():
        after_ch = re.search(r'\bCH\b.*?(\d{4})(?![\d/])', line)
        if after_ch:
            return after_ch.group(1)
    standalone = re.search(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)', raw_text)
    return standalone.group(0) if standalone else "Not found"


def _functions_below_heading(words, min_offset=10, max_offset=120):
    """Return the cleaned text lines found just below a "function" heading.

    Words whose ``top`` lies strictly between ``min_offset`` and
    ``max_offset`` pixels below the first token containing "function" are
    grouped by ``line_num``; each line keeps only letters, digits and
    whitespace, and lines of one character or less are dropped.
    """
    hits = words[words["text"].str.lower().str.contains("function", na=False)]
    if hits.empty:
        return []
    base_y = hits.iloc[0]["top"]
    band = words[
        (words["top"] > base_y + min_offset)
        & (words["top"] < base_y + max_offset)
    ]
    # Sort by line_num then left so each line reads in natural order.
    band = band.sort_values(by=["line_num", "left"])
    lines = band.groupby("line_num")["text"].apply(" ".join).tolist()
    cleaned = (re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in lines)
    return [line for line in cleaned if len(line) > 1]


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: A PIL image of the document.

    Returns:
        A list of eight strings, in order: name, email, phone, date of birth,
        postcode, the value next to "CurBase", the value next to
        "hourly"/"rate", and the newline-joined function list. A field that
        cannot be located is "Not found". If anything raises, the first entry
        is "Error: <message>" and the remaining seven are "Not found", so the
        result always has the same length.
    """
    try:
        # The name lookup runs on a binarised copy of the image; every other
        # field uses OCR of the untouched input.
        name = _name_after_title(_ocr_words(_binarize_for_ocr(image)))

        words = _ocr_words(image)
        text = " ".join(words["text"])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(
            r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text
        )

        # DOB and postcode are searched in a separate plain-text OCR pass
        # (page segmentation mode 6) rather than in the word table.
        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _functions_below_heading(words)

        return [
            name if name else "Not found",
            email_match.group(0) if email_match else "Not found",
            phone_match.group(0) if phone_match else "Not found",
            dob_match.group(0) if dob_match else "Not found",
            _find_postcode(raw_text),
            _value_right_of(words, "CurBase") or "Not found",
            _value_right_of(words, "hourly")
            or _value_right_of(words, "rate")
            or "Not found",
            "\n".join(functions) if functions else "Not found",
        ]
    except Exception as e:
        # Deliberate catch-all at the UI boundary: report the failure in the
        # first field and pad so the row has the same length as a normal one.
        return [f"Error: {e}"] + ["Not found"] * (_FIELD_COUNT - 1)
# -------------------- Gradio Interface --------------------
# UI wiring: one PIL image in, seven single-line text outputs followed by a
# four-line textbox for the function list.
demo = gr.Interface(
    fn=extract_fields,
    inputs=gr.Image(type="pil"),
    outputs=[
        *(
            gr.Text(label=caption)
            for caption in (
                "Name",
                "Email",
                "Phone",
                "DOB",
                "Postcode",
                "Prem (CurBase)",
                "Temp (Hourly Rate)",
            )
        ),
        gr.Textbox(label="Functions", lines=4),
    ],
    title="Image OCR Field Extractor",
    description="Upload a document image to extract structured data fields.",
)


# Start the web app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()