Ayesha-Majeed committed on
Commit
08912f8
·
verified ·
1 Parent(s): 36b682d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pytesseract
3
+ import cv2
4
+ import pandas as pd
5
+ import re
6
+ from PIL import Image
7
+ import numpy as np
8
+
9
# Tesseract's word table numbers lines *within* a paragraph and paragraphs
# *within* a block, so a physical text line is only identified by the full
# (block, paragraph, line) triple -- ``line_num`` alone repeats across blocks.
_LINE_KEYS = ("block_num", "par_num", "line_num")

# Number of values extract_fields must return; has to equal the number of
# output components wired to it in the Gradio interface.
_OUTPUT_COUNT = 8

_MISSING = "Not found"


def _ocr_words(source):
    """Run word-level OCR on *source* and return only the non-blank tokens."""
    frame = pytesseract.image_to_data(
        source, output_type=pytesseract.Output.DATAFRAME
    )
    frame = frame.dropna(subset=["text"]).copy()
    # Guard: if pandas inferred a non-string dtype for ``text`` the ``.str``
    # accessor and ``" ".join`` used downstream would raise.
    frame["text"] = frame["text"].astype(str)
    return frame[frame["text"].str.strip() != ""]


def _words_on_same_line(words, anchor):
    """Return the rows of *words* that sit on the same text line as *anchor*."""
    keys = [key for key in _LINE_KEYS if key in words.columns]
    if "line_num" not in keys:
        return words.iloc[0:0]
    mask = pd.Series(True, index=words.index)
    for key in keys:
        mask &= words[key] == anchor[key]
    return words[mask]


def _clean_name(tokens):
    """Strip non-letter characters from both ends of each token and join them."""
    trimmed = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', token) for token in tokens)
    return ' '.join(token for token in trimmed if token)


def _find_name(words):
    """Return the (up to two) words following the first "tit…" token, or None."""
    hits = words[words["text"].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return None
    line = (
        _words_on_same_line(words, hits.iloc[0])
        .sort_values(by="left")
        .reset_index(drop=True)
    )
    positions = line.index[line["text"].str.lower().str.contains("tit", na=False)]
    if positions.empty:
        return None
    start = positions[0] + 1
    neighbors = line["text"].iloc[start:start + 2].tolist()
    return _clean_name(neighbors) if neighbors else None


def _value_right_of(words, keyword, max_dist=200):
    """Return the nearest word right of *keyword* on its line, within *max_dist* px."""
    if "left" not in words.columns:
        return None
    hits = words[words["text"].str.lower() == keyword.lower()]
    if hits.empty:
        return None
    anchor = hits.iloc[0]
    line = _words_on_same_line(words, anchor)
    x = anchor["left"]
    to_the_right = line[(line["left"] > x) & (line["left"] < x + max_dist)]
    if to_the_right.empty:
        return None
    return to_the_right.sort_values("left")["text"].iloc[0]


def _find_postcode(raw_text):
    """Return a 4-digit postcode from *raw_text*, or None.

    A 4-digit group following a standalone ``CH`` on the same line wins;
    otherwise the first free-standing 4-digit number in the text is used.
    """
    for line in raw_text.splitlines():
        after_ch = re.search(r'\bCH\b.*?(\d{4})(?![\d/])', line)
        if after_ch:
            return after_ch.group(1)
    # NOTE(review): this fallback can also pick up the year of a dd.mm.yyyy
    # date when no "CH" line exists -- confirm whether that is acceptable.
    standalone = re.search(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)', raw_text)
    return standalone.group(0) if standalone else None


def _extract_functions(words):
    """Return the cleaned text lines found 10-120 px below the first "function" token."""
    header = words[words["text"].str.lower().str.contains("function", na=False)]
    if header.empty:
        return []
    base_y = header.iloc[0]["top"]
    below = words[(words["top"] > base_y + 10) & (words["top"] < base_y + 120)]

    keys = [key for key in _LINE_KEYS if key in below.columns]
    # Sort so that words inside each group are joined left-to-right.
    below = below.sort_values(by=keys + ["left"])
    lines = below.groupby(keys)["text"].apply(" ".join).tolist()

    cleaned = (re.sub(r'[^a-zA-Z0-9\s]', '', line).strip() for line in lines)
    return [line for line in cleaned if len(line) > 1]


def extract_fields(image):
    """Extract structured fields from a document image via OCR.

    Args:
        image: A PIL image (the Gradio input is declared with ``type="pil"``).
            ``None`` is reported as an error row instead of raising.

    Returns:
        A list of exactly eight strings, in this order: name, email, phone,
        date of birth, postcode, value next to "CurBase", value next to
        "hourly"/"rate", and the newline-joined function list. Fields that
        cannot be located are "Not found". On any failure the first entry is
        an "Error: ..." message and the remaining seven are "Not found".
    """
    if image is None:
        return ["Error: no image provided"] + [_MISSING] * (_OUTPUT_COUNT - 1)

    try:
        # -------------------- Image Preparation --------------------
        rgb = np.array(image.convert("RGB"))
        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
        # THRESH_BINARY here equals the former THRESH_BINARY_INV + bitwise_not.
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
        )
        binarized_img = Image.fromarray(binarized)

        # -------------------- OCR Pass 1: Name by "Title" --------------------
        name = _find_name(_ocr_words(binarized_img))

        # -------------------- OCR Pass 2: For Other Fields --------------------
        # The unprocessed image is OCR'd once and shared by every lookup below.
        words = _ocr_words(image)
        text = " ".join(words["text"])
        email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
        phone_match = re.search(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}', text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = re.search(r'\d{2}\.\d{2}\.\d{4}', raw_text)

        functions = _extract_functions(words)

        # -------------------- Final Output --------------------
        return [
            name or _MISSING,
            email_match.group(0) if email_match else _MISSING,
            phone_match.group(0) if phone_match else _MISSING,
            dob_match.group(0) if dob_match else _MISSING,
            _find_postcode(raw_text) or _MISSING,
            _value_right_of(words, "CurBase") or _MISSING,
            _value_right_of(words, "hourly")
            or _value_right_of(words, "rate")
            or _MISSING,
            "\n".join(functions) if functions else _MISSING,
        ]

    except Exception as e:
        # UI boundary: surface the failure in the first field. The row must
        # still contain exactly _OUTPUT_COUNT values to match the interface.
        return [f"Error: {e}"] + [_MISSING] * (_OUTPUT_COUNT - 1)
159
+
160
+
161
# -------------------- Gradio Interface --------------------
# Web UI: one PIL image in, eight text fields out. The seven single-line
# fields come first, followed by the multi-line "Functions" box, in the same
# order as the list returned by extract_fields.
demo = gr.Interface(
    fn=extract_fields,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Text(label=caption)
        for caption in (
            "Name",
            "Email",
            "Phone",
            "DOB",
            "Postcode",
            "Prem (CurBase)",
            "Temp (Hourly Rate)",
        )
    ] + [gr.Textbox(label="Functions", lines=4)],
    title="Image OCR Field Extractor",
    description="Upload a document image to extract structured data fields.",
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()