Update app.py
Browse files
app.py
CHANGED
@@ -48,42 +48,6 @@ import pytesseract
|
|
48 |
|
49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
50 |
from PIL import Image
|
51 |
-
def mark_region(im):
    """Outline candidate text regions on a page image and collect their boxes.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and dilated so that adjacent text merges into larger blobs.
    The external contours of those blobs are then filtered by hard-coded
    position/area limits, and each accepted one is outlined on the image.

    Args:
        im: Image array in BGR channel order (it is fed to
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``). The rectangles are
            drawn onto this array by ``cv2.rectangle``.

    Returns:
        A ``(image, line_items_coordinates)`` tuple. ``image`` is the
        annotated image (``im`` itself when nothing was drawn) and
        ``line_items_coordinates`` is a list of ``[(x, y), (2200, y + h)]``
        corner pairs, one entry per rectangle drawn.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # pick whichever slot holds the contour list.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Bind the result up front so the return below is valid even when no
    # contour passes the filters (previously an UnboundLocalError).
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, _w, h = cv2.boundingRect(c)

        # NOTE(review): the pixel limits below (600/1000/10000/2400/2000 and
        # the fixed right edge 2200) look tuned for one specific page
        # resolution -- confirm before reusing on other inputs.
        if y >= 600 and x <= 1000:
            if area > 10000:
                image = cv2.rectangle(
                    im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
                )
                line_items_coordinates.append([(x, y), (2200, y + h)])

        # Independent of the check above: a contour that satisfies both
        # filters is drawn and recorded twice, as in the original code.
        if y >= 2400 and x <= 2000:
            image = cv2.rectangle(
                im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
            )
            line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
@st.experimental_singleton
|
88 |
def read_pdf(file):
|
89 |
images=pdf2image.convert_from_path(file)
|
|
|
48 |
|
49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
50 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
@st.experimental_singleton
|
52 |
def read_pdf(file):
|
53 |
images=pdf2image.convert_from_path(file)
|