GDavila committed on
Commit
f0debc6
·
1 Parent(s): 24e64a7

Create ocrFuncs.py

Browse files
Files changed (1) hide show
  1. ocrFuncs.py +143 -0
ocrFuncs.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A bit messy, to say the least; a cleaner version will go in a separate Space.
2
def _box_top(detection):
    """Return the y coordinate of the first corner of a detection's box.

    ``detection`` is indexed as ``(box, text, ...)`` where ``box`` is a
    sequence of ``[x, y]`` corner points; corner 0 is the top-left corner in
    EasyOCR's output order.
    """
    return detection[0][0][1]


def _box_center(detection, axis):
    """Return the midpoint between corners 0 and 2 of a box along ``axis``.

    ``axis`` is 0 for x and 1 for y. Corners 0 and 2 are opposite corners
    (top-left and bottom-right in EasyOCR's output order), so their midpoint
    is the centre of the box.
    """
    first = detection[0][0][axis]
    third = detection[0][2][axis]
    return first + (third - first) / 2


def _group_into_rows(detections):
    """Arrange detections in reading order: rows top-to-bottom, left-to-right.

    Detections are ordered by vertical centre. A detection starts a new row
    when the vertical centre of the previous detection lies above this
    detection's top edge; otherwise it joins the current row. Each row is
    then ordered by horizontal centre. The input sequence is not modified.

    Returns a list of rows, each a non-empty list of detections. An empty
    input yields an empty list.
    """
    ordered = sorted(detections, key=lambda det: _box_center(det, 1))

    rows = []
    for det in ordered:
        # rows[-1][-1] is the previous detection in vertical order because
        # rows are only sorted horizontally after grouping has finished.
        if rows and _box_center(rows[-1][-1], 1) >= _box_top(det):
            rows[-1].append(det)
        else:
            rows.append([det])

    for row in rows:
        row.sort(key=lambda det: _box_center(det, 0))
    return rows


def _clean_ocr_text(text):
    """Normalise OCR output text.

    Underscores become spaces, runs of spaces collapse to one space,
    leading/trailing whitespace is removed, the text is lower-cased, and a
    ``0`` directly next to a lowercase letter is rewritten as ``o`` (the most
    common letter/digit confusion in the OCR output).
    """
    import re  # local import: the file has no top-level import section

    text = text.replace("_", " ")
    text = re.sub(r" {2,}", " ", text).strip()
    text = text.lower()

    # NOTE: this heuristic also rewrites a zero in strings such as "10am";
    # that matches the original intent of the cleanup and is kept as-is.
    text = re.sub(r"([a-z])0", r"\1o", text)  # letter followed by 0
    text = re.sub(r"0([a-z])", r"o\1", text)  # 0 followed by letter
    return text


def imgOCR_img2text(imgFilename, reader=None):
    """Run OCR on an image and return its text in reading order, cleaned up.

    Args:
        imgFilename: The image to read; passed unchanged to
            ``reader.readtext``.
        reader: Optional object with a ``readtext(image)`` method returning
            an iterable of ``(box, text, ...)`` items, where ``box`` is a
            sequence of ``[x, y]`` corner points. When omitted, an English
            ``easyocr.Reader`` is created with ``gpu=True``. Building a
            reader is done on every call in that case, so pass one in to
            reuse it across several images.

    Returns:
        A single lower-cased string: the detected text fragments joined by
        single spaces, rows top-to-bottom and left-to-right within a row.
        Returns ``""`` when nothing is detected.
    """
    import logging

    if reader is None:
        import easyocr

        # GPU inference is requested; per the EasyOCR docs the library is
        # expected to fall back to CPU when no GPU is usable — TODO confirm
        # for the installed version.
        reader = easyocr.Reader(['en'], gpu=True)

    detections = reader.readtext(imgFilename)

    rows = _group_into_rows(detections)
    ordered_text = " ".join(det[1] for row in rows for det in row)
    cleaned_text = _clean_ocr_text(ordered_text)

    logging.getLogger(__name__).debug(
        "OCR produced %d row(s); cleaned text: %r", len(rows), cleaned_text
    )
    return cleaned_text