-
Notifications
You must be signed in to change notification settings - Fork 0
/
project.py
354 lines (317 loc) · 12.1 KB
/
project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
from scipy import ndimage
import imutils
import cv2
import numpy as np
import csv
import math
import pytesseract
from pytesseract import Output
import re
import os
from datetime import datetime
# Return the title with every word that contains a number removed, since such
# words are likely dates or times
def clean_title(title):
    """Return ``title`` with every word containing a numeric character removed.

    The title is split on whitespace; words in which any character satisfies
    ``str.isnumeric`` are discarded and the remaining words are re-joined
    with single spaces.
    """
    kept_words = [
        word
        for word in title.split()
        if not any(ch.isnumeric() for ch in word)
    ]
    return ' '.join(kept_words)
# Return tuple (average word height of sentence, sentence)
# Source: https://stackoverflow.com/questions/20831612/getting-the-bounding-bo
# x-of-the-recognized-words-using-python-tesseract/54059166#54059166
# I can use this source because I just use a few lines of code to find height
# of sentence
def get_sentence_info(cropped):
    """Return ``(average word height, sentence)`` for the text in ``cropped``.

    ``cropped`` is handed to ``pytesseract.image_to_data``. Every entry whose
    recognised text is not the empty string contributes its ``height`` to the
    average and its text to the sentence (joined with single spaces, in
    detection order). When no such entry exists, ``(0, "")`` is returned.
    """
    data = pytesseract.image_to_data(cropped, output_type=Output.DICT)
    # Indices of the boxes that actually carry recognised text.
    text_indices = [
        i for i in range(len(data['level'])) if data['text'][i] != ''
    ]
    if not text_indices:
        return (0, "")  # no text in file
    heights = [data['height'][i] for i in text_indices]
    sentence = ' '.join(data['text'][i] for i in text_indices)
    return (sum(heights) / len(heights), sentence)
# Return boolean: if numbers in date are within expected range
def is_date_valid(date, min_year=23, max_year=25):
    """Return True if ``date`` looks like a plausible ``mm/dd/yy`` date.

    Args:
        date: String expected in the form ``nn/nn/nn`` (month/day/two-digit
            year), which is what the date regex in this file produces.
        min_year: Smallest accepted two-digit year (inclusive).
        max_year: Largest accepted two-digit year (inclusive).

    Returns:
        True when the month is 1-12, the day is 1-31 and the year lies in
        ``[min_year, max_year]``. False otherwise, including when ``date``
        does not consist of exactly three integer fields.

    Note:
        Only ranges are checked; the day is not validated against the length
        of the month (e.g. ``02/31/24`` passes).
    """
    fields = date.split("/")
    if len(fields) != 3:
        return False
    try:
        mm, dd, year = (int(field) for field in fields)
    except ValueError:
        # Non-numeric field: not a date we can use.
        return False
    return 1 <= mm <= 12 and 1 <= dd <= 31 and min_year <= year <= max_year
# Return date from image if date found
def get_date(sentences):
    """Return the first usable ``mm/dd/yy`` date found in ``sentences``.

    Args:
        sentences: Iterable of strings to search, in order.

    Returns:
        The date formatted as ``mm/dd/yy``, or None when no candidate passes
        both the range check (``is_date_valid``) and calendar parsing.
    """
    date_pattern = re.compile(r'\d{2}/\d{2}/\d{2}')
    for text in sentences:
        # Look at every candidate in the sentence, not just the first one,
        # so an invalid leading match does not hide a valid later one.
        for match in date_pattern.finditer(text):
            candidate = match.group()
            if not is_date_valid(candidate):
                continue
            try:
                parsed = datetime.strptime(candidate, '%m/%d/%y')
            except ValueError:
                # Range check passed but the day does not exist in that
                # month (e.g. 02/31/23); keep searching instead of crashing.
                continue
            return parsed.strftime("%m/%d/%y")
    return None
# Returns times in format required for csv file
def reformat_times(times):
    """Return ``times`` rewritten in the ``h:mm am``/``h:mm pm`` csv layout.

    Each entry is lower-cased, its "am"/"pm" text is removed, ":00" is added
    when the remainder has no colon, and a space followed by the last two
    characters of the lower-cased entry is appended.

    NOTE: the suffix is always the entry's last two characters, so an entry
    without am/pm (e.g. "5:30") comes back as "5:30 30".
    """
    def _reformat(raw):
        lowered = raw.lower()
        clock = lowered.replace("am", '').replace("pm", '')
        separator = " " if ':' in clock else ":00 "
        return clock + separator + lowered[-2:]

    return [_reformat(entry) for entry in times]
# Returns boolean: if numbers within time are in expected range
def is_time_valid(time):
    """Return True if the hour is 1-12 and the minute is 0-59.

    ``time`` is expected to be ``h:mm`` or ``hh:mm`` followed by a
    two-character suffix (am/pm); the last two characters are dropped before
    the hour and minute are parsed as integers.
    """
    clock = time[:-2]
    fields = clock.split(":")
    hour = int(fields[0])
    minute = int(fields[1])
    return 1 <= hour <= 12 and 0 <= minute <= 59
# Finds times in sentence, reformats them, and returns those within range
# Source: https://stackoverflow.com/questions/20437207/using-python-regular-
# expression-to-match-times
# I can use this source because I just use 1 line (regex)
def get_times(sentences):
    """Return up to two valid, reformatted times found in ``sentences``.

    Args:
        sentences: Iterable of strings to search, in order.

    Returns:
        A list of at most two time strings as produced by ``reformat_times``
        that also satisfy ``is_time_valid``, in order of first appearance and
        without duplicates.
    """
    time_pattern = re.compile(
        r'\d{1,2}(?:(?:am|pm)|(?::\d{1,2})(?:am|pm)?)')
    raw_times = []
    for text in sentences:
        raw_times.extend(time_pattern.findall(text.lower()))

    valid_times = []
    for time in reformat_times(raw_times):
        # De-duplicate on the reformatted value so that spellings such as
        # "5pm" and "5:00pm" are recognised as the same time.
        if time not in valid_times and is_time_valid(time):
            valid_times.append(time)
    # in case errors w pytesseract occur, use at most first 2 times
    return valid_times[:2]
# Returns image without background behind flyer
def removeWall(image):
    """Return the thresholded flyer region of ``image`` without the wall.

    The image is converted from BGR to grayscale, blurred with a 3x3 Gaussian
    and thresholded at 130. The bounding rectangle of the external contour
    with the largest area is cropped out of the thresholded image, and the
    crop is then passed through ``trimLR`` and ``trimUD``.

    Note that the result is the binary (thresholded) image, not the colour
    input.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (3, 3), 0)
    _, binary = cv2.threshold(smoothed, 130, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)
    largest = max(contours, key=cv2.contourArea)
    left, top, width, height = cv2.boundingRect(largest)
    flyer = binary[top:top + height, left:left + width]
    sides_trimmed = trimLR(flyer, flyer.shape[0], None)
    return trimUD(sides_trimmed, None, sides_trimmed.shape[1])
# Recursive function to trim sides
# Source: https://stackoverflow.com/questions/13538748/crop-black-edges-
# with-opencv
# I can use this source because I just use the idea of recursion for
# trimming sides
def trimLR(frame, rows, cols):
    """Recursively trim mostly-black columns off the left and right edges.

    Args:
        frame: 2-D array (indexed as ``frame[:, column]``) whose black pixels
            are 0.
        rows: Row count used for the 25% threshold.
        cols: Unused; kept for signature symmetry with ``trimUD``.

    Returns:
        ``frame`` with 10 columns removed from the left (checked first) or
        the right for as long as that edge column has more than
        ``rows * 0.25`` black pixels. If everything gets trimmed away, the
        resulting zero-width array is returned.
    """
    # Nothing left to inspect: indexing column 0 of a zero-width frame would
    # raise IndexError, so stop here.
    if frame.shape[1] == 0:
        return frame
    black_limit = rows * 0.25
    # if more than 25% black, crop side
    if np.count_nonzero(frame[:, 0] == 0) > black_limit:
        return trimLR(frame[:, 10:], rows, cols)
    if np.count_nonzero(frame[:, -1] == 0) > black_limit:
        return trimLR(frame[:, :-10], rows, cols)
    return frame
# Recursive function to trim sides
def trimUD(frame, rows, cols):
    """Recursively trim mostly-black rows off the top and bottom edges.

    Args:
        frame: 2-D array (indexed as ``frame[row]``) whose black pixels
            are 0.
        rows: Unused; kept for signature symmetry with ``trimLR``.
        cols: Column count used for the 25% threshold.

    Returns:
        ``frame`` with 10 rows removed from the top (checked first) or the
        bottom for as long as that edge row has more than ``cols * 0.25``
        black pixels. If everything gets trimmed away, the resulting
        zero-height array is returned.
    """
    # Nothing left to inspect: indexing row 0 of a zero-height frame would
    # raise IndexError, so stop here.
    if frame.shape[0] == 0:
        return frame
    black_limit = cols * 0.25
    if np.count_nonzero(frame[0] == 0) > black_limit:
        return trimUD(frame[10:], rows, cols)
    if np.count_nonzero(frame[-1] == 0) > black_limit:
        return trimUD(frame[:-10], rows, cols)
    return frame
# Returns True if any of the required elements is missing (None) or empty
def check_bad_elements(title, date, time):
    """Return True if any required element is None or empty.

    "Empty" means ``""`` for ``title`` and ``date`` and ``[]`` for ``time``.
    """
    if title is None or date is None or time is None:
        return True
    return title == "" or date == "" or time == []
# Creates csv calendar file
def create_calendar_event(title, date, time):
    """Write a one-event calendar file to ``output.csv``.

    Args:
        title: Event subject.
        date: Start date string.
        time: Non-empty list of time strings; ``time[0]`` is the start time
            and, when present, ``time[1]`` is the end time. Further entries
            are ignored.

    Raises:
        IndexError: If ``time`` is empty. This is raised before the file is
            opened, so an existing ``output.csv`` is left untouched.
    """
    # Read the start time first so an empty list fails before the output
    # file is truncated.
    start_time = time[0]
    headers = ['Subject', 'Start date', 'Start time']
    content = [title, date, start_time]
    if len(time) >= 2:
        headers.append('End time')
        content.append(time[1])
    # newline='' lets the csv module control line endings; without it extra
    # blank rows appear on platforms that translate "\n".
    with open('output.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerow(content)
# Returns sentence with max height as title
def get_title(possible_titles):
    """Return the tallest sentence(s) in ``possible_titles`` as the title.

    ``possible_titles`` maps a key to a ``(height, sentence)`` pair. Every
    sentence whose height is strictly within 11 of the maximum height is
    collected, walking the entries in reverse insertion order; the collected
    sentences are joined with spaces and passed through ``clean_title``.
    An empty mapping yields ``""``.
    """
    if not possible_titles:
        return ""
    tallest = max(entry[0] for entry in possible_titles.values())
    lower, upper = tallest - 11, tallest + 11
    # sometimes titles are too big for our kernel and get split into several
    # regions, so keep every sentence whose height is close to the maximum
    similar = [
        entry[1]
        for entry in reversed(list(possible_titles.values()))
        if lower < entry[0] < upper
    ]
    return clean_title(' '.join(similar))
# Returns sentences without \n character
def cleanup_sentences(sentences):
    """Return the non-empty sentences with each newline replaced by a space."""
    return [entry.replace("\n", ' ') for entry in sentences if entry != '']
# Returns boolean if string contains any numbers
# Source: https://stackoverflow.com/questions/19859282/check-if-a-string-
# contains-a-number
# I can use this source because this is 1 line of code
def num_there(s):
    """Return True if at least one character of ``s`` is a digit."""
    for ch in s:
        if ch.isdigit():
            return True
    return False
# Returns modified date by fixing any pytesseract errors
def fix_date(date, respell, special_respell):
    """Return ``date`` with likely OCR misreads corrected.

    Args:
        date: Candidate date word. ``|`` and ``\\`` are treated as ``/``.
        respell: Mapping of single characters to the digit they are
            replaced with.
        special_respell: Mapping with keys ``'7'`` and ``'8'`` giving the
            replacement digit used in the range-based special cases below.

    Returns:
        The corrected ``mm/dd/year`` string. If the word does not contain
        exactly two slashes after separator normalisation, it is returned
        with only that normalisation applied.
    """
    date = date.replace("|", "/").replace("\\", "/")
    if date.count("/") != 2:  # can't do what's below
        return date

    # fix numbers
    respelled = ''.join(respell.get(ch, ch) for ch in date)

    # special cases: ranges
    mm, dd, year = respelled.split("/")[:3]
    if mm == "17":
        mm = mm.replace('7', special_respell['7'])
    if dd == "37":
        dd = dd.replace('7', special_respell['7'])
    elif dd in ("80", "81"):
        dd = dd.replace('8', special_respell['8'])
    # Guard against an empty year field (e.g. "12/2555/"), which would
    # otherwise raise IndexError on year[-1]. Only the first character of the
    # year is kept here, matching the two-digit years used in this file.
    if year and year[-1] == '8':
        year = year[:1] + special_respell['8']
    return mm + "/" + dd + "/" + year
# Returns modified time by fixing any pytesseract errors
def fix_time(time, respell, special_respell):
    """Return ``time`` with likely OCR misreads corrected.

    Args:
        time: Candidate time word whose last two characters are the am/pm
            suffix (kept unchanged).
        respell: Mapping of single characters to the digit they are
            replaced with.
        special_respell: Mapping applied to the first minute digit only
            (e.g. ``'7' -> '1'``), since such a leading digit cannot be a
            valid tens-of-minutes value.

    Returns:
        The corrected time followed by the original suffix. ``.`` is treated
        as ``:``.
    """
    nums, am_pm = time[:-2], time[-2:]

    # fix numbers
    new_nums = ''.join(respell.get(ch, ch) for ch in nums)

    # special cases
    new_nums = new_nums.replace(".", ":")
    numbers = new_nums.split(":")
    if len(numbers) == 2:  # means minutes present
        hour, minute = numbers
        # An empty or one-character minute field (OCR noise such as "5:pm" or
        # "5:7pm") must not raise; slicing keeps whatever digits follow.
        if minute and minute[0] in special_respell:
            minute = special_respell[minute[0]] + minute[1:]
        new_nums = hour + ":" + minute
    return new_nums + am_pm
# Returns sentences, fixes any pytesseract errors
def fix_ocr(sentences):
    """Correct likely OCR misreads in date and time words of ``sentences``.

    ``sentences`` is a list of strings. It is modified in place (each
    sentence is re-joined with single spaces) and also returned.
    """
    respell = {'A': '4', 'B': '3', 'b': '6', 'D': '0', 'E': '3', 'F': '7',
               'G': '6', 'g': '9', 'H': '4', 'I': '1', 'i': '1', 'L': '1',
               'l': '1', 'O': '0', 'q': '9', 'S': '5', 'T': '7', 'U': '0',
               'Z': '2'}
    special_respell = {'7': '1', '8': '3'}

    for idx, sentence in enumerate(sentences):
        words = sentence.split()

        # Possible date: only the first 8-11 character word containing a
        # digit is treated as one, since other words rarely fit that shape.
        date_pos = next(
            (pos for pos, word in enumerate(words)
             if 8 <= len(word) <= 11 and num_there(word)),
            None,
        )
        if date_pos is not None:
            words[date_pos] = fix_date(words[date_pos], respell,
                                       special_respell)

        # Possible times: every 3-7 character word ending in am/pm.
        for pos, word in enumerate(words):
            if 3 <= len(word) <= 7 and word[-2:].lower() in ("am", "pm"):
                words[pos] = fix_time(word, respell, special_respell)

        sentences[idx] = ' '.join(words)
    return sentences
# Main part of code where everything is called from
# Source: https://www.geeksforgeeks.org/text-detection-and-extraction-using-
# opencv-and-ocr/
# I can use this source because it contains the basic code required to get
# pytesseract working for extracting text from images, small part of program
# Source: https://stackoverflow.com/questions/24385714/detect-text-region-in-
# image-using-opencv
# I can use this source because it's just to get the cropped image in the loop
def process(file):
    """Extract title, date and times from a flyer image and write the csv.

    Args:
        file: Path of the image to read with ``cv2.imread``.

    Returns:
        The result of ``create_calendar_event`` when a title, a date and at
        least one time were found; otherwise None after printing a message
        (also when the image cannot be read).
    """
    # Preprocess image
    image = cv2.imread(file)
    if image is None:
        # cv2.imread signals an unreadable or missing file by returning None
        # rather than raising.
        print("Could not read image:", file)
        return None
    croppedImage = removeWall(image)
    inverse = cv2.bitwise_not(croppedImage)

    # Dilate with a large square kernel and take the external contours of the
    # result; presumably this merges nearby text into one region per line or
    # block -- TODO confirm the kernel size against typical flyer sizes.
    ksize = 100
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (ksize, ksize))
    dilation = cv2.dilate(inverse, rect_kernel, iterations=1)
    contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)

    possible_titles = {}
    sentences = []
    for idx, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        cropped = inverse[y:y + h, x:x + w]
        sentences.append(pytesseract.image_to_string(cropped))
        possible_titles[idx] = get_sentence_info(cropped)

    clean_sentences = cleanup_sentences(sentences)
    fixed_sentences = fix_ocr(clean_sentences)
    title = get_title(possible_titles)
    date = get_date(fixed_sentences)
    time = get_times(fixed_sentences)
    print(title, date, time)
    if check_bad_elements(title, date, time):
        print("Some required elements could not be found.")
        print("Failed to create calendar file.")
        return None
    return create_calendar_event(title, date, time)
# Returns image files
def get_files(file_type, path="test/"):
    """Return the sorted paths of files in ``path`` ending with ``file_type``.

    Args:
        file_type: Filename suffix to match (case-sensitive), e.g. ``".jpg"``.
        path: Directory to list. Defaults to ``"test/"``.

    Returns:
        A sorted list of ``path`` joined with each matching filename.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    return sorted(
        os.path.join(path, fn)
        for fn in os.listdir(path)
        if fn.endswith(file_type)
    )
def main():
    """Process every ``.jpg`` file returned by ``get_files``."""
    for f in get_files(".jpg"):
        process(f)


# Only run the pipeline when executed as a script, so importing this module
# does not start processing images as a side effect.
if __name__ == "__main__":
    main()