Spaces:

Mattral
/

HandWritten-Text-Recognizer

Sleeping

App Files Files Community

HandWritten-Text-Recognizer / app /preprocessor.py

Mattral

Upload 13 files

57462b3 verified over 1 year ago

raw

history blame

7.09 kB

	import random
	from typing import Tuple

	import cv2
	import numpy as np

	from dataloader_iam import Batch


	class Preprocessor:
	    """Prepare images and ground-truth texts of a batch for the recognizer.

	    Images are scaled onto a white canvas (optionally with random data
	    augmentation), transposed and shifted to the value range [-0.5, 0.5].
	    Ground-truth texts are shortened so that ctc_loss can still align them
	    with the image they belong to.
	    """

	    def __init__(self,
	                 img_size: Tuple[int, int],
	                 padding: int = 0,
	                 dynamic_width: bool = False,
	                 data_augmentation: bool = False,
	                 line_mode: bool = False) -> None:
	        """Store the preprocessing configuration.

	        Args:
	            img_size: Target image size as (width, height).
	            padding: Extra horizontal pixels added to the computed width in
	                dynamic-width mode.
	            dynamic_width: Scale to the target height and derive the width
	                from the aspect ratio instead of using a fixed width.
	            data_augmentation: Apply random photometric and geometric
	                distortions in process_img (and random word counts and
	                gaps in line mode).
	            line_mode: Paste several word images of a batch into one
	                text-line image before processing.

	        Raises:
	            AssertionError: If dynamic width is combined with data
	                augmentation, or padding is used without dynamic width.
	        """
	        # dynamic width only supported when no data augmentation happens
	        assert not (dynamic_width and data_augmentation), \
	            'dynamic_width cannot be combined with data_augmentation'
	        # when padding is on, we need dynamic width enabled
	        assert not (padding > 0 and not dynamic_width), \
	            'padding requires dynamic_width'

	        self.img_size = img_size
	        self.padding = padding
	        self.dynamic_width = dynamic_width
	        self.data_augmentation = data_augmentation
	        self.line_mode = line_mode

	    @staticmethod
	    def _truncate_label(text: str, max_text_len: int) -> str:
	        """Cut a label so that its CTC cost does not exceed max_text_len.

	        Function ctc_loss can't compute loss if it cannot find a mapping
	        between text label and input labels. Repeat letters cost double
	        because of the blank symbol needing to be inserted. If a too-long
	        label is provided, ctc_loss returns an infinite gradient.

	        Args:
	            text: Ground-truth label.
	            max_text_len: Maximum allowed cost (1 per character, 2 for a
	                character that repeats its predecessor).

	        Returns:
	            The longest prefix of text whose cost fits, or text itself.
	        """
	        cost = 0
	        prev_char = None
	        for pos, char in enumerate(text):
	            cost += 2 if char == prev_char else 1
	            if cost > max_text_len:
	                return text[:pos]
	            prev_char = char
	        return text

	    def _simulate_text_line(self, batch: Batch) -> Batch:
	        """Create image of a text line by pasting multiple word images into an image.

	        For batch element i the words i, i+1, ... (wrapping around at the end
	        of the batch) are concatenated, both as text (joined by spaces) and
	        as images (separated by white gaps, vertically centered).

	        Args:
	            batch: Batch of word images and their texts.

	        Returns:
	            A new Batch with one simulated line per element of the input.
	        """
	        default_word_sep = 30
	        default_num_words = 5

	        # go over all batch elements
	        res_imgs = []
	        res_gt_texts = []
	        for i in range(batch.batch_size):
	            # number of words to put into current line
	            num_words = random.randint(1, 8) if self.data_augmentation else default_num_words

	            # indices of the words used for this line (cyclic)
	            word_indices = [(i + j) % batch.batch_size for j in range(num_words)]

	            # concat ground truth texts
	            res_gt_texts.append(' '.join(batch.gt_texts[k] for k in word_indices))

	            # put selected word images into list, compute target image size
	            sel_imgs = []
	            word_seps = [0]  # gap in front of each word, none before the first
	            h = 0
	            w = 0
	            for j, k in enumerate(word_indices):
	                curr_sel_img = batch.imgs[k]
	                if curr_sel_img is None:
	                    # damaged file: use the same black placeholder as process_img
	                    curr_sel_img = np.zeros(self.img_size[::-1], np.uint8)
	                curr_word_sep = random.randint(20, 50) if self.data_augmentation else default_word_sep
	                h = max(h, curr_sel_img.shape[0])
	                w += curr_sel_img.shape[1]
	                sel_imgs.append(curr_sel_img)
	                if j + 1 < num_words:
	                    w += curr_word_sep
	                    word_seps.append(curr_word_sep)

	            # put all selected word images into target image (white background)
	            target = np.full((h, w), 255, np.uint8)
	            x = 0
	            for curr_sel_img, curr_word_sep in zip(sel_imgs, word_seps):
	                x += curr_word_sep
	                img_h, img_w = curr_sel_img.shape[:2]
	                y = (h - img_h) // 2
	                target[y:y + img_h, x:x + img_w] = curr_sel_img
	                x += img_w

	            # put image of line into result
	            res_imgs.append(target)

	        return Batch(res_imgs, res_gt_texts, batch.batch_size)

	    def process_img(self, img: np.ndarray) -> np.ndarray:
	        """Resize to target size, apply data augmentation.

	        Args:
	            img: Single-channel image of shape (height, width), or None for
	                a damaged file (replaced by a black image).

	        Returns:
	            Float image transposed to (width, height); an 8-bit input ends
	            up in the value range [-0.5, 0.5].
	        """
	        # there are damaged files in IAM dataset - just use black image instead
	        if img is None:
	            img = np.zeros(self.img_size[::-1])

	        img = img.astype(float)

	        # data augmentation
	        if self.data_augmentation:
	            # photometric data augmentation
	            if random.random() < 0.25:
	                def rand_odd():
	                    # Gaussian kernel sizes must be odd: 3, 5 or 7
	                    return random.randint(1, 3) * 2 + 1
	                img = cv2.GaussianBlur(img, (rand_odd(), rand_odd()), 0)
	            if random.random() < 0.25:
	                img = cv2.dilate(img, np.ones((3, 3)))
	            if random.random() < 0.25:
	                img = cv2.erode(img, np.ones((3, 3)))

	            # geometric data augmentation: independent random scale per axis
	            wt, ht = self.img_size
	            h, w = img.shape
	            f = min(wt / w, ht / h)
	            fx = f * np.random.uniform(0.75, 1.05)
	            fy = f * np.random.uniform(0.75, 1.05)

	            # random position around center
	            txc = (wt - w * fx) / 2
	            tyc = (ht - h * fy) / 2
	            freedom_x = max((wt - fx * w) / 2, 0)
	            freedom_y = max((ht - fy * h) / 2, 0)
	            tx = txc + np.random.uniform(-freedom_x, freedom_x)
	            ty = tyc + np.random.uniform(-freedom_y, freedom_y)

	            # map image into target image (white canvas shows where nothing is drawn)
	            M = np.float32([[fx, 0, tx], [0, fy, ty]])
	            target = np.ones(self.img_size[::-1]) * 255
	            img = cv2.warpAffine(img, M, dsize=self.img_size, dst=target, borderMode=cv2.BORDER_TRANSPARENT)

	            # photometric data augmentation
	            if random.random() < 0.5:
	                img = img * (0.25 + random.random() * 0.75)
	            if random.random() < 0.25:
	                img = np.clip(img + (np.random.random(img.shape) - 0.5) * random.randint(1, 25), 0, 255)
	            if random.random() < 0.1:
	                img = 255 - img

	        # no data augmentation
	        else:
	            if self.dynamic_width:
	                ht = self.img_size[1]
	                h, w = img.shape
	                f = ht / h
	                wt = int(f * w + self.padding)
	                # round the width up to the next multiple of 4
	                wt = wt + (4 - wt) % 4
	                tx = (wt - w * f) / 2
	                ty = 0
	            else:
	                wt, ht = self.img_size
	                h, w = img.shape
	                f = min(wt / w, ht / h)
	                tx = (wt - w * f) / 2
	                ty = (ht - h * f) / 2

	            # map image into target image
	            M = np.float32([[f, 0, tx], [0, f, ty]])
	            target = np.ones([ht, wt]) * 255
	            img = cv2.warpAffine(img, M, dsize=(wt, ht), dst=target, borderMode=cv2.BORDER_TRANSPARENT)

	        # transpose for TF
	        img = cv2.transpose(img)

	        # convert to range [-0.5, 0.5]
	        img = img / 255 - 0.5
	        return img

	    def process_batch(self, batch: Batch) -> Batch:
	        """Preprocess all images of a batch and fit the labels to them.

	        Args:
	            batch: Batch of images and ground-truth texts; in line mode the
	                words are first combined into simulated text lines.

	        Returns:
	            A new Batch holding the processed images and truncated texts.
	        """
	        if self.line_mode:
	            batch = self._simulate_text_line(batch)

	        res_imgs = [self.process_img(img) for img in batch.imgs]

	        # Each label is limited by the width of its own image (axis 0 after
	        # the transpose). Widths can differ in dynamic-width mode, and an
	        # empty batch must not fail on a lookup of the first image.
	        # NOTE(review): the divisor 4 presumably matches the model's
	        # horizontal downsampling - confirm against the model definition.
	        res_gt_texts = [
	            self._truncate_label(gt_text, res_img.shape[0] // 4)
	            for res_img, gt_text in zip(res_imgs, batch.gt_texts)
	        ]
	        return Batch(res_imgs, res_gt_texts, batch.batch_size)


	def main():
	    """Show a test image next to one randomly augmented version of it.

	    Raises:
	        FileNotFoundError: If the test image cannot be read.
	    """
	    import matplotlib.pyplot as plt

	    img_path = '../data/test.png'
	    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
	    if img is None:
	        # cv2.imread returns None instead of raising when a file is
	        # missing or unreadable; fail here with a clear message rather
	        # than later inside plt.imshow
	        raise FileNotFoundError('Could not read image: ' + img_path)

	    img_aug = Preprocessor((256, 32), data_augmentation=True).process_img(img)

	    # left: original image
	    plt.subplot(121)
	    plt.imshow(img, cmap='gray')

	    # right: augmented image, transposed back and shifted to [0, 1] for display
	    plt.subplot(122)
	    plt.imshow(cv2.transpose(img_aug) + 0.5, cmap='gray', vmin=0, vmax=1)
	    plt.show()


	# Run the visual augmentation demo only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()