from unimernet.common.registry import registry
from omegaconf import OmegaConf
import albumentations as alb
from albumentations.pytorch import ToTensorV2
from unimernet.processors.base_processor import BaseProcessor
import numpy as np
import cv2
from PIL import Image, ImageOps
from torchvision.transforms.functional import resize
import random
from unimernet.processors.formula_processor_helper.nougat import Bitmap, Dilation, Erosion
from unimernet.processors.formula_processor_helper.weather import Fog, Frost, Snow, Rain, Shadow
class FormulaImageBaseProcessor(BaseProcessor):
def __init__(self, image_size):
super(FormulaImageBaseProcessor, self).__init__()
        # input_size is (height, width)
        self.input_size = [int(_) for _ in image_size]
        assert len(self.input_size) == 2
@staticmethod
def crop_margin(img: Image.Image) -> Image.Image:
data = np.array(img.convert("L"))
data = data.astype(np.uint8)
max_val = data.max()
min_val = data.min()
if max_val == min_val:
return img
        # stretch the contrast to the full [0, 255] range, then binarize:
        # pixels darker than 200 are treated as ink (non-zero in `gray`)
        data = (data - min_val) / (max_val - min_val) * 255
        gray = 255 * (data < 200).astype(np.uint8)
coords = cv2.findNonZero(gray) # Find all non-zero points (text)
a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
return img.crop((a, b, w + a, h + b))
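    # Illustrative example (the numbers are hypothetical): for a mostly white
    # page whose formula ink occupies the box (x, y, w, h) = (120, 40, 300, 60),
    # crop_margin() thresholds the contrast-stretched grayscale image at 200,
    # takes the bounding box of the dark pixels via cv2.boundingRect, and
    # returns img.crop((120, 40, 420, 100)), i.e. a 300x60 crop.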
def prepare_input(self, img: Image.Image, random_padding: bool = False):
"""
Convert PIL Image to tensor according to specified input_size after following steps below:
- resize
- rotate (if align_long_axis is True and image is not aligned longer axis with canvas)
- pad
"""
if img is None:
return
# crop margins
try:
img = self.crop_margin(img.convert("RGB"))
        except OSError:
            # PIL can raise OSError for truncated or otherwise broken image files
            return
if img.height == 0 or img.width == 0:
return
        # match the shorter side to the smaller target dimension, then shrink
        # in place so the whole image fits inside (width, height) = (input_size[1], input_size[0])
        img = resize(img, min(self.input_size))
        img.thumbnail((self.input_size[1], self.input_size[0]))
delta_width = self.input_size[1] - img.width
delta_height = self.input_size[0] - img.height
if random_padding:
pad_width = np.random.randint(low=0, high=delta_width + 1)
pad_height = np.random.randint(low=0, high=delta_height + 1)
else:
pad_width = delta_width // 2
pad_height = delta_height // 2
padding = (
pad_width,
pad_height,
delta_width - pad_width,
delta_height - pad_height,
)
return ImageOps.expand(img, padding)
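# A minimal usage sketch for prepare_input (illustrative; "formula.png" is a
# hypothetical file). With input_size = [192, 672] the margins are cropped,
# the image is shrunk to fit inside 672x192 with its aspect ratio preserved,
# and the remainder is filled by ImageOps.expand, so the result is exactly
# 672x192 pixels:
#
#   proc = FormulaImageBaseProcessor(image_size=[192, 672])
#   padded = proc.prepare_input(Image.open("formula.png").convert("RGB"))
#   assert padded.size == (672, 192)  # PIL reports (width, height)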
@registry.register_processor("formula_image_train")
class FormulaImageTrainProcessor(FormulaImageBaseProcessor):
    def __init__(self, image_size=(384, 384)):
super().__init__(image_size)
self.transform = alb.Compose(
[
alb.Compose(
[
Bitmap(p=0.05),
alb.OneOf([Fog(), Frost(), Snow(), Rain(), Shadow()], p=0.2),
alb.OneOf([Erosion((2, 3)), Dilation((2, 3))], p=0.2),
alb.ShiftScaleRotate(shift_limit=0, scale_limit=(-.15, 0), rotate_limit=1, border_mode=0,
interpolation=3,
value=[255, 255, 255],
p=1),
alb.GridDistortion(distort_limit=0.1, border_mode=0, interpolation=3, value=[255, 255, 255],
p=.5)],
p=.15),
# alb.InvertImg(p=.15),
alb.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
alb.GaussNoise(10, p=.2),
alb.RandomBrightnessContrast(.05, (-.2, 0), True, p=0.2),
alb.ImageCompression(95, p=.3),
alb.ToGray(always_apply=True),
alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
# alb.Sharpen()
ToTensorV2(),
]
)
def __call__(self, item):
img = self.prepare_input(item, random_padding=True)
if img is None:
return img
return self.transform(image=np.array(img))['image'][:1]
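    # Rough usage sketch (assumes a PIL image `pil_img` is already loaded):
    #
    #   proc = FormulaImageTrainProcessor(image_size=[384, 384])
    #   tensor = proc(pil_img)
    #   # ToTensorV2 yields a (3, 384, 384) float tensor; the [:1] slice keeps
    #   # only the first channel, so `tensor` has shape (1, 384, 384).
    #
    # Because random_padding=True, the formula's position inside the padded
    # canvas varies from call to call.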
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
image_size = cfg.get("image_size", [384, 384])
return cls(
image_size=image_size,
)
@registry.register_processor("formula_image_multi_scale_train")
class FormulaImageMultiScaleTrainProcessor(FormulaImageTrainProcessor):
def __init__(self, all_scales):
for i, scales in enumerate(all_scales):
all_scales[i] = [int(_) for _ in scales]
super(FormulaImageMultiScaleTrainProcessor, self).__init__(all_scales[0])
self.all_scales = all_scales
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
all_scales = cfg.get("all_scales", [[384, 384]])
return cls(
all_scales=all_scales
)
    def reset_scale(self):
        # sample a new target size; subsequent prepare_input calls resize/pad to it
        self.input_size = random.choice(self.all_scales)
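# Hedged sketch of how multi-scale training might use reset_scale (the actual
# dataloader integration lives outside this file; `loader` below is an assumption):
#
#   proc = FormulaImageMultiScaleTrainProcessor.from_config(
#       OmegaConf.create({"all_scales": [[192, 672], [384, 384]]}))
#   for batch in loader:                      # hypothetical dataloader
#       proc.reset_scale()                    # pick one scale for this batch
#       tensors = [proc(img) for img in batch]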
@registry.register_processor("formula_image_eval")
class FormulaImageEvalProcessor(FormulaImageBaseProcessor):
def __init__(self, image_size):
super().__init__(image_size)
self.transform = alb.Compose(
[
alb.ToGray(always_apply=True),
alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
# alb.Sharpen()
ToTensorV2(),
]
)
def __call__(self, item):
image = self.prepare_input(item)
return self.transform(image=np.array(image))['image'][:1]
@classmethod
def from_config(cls, cfg=None):
if cfg is None:
cfg = OmegaConf.create()
image_size = cfg.get("image_size", [384, 384])
return cls(image_size=image_size)
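# Minimal smoke test (a sketch, not part of the library API): builds the eval
# processor from an OmegaConf config and runs a synthetic white image through
# it. The image content below is made up purely for illustration.
if __name__ == "__main__":
    cfg = OmegaConf.create({"image_size": [192, 672]})
    processor = FormulaImageEvalProcessor.from_config(cfg)

    # synthetic "formula": a dark rectangle on a white canvas
    canvas = np.full((100, 400, 3), 255, dtype=np.uint8)
    canvas[40:60, 50:350] = 0
    out = processor(Image.fromarray(canvas))

    print(out.shape)  # expected: torch.Size([1, 192, 672])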