Kevin Hu committed
Commit 0129457 · Parent(s): b797251

Upgrades Document Layout Analysis model. (#4054)

### What problem does this PR solve?

#4052

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
api/db/services/task_service.py CHANGED

```diff
@@ -247,8 +247,8 @@ def queue_tasks(doc: dict, bucket: str, name: str):
         task["progress"] = 0.0
 
     prev_tasks = TaskService.get_tasks(doc["id"])
+    ck_num = 0
     if prev_tasks:
-        ck_num = 0
         for task in tsks:
             ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
         TaskService.filter_delete([Task.doc_id == doc["id"]])
@@ -258,7 +258,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
                 chunk_ids.extend(task["chunk_ids"].split())
         if chunk_ids:
             settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
-
+    DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
 
     bulk_insert_into_db(Task, tsks, True)
     DocumentService.begin2parse(doc["id"])
```
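The first hunk hoists `ck_num = 0` above the `if prev_tasks:` guard because the second hunk adds an unconditional `DocumentService.update_by_id(...)` that reads the counter: had the initialization stayed inside the guard, a document with no previous tasks would hit an unbound local. A minimal sketch of the failure mode (the helper names are illustrative, not from the PR):

```python
# Minimal sketch of why the counter must be hoisted (illustrative names).
def chunk_count_buggy(prev_tasks):
    if prev_tasks:
        ck_num = 0
        for _ in prev_tasks:
            ck_num += 1
    return ck_num  # UnboundLocalError when prev_tasks is empty


def chunk_count_fixed(prev_tasks):
    ck_num = 0  # always bound, mirroring the hoisted initialization in the diff
    if prev_tasks:
        for _ in prev_tasks:
            ck_num += 1
    return ck_num


assert chunk_count_fixed([]) == 0
```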
conf/infinity_mapping.json CHANGED

```diff
@@ -16,6 +16,8 @@
   "content_with_weight": {"type": "varchar", "default": ""},
   "content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+  "authors_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+  "authors_sm_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "page_num_int": {"type": "varchar", "default": ""},
   "top_int": {"type": "varchar", "default": ""},
   "position_int": {"type": "varchar", "default": ""},
```
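The two new columns mirror the existing `content_ltks`/`content_sm_ltks` pair: a coarse and a fine-grained token field, both declared with the `whitespace` analyzer, so whatever writes them is expected to store pre-tokenized, space-joined strings. A hypothetical sketch of how the ingestion side might populate them (the `rag_tokenizer` calls are an assumption, not part of this diff):

```python
# Hypothetical sketch; the field names come from the mapping above, while the
# tokenizer helpers are assumed to behave like RAGFlow's content tokenization.
from rag.nlp import rag_tokenizer


def add_author_fields(chunk: dict, authors: str) -> dict:
    # Store space-joined tokens so the "whitespace" analyzer can index them.
    chunk["authors_tks"] = rag_tokenizer.tokenize(authors)
    chunk["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(chunk["authors_tks"])
    return chunk
```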
deepdoc/vision/__init__.py CHANGED

```diff
@@ -15,9 +15,10 @@ import pdfplumber
 
 from .ocr import OCR
 from .recognizer import Recognizer
-from .layout_recognizer import LayoutRecognizer
+from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer
 
 
+
 def init_in_out(args):
     from PIL import Image
     import os
```
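Re-exporting `LayoutRecognizer4YOLOv10` under the old name makes the model swap invisible to callers: any code that imported `LayoutRecognizer` from `deepdoc.vision` now constructs the YOLOv10-backed recognizer without changes. A sketch of an unchanged call site (construction details beyond the class name are assumptions):

```python
# Call sites keep the old import path and class name; only the implementation
# behind the alias changed.
from deepdoc.vision import LayoutRecognizer

recognizer = LayoutRecognizer("layout")  # __init__ forces domain to "layout" anyway
```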
deepdoc/vision/layout_recognizer.py CHANGED

```diff
@@ -14,11 +14,14 @@ import os
 import re
 from collections import Counter
 from copy import deepcopy
+
+import cv2
 import numpy as np
 from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import Recognizer
+from deepdoc.vision.operators import nms
 
 
 class LayoutRecognizer(Recognizer):
@@ -149,3 +152,88 @@ class LayoutRecognizer(Recognizer):
 
         ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
         return ocr_res, page_layout
+
+
+class LayoutRecognizer4YOLOv10(LayoutRecognizer):
+    labels = [
+        "title",
+        "Text",
+        "Reference",
+        "Figure",
+        "Figure caption",
+        "Table",
+        "Table caption",
+        "Table caption",
+        "Equation",
+        "Figure caption",
+    ]
+
+    def __init__(self, domain):
+        domain = "layout"
+        super().__init__(domain)
+        self.auto = False
+        self.scaleFill = False
+        self.scaleup = True
+        self.stride = 32
+        self.center = True
+
+    def preprocess(self, image_list):
+        inputs = []
+        new_shape = self.input_shape  # height, width
+        for img in image_list:
+            shape = img.shape[:2]  # current shape [height, width]
+            # Scale ratio (new / old)
+            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+            # Compute padding
+            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+            ww, hh = new_unpad
+            img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+            img = cv2.copyMakeBorder(
+                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+            )  # add border
+            img /= 255.0
+            img = img.transpose(2, 0, 1)
+            img = img[np.newaxis, :, :, :].astype(np.float32)
+            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1] / ww, shape[0] / hh, dw, dh]})
+
+        return inputs
+
+    def postprocess(self, boxes, inputs, thr):
+        thr = 0.08
+        boxes = np.squeeze(boxes)
+        scores = boxes[:, 4]
+        boxes = boxes[scores > thr, :]
+        scores = scores[scores > thr]
+        if len(boxes) == 0:
+            return []
+        class_ids = boxes[:, -1].astype(int)
+        boxes = boxes[:, :4]
+        boxes[:, 0] -= inputs["scale_factor"][2]
+        boxes[:, 2] -= inputs["scale_factor"][2]
+        boxes[:, 1] -= inputs["scale_factor"][3]
+        boxes[:, 3] -= inputs["scale_factor"][3]
+        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
+                                inputs["scale_factor"][1]])
+        boxes = np.multiply(boxes, input_shape, dtype=np.float32)
+
+        unique_class_ids = np.unique(class_ids)
+        indices = []
+        for class_id in unique_class_ids:
+            class_indices = np.where(class_ids == class_id)[0]
+            class_boxes = boxes[class_indices, :]
+            class_scores = scores[class_indices]
+            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
+            indices.extend(class_indices[class_keep_boxes])
+
+        return [{
+            "type": self.label_list[class_ids[i]].lower(),
+            "bbox": [float(t) for t in boxes[i].tolist()],
+            "score": float(scores[i])
+        } for i in indices]
```
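The new subclass swaps in YOLO-style letterbox preprocessing: each page image is scaled by a single ratio `r`, then padded with gray (114, 114, 114) borders split evenly on both sides to reach the model's input shape, and `scale_factor` records everything `postprocess` needs to undo this (per-axis rescale factors plus the `dw`/`dh` padding offsets). `postprocess` then filters by confidence (the `thr` argument is overridden with a hard-coded 0.08) and runs NMS per class, so a table box never suppresses an overlapping caption box; the repeated caption entries in `labels` map several model class ids onto the same layout type. A self-contained sketch of the coordinate round trip, assuming a 1024x1024 input shape (the real shape comes from the ONNX model):

```python
# Letterbox geometry sketch (sizes assumed; mirrors preprocess/postprocess).
new_h, new_w = 1024, 1024   # model input shape (height, width)
h, w = 800, 600             # original page image

r = min(new_h / h, new_w / w)                  # uniform scale ratio
ww, hh = int(round(w * r)), int(round(h * r))  # resized, unpadded size
dw, dh = (new_w - ww) / 2, (new_h - hh) / 2    # padding split on both sides
scale_factor = [w / ww, h / hh, dw, dh]        # what preprocess stores

# postprocess maps a detection back by removing the pad, then rescaling:
x1_let, y1_let = 300.0, 200.0                  # a corner in letterboxed coords
x1 = (x1_let - scale_factor[2]) * scale_factor[0]
y1 = (y1_let - scale_factor[3]) * scale_factor[1]

# Sanity check: the padded content region's edges map back to the image edges.
assert abs((dw - dw) * scale_factor[0]) < 1e-6
assert abs((dw + ww - dw) * scale_factor[0] - w) < 1e-6
print(f"r={r:.3f}, pad=({dw:.1f}, {dh:.1f}) -> ({x1:.2f}, {y1:.2f})")
```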
deepdoc/vision/operators.py CHANGED

```diff
@@ -709,3 +709,29 @@ def preprocess(im, preprocess_ops):
     for operator in preprocess_ops:
         im, im_info = operator(im, im_info)
     return im, im_info
+
+
+def nms(bboxes, scores, iou_thresh):
+    import numpy as np
+    x1 = bboxes[:, 0]
+    y1 = bboxes[:, 1]
+    x2 = bboxes[:, 2]
+    y2 = bboxes[:, 3]
+    areas = (y2 - y1) * (x2 - x1)
+
+    indices = []
+    index = scores.argsort()[::-1]
+    while index.size > 0:
+        i = index[0]
+        indices.append(i)
+        x11 = np.maximum(x1[i], x1[index[1:]])
+        y11 = np.maximum(y1[i], y1[index[1:]])
+        x22 = np.minimum(x2[i], x2[index[1:]])
+        y22 = np.minimum(y2[i], y2[index[1:]])
+        w = np.maximum(0, x22 - x11 + 1)
+        h = np.maximum(0, y22 - y11 + 1)
+        overlaps = w * h
+        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
+        idx = np.where(ious <= iou_thresh)[0]
+        index = index[idx + 1]
+    return indices
```
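The helper is classic greedy IoU-based NMS: boxes are visited in descending score order, and each kept box suppresses any remaining box whose IoU with it exceeds the threshold (the `+ 1` in the width/height follows the inclusive integer-pixel convention). A small usage sketch with made-up boxes:

```python
# Two heavily overlapping boxes plus a distant one; greedy NMS keeps the
# higher-scoring overlap and the disjoint box.
import numpy as np

from deepdoc.vision.operators import nms

bboxes = np.array([
    [0.0, 0.0, 100.0, 100.0],     # box A
    [5.0, 5.0, 105.0, 105.0],     # box B: IoU with A is ~0.85, above threshold
    [300.0, 300.0, 400.0, 400.0], # box C: disjoint from A and B
])
scores = np.array([0.9, 0.8, 0.7])

keep = nms(bboxes, scores, 0.45)
print([int(i) for i in keep])  # [0, 2]: B is suppressed by A, C survives
```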