KevinHuSh committed · Commit 3cefaa0 · 1 Parent(s): b0577d6

enlarge docker memory usage (#501)

### What problem does this PR solve?

### Type of change

- [x] Refactoring

Files changed:

- deepdoc/parser/pdf_parser.py +8 -9
- docker/.env +3 -1
- docker/docker-compose-base.yml +17 -17
- docker/entrypoint.sh +1 -1
- rag/app/book.py +1 -1
- rag/app/laws.py +1 -1
- rag/app/manual.py +1 -1
- rag/app/naive.py +3 -3
- rag/app/one.py +1 -1
- rag/app/paper.py +1 -1
- rag/svr/task_broker.py +5 -0
- rag/svr/task_executor.py +8 -0
deepdoc/parser/pdf_parser.py CHANGED

@@ -11,7 +11,7 @@ import pdfplumber
 import logging
 from PIL import Image, ImageDraw
 import numpy as np
-
+from timeit import default_timer as timer
 from PyPDF2 import PdfReader as pdf2_read

 from api.utils.file_utils import get_project_base_directory

@@ -936,6 +936,7 @@ class HuParser:
         self.page_cum_height = [0]
         self.page_layout = []
         self.page_from = page_from
+        st = timer()
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(
                 fnm, str) else pdfplumber.open(BytesIO(fnm))

@@ -989,7 +990,9 @@ class HuParser:
             self.is_english = True
         else:
             self.is_english = False
+        self.is_english = False

+        st = timer()
         for i, img in enumerate(self.page_images):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(

@@ -1007,15 +1010,11 @@ class HuParser:
                         chars[j]["width"]) / 2:
                     chars[j]["text"] += " "
                 j += 1
-
-            # if not chars:
-            #     self.page_cum_height.append(img.size[1] / zoomin)
-            # else:
-            #     self.page_cum_height.append(
-            #         np.max([c["bottom"] for c in chars]))
+
             self.__ocr(i + 1, img, chars, zoomin)
-            if callback:
-                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
+            #if callback:
+            #    callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
+            #print("OCR:", timer()-st)

         if not self.is_english and not any(
                 [c for c in self.page_chars]) and self.boxes:
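The instrumentation added above follows one pattern throughout this PR: record `timer()` before a slow phase, then print or log the difference after it. A minimal, self-contained sketch of that pattern (`slow_phase` is a hypothetical stand-in for `pdfplumber.open(...)` or the per-page OCR loop):

```python
from timeit import default_timer as timer  # same import the diff adds; a monotonic clock

def slow_phase():
    # hypothetical stand-in for the work being measured
    sum(range(10**6))

st = timer()
slow_phase()
print("elapsed:", timer() - st)  # elapsed wall-clock time in seconds
```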
docker/.env CHANGED

@@ -11,7 +11,9 @@ ES_PORT=1200
 KIBANA_PORT=6601

 # Increase or decrease based on the available host memory (in bytes)
-
+
+MEM_LIMIT=8073741824
+

 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455
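For scale, the new limit works out to roughly 7.5 GiB per container; a quick check of the arithmetic, assuming binary (GiB) units:

```python
MEM_LIMIT = 8073741824        # bytes, as set in docker/.env
print(MEM_LIMIT / 2**30)      # ≈ 7.52 GiB
```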
docker/docker-compose-base.yml CHANGED

@@ -29,23 +29,23 @@ services:
     - ragflow
     restart: always

-  kibana:
-    depends_on:
-      es01:
-        condition: service_healthy
-    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-    container_name: ragflow-kibana
-    volumes:
-      - kibanadata:/usr/share/kibana/data
-    ports:
-      - ${KIBANA_PORT}:5601
-    environment:
-      - SERVERNAME=kibana
-      - ELASTICSEARCH_HOSTS=http://es01:9200
-      - TZ=${TIMEZONE}
-    mem_limit: ${MEM_LIMIT}
-    networks:
-      - ragflow
+  #kibana:
+  #  depends_on:
+  #    es01:
+  #      condition: service_healthy
+  #  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
+  #  container_name: ragflow-kibana
+  #  volumes:
+  #    - kibanadata:/usr/share/kibana/data
+  #  ports:
+  #    - ${KIBANA_PORT}:5601
+  #  environment:
+  #    - SERVERNAME=kibana
+  #    - ELASTICSEARCH_HOSTS=http://es01:9200
+  #    - TZ=${TIMEZONE}
+  #  mem_limit: ${MEM_LIMIT}
+  #  networks:
+  #    - ragflow

   mysql:
     image: mysql:5.7.18
docker/entrypoint.sh CHANGED

@@ -29,7 +29,7 @@ function task_bro(){

 task_bro &

-WS=
+WS=1
 for ((i=0;i<WS;i++))
 do
   task_exe $i $WS &
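`WS` is the number of `task_exe` workers the loop forks, and each worker receives its index `i` and the total `WS`, the usual inputs for sharding a queue by modulo. A hedged Python sketch of that idea (the partitioning function is illustrative, not the project's actual dispatch logic):

```python
def tasks_for_worker(tasks, i, ws):
    """Worker i of ws claims every ws-th task, so no two workers overlap."""
    return [t for n, t in enumerate(tasks) if n % ws == i]

# With WS=1, the single worker (i=0) claims the whole queue:
print(tasks_for_worker(["t0", "t1", "t2", "t3"], 0, 1))
```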
rag/app/book.py CHANGED

@@ -37,7 +37,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        print("
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
rag/app/laws.py CHANGED

@@ -71,7 +71,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        cron_logger.info("
+        cron_logger.info("layouts:".format(
             (timer() - start) / (self.total_page + 0.1)))
         self._naive_vertical_merge()
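One caveat in the new line above: `"layouts:".format(...)` has no `{}` placeholder, so `str.format` silently discards the computed per-page time and only the literal prefix is logged; compare the placeholder form used in rag/app/naive.py below:

```python
print("layouts:".format(0.42))      # -> "layouts:"  (argument silently dropped)
print("layouts: {}".format(0.42))   # -> "layouts: 0.42"
```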
rag/app/manual.py CHANGED

@@ -32,7 +32,7 @@ class Pdf(PdfParser):

         self._layouts_rec(zoomin)
         callback(0.65, "Layout analysis finished.")
-        print("
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.67, "Table analysis finished.")
         self._text_merge()
rag/app/naive.py CHANGED

@@ -77,12 +77,12 @@ class Pdf(PdfParser):
             callback
         )
         callback(msg="OCR finished")
-        cron_logger.info("OCR: {}".format(timer() - start))
+        cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))

         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished.")
-        print("
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.65, "Table analysis finished.")
         self._text_merge()

@@ -92,7 +92,7 @@ class Pdf(PdfParser):
         self._concat_downward()
         #self._filter_forpages()

-        cron_logger.info("
+        cron_logger.info("layouts: {}".format(
             (timer() - start) / (self.total_page + 0.1)))
         return [(b["text"], self._line_tag(b, zoomin))
                 for b in self.boxes], tbls
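Unlike the flat elapsed times elsewhere in this PR, the second hunk divides layout time by `self.total_page + 0.1`, logging an approximate per-page cost; the `+ 0.1` keeps the division finite when a document reports zero pages:

```python
elapsed, total_page = 4.2, 0           # zero-page edge case
print(elapsed / (total_page + 0.1))    # stays finite instead of raising ZeroDivisionError
```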
rag/app/one.py CHANGED

@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin, drop=False)
         callback(0.63, "Layout analysis finished.")
-        print("
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.65, "Table analysis finished.")
         self._text_merge()
rag/app/paper.py CHANGED

@@ -42,7 +42,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.63, "Layout analysis finished")
-        print("
+        print("layouts:", timer() - start)
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()
rag/svr/task_broker.py CHANGED

@@ -33,6 +33,8 @@ from api.settings import database_logger
 from api.utils import get_format_time, get_uuid
 from api.utils.file_utils import get_project_base_directory
 from rag.utils.redis_conn import REDIS_CONN
+from api.db.db_models import init_database_tables as init_web_db
+from api.db.init_data import init_web_data


 def collect(tm):

@@ -181,6 +183,9 @@ if __name__ == "__main__":
     peewee_logger.propagate = False
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)
+    # init db
+    init_web_db()
+    init_web_data()

     while True:
         dispatch()
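Running the table/data initializers before entering the dispatch loop means a fresh deployment can start the broker without a separate setup step; that only works if the initializers are idempotent, which this sketch of the startup ordering assumes (stubs stand in for the imported helpers):

```python
def init_web_db():
    """Stub for api.db.db_models.init_database_tables: create tables if absent."""
    print("tables ensured")

def init_web_data():
    """Stub for api.db.init_data.init_web_data: seed default rows."""
    print("default rows seeded")

init_web_db()
init_web_data()
# ...only then would the broker enter its `while True: dispatch()` loop.
```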
rag/svr/task_executor.py CHANGED

@@ -163,6 +163,7 @@ def build(row):
         "doc_id": row["doc_id"],
         "kb_id": [str(row["kb_id"])]
     }
+    el = 0
     for ck in cks:
         d = copy.deepcopy(doc)
         d.update(ck)

@@ -182,10 +183,13 @@ def build(row):
         else:
             d["image"].save(output_buffer, format='JPEG')

+        st = timer()
         MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
+        el += timer() - st
         d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
         del d["image"]
         docs.append(d)
+    cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))

     return docs


@@ -258,7 +262,9 @@ def main(comm, mod):
             callback(prog=-1, msg=str(e))
             continue

+        st = timer()
         cks = build(r)
+        cron_logger.info("Build chunks({}): {}".format(r["name"], timer()-st))
         if cks is None:
             continue
         if not cks:

@@ -277,12 +283,14 @@ def main(comm, mod):
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
             tk_count = 0
+        cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer()-st))

         callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
         init_kb(r)
         chunk_count = len(set([c["_id"] for c in cks]))
         st = timer()
         es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
+        cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer()-st))
         if es_r:
             callback(-1, "Index failure!")
             ELASTICSEARCH.deleteByQuery(
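Within `build`, the upload timing is accumulated across iterations (`el += timer() - st`) rather than measured around the whole loop, so the logged `MINIO PUT` figure counts only time spent in `MINIO.put`, excluding image encoding and dict bookkeeping. A self-contained sketch of that accumulator pattern (`upload` is a hypothetical stand-in for `MINIO.put`):

```python
from timeit import default_timer as timer

def upload(chunk):
    """Hypothetical stand-in for MINIO.put(kb_id, doc_id, payload)."""
    pass

el = 0.0
for ck in ["c1", "c2", "c3"]:
    st = timer()
    upload(ck)               # timed region: the upload only
    el += timer() - st       # everything outside this window is excluded
print("MINIO PUT total: {}".format(el))
```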