KevinHuSh
remove unused code, separate layout detection out as a new api. Add new rag method 'table' (#55)
407b252
# | |
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
from peewee import Expression | |
from api.db import TenantPermission, FileType, TaskStatus | |
from api.db.db_models import DB, Knowledgebase, Tenant | |
from api.db.db_models import Document | |
from api.db.services.common_service import CommonService | |
from api.db.services.knowledgebase_service import KnowledgebaseService | |
from api.db import StatusEnum | |
class DocumentService(CommonService):
    """Query and bookkeeping helpers for ``Document`` rows.

    All methods are classmethods that operate on ``cls.model`` (``Document``)
    and, where needed, on the related ``Knowledgebase`` / ``Tenant`` models.
    """
    # NOTE(review): `DB` is imported at file level but is not referenced by
    # any method visible here -- confirm whether these queries were meant to
    # run inside an explicit connection scope.
    model = Document

    @classmethod
    def get_by_kb_id(cls, kb_id, page_number, items_per_page,
                     orderby, desc, keywords):
        """Return one page of documents of a knowledge base plus the total.

        Args:
            kb_id: Knowledge base id the documents must belong to.
            page_number: Page index handed to ``paginate``.
            items_per_page: Page size handed to ``paginate``.
            orderby: Field name resolved through ``cls.model.getter_by``.
            desc: Truthy for descending order, falsy for ascending.
            keywords: Optional substring filter on the document name;
                falsy means no name filter.

        Returns:
            ``(rows, count)`` where ``rows`` is a list of dicts for the
            requested page and ``count`` is the number of matching
            documents before pagination.
        """
        if keywords:
            # In an f-string "%%" is two literal percent signs; consecutive
            # LIKE wildcards match the same rows as a single "%".
            docs = cls.model.select().where(
                cls.model.kb_id == kb_id,
                cls.model.name.like(f"%%{keywords}%%"))
        else:
            docs = cls.model.select().where(cls.model.kb_id == kb_id)

        # Count before ordering/pagination so it reflects every match.
        count = docs.count()

        order_field = cls.model.getter_by(orderby)
        docs = docs.order_by(order_field.desc() if desc else order_field.asc())
        docs = docs.paginate(page_number, items_per_page)
        return list(docs.dicts()), count

    @classmethod
    def insert(cls, doc):
        """Save a document and bump ``doc_num`` on its knowledge base.

        Args:
            doc: Mapping of document fields; must contain ``"id"``.

        Returns:
            The document object re-read via ``cls.get_by_id``.

        Raises:
            RuntimeError: If saving or re-reading the document fails, if the
                owning knowledge base cannot be loaded, or if updating the
                knowledge base fails.
        """
        if not cls.save(**doc):
            raise RuntimeError("Database error (Document)!")

        e, doc = cls.get_by_id(doc["id"])
        if not e:
            raise RuntimeError("Database error (Document retrieval)!")

        e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
        if not e:
            # Without this guard a failed lookup would be dereferenced below.
            raise RuntimeError("Database error (Knowledgebase retrieval)!")

        if not KnowledgebaseService.update_by_id(
                kb.id, {"doc_num": kb.doc_num + 1}):
            raise RuntimeError("Database error (Knowledgebase)!")
        return doc

    @classmethod
    def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
        """Return documents that are flagged to run but have no progress yet.

        Selects valid, non-virtual documents with ``progress == 0``,
        ``run == TaskStatus.RUNNING.value`` and ``update_time >= tm``,
        joined with their knowledge base and tenant, oldest update first.

        Args:
            tm: Lower bound (inclusive) for ``update_time``.
            mod: Value the ``create_time``/``comm`` expression must equal.
            comm: Right-hand operand of the ``create_time`` expression.
            items_per_page: Maximum number of rows returned.

        Returns:
            A list of dicts, one per selected document.
        """
        fields = [
            cls.model.id,
            cls.model.kb_id,
            cls.model.parser_id,
            cls.model.parser_config,
            cls.model.name,
            cls.model.type,
            cls.model.location,
            cls.model.size,
            Knowledgebase.tenant_id,
            Tenant.embd_id,
            Tenant.img2txt_id,
            Tenant.asr_id,
            cls.model.update_time,
        ]
        # NOTE(review): the "%%" operator presumably ends up as SQL modulo
        # once the driver un-escapes it, which would shard documents across
        # `comm` workers by create_time -- confirm against the driver in use.
        docs = cls.model.select(*fields) \
            .join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id)) \
            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
            .where(
                cls.model.status == StatusEnum.VALID.value,
                ~(cls.model.type == FileType.VIRTUAL.value),
                cls.model.progress == 0,
                cls.model.update_time >= tm,
                cls.model.run == TaskStatus.RUNNING.value,
                (Expression(cls.model.create_time, "%%", comm) == mod)) \
            .order_by(cls.model.update_time.asc()) \
            .paginate(1, items_per_page)
        return list(docs.dicts())

    @classmethod
    def get_unfinished_docs(cls):
        """Return valid, non-virtual documents with ``0 < progress < 1``.

        Returns:
            A list of dicts holding ``id`` and ``process_begin_at``.
        """
        fields = [cls.model.id, cls.model.process_begin_at]
        docs = cls.model.select(*fields).where(
            cls.model.status == StatusEnum.VALID.value,
            ~(cls.model.type == FileType.VIRTUAL.value),
            cls.model.progress < 1,
            cls.model.progress > 0)
        return list(docs.dicts())

    @classmethod
    def increment_chunk_num(cls, doc_id, kb_id, token_num, chunk_num, duation):
        """Add token/chunk counts to a document and to its knowledge base.

        Args:
            doc_id: Id of the document to update.
            kb_id: Id of the knowledge base to update.
            token_num: Amount added to ``token_num`` on both rows.
            chunk_num: Amount added to ``chunk_num`` on both rows.
            duation: Amount added to the document's ``process_duation``.

        Returns:
            The row count reported by the knowledge base update.

        Raises:
            LookupError: If the document update touched no row.
        """
        num = cls.model.update(
            token_num=cls.model.token_num + token_num,
            chunk_num=cls.model.chunk_num + chunk_num,
            process_duation=cls.model.process_duation + duation,
        ).where(cls.model.id == doc_id).execute()
        if num == 0:
            raise LookupError(
                "Document not found which is supposed to be there")

        num = Knowledgebase.update(
            token_num=Knowledgebase.token_num + token_num,
            chunk_num=Knowledgebase.chunk_num + chunk_num,
        ).where(Knowledgebase.id == kb_id).execute()
        return num

    @classmethod
    def get_tenant_id(cls, doc_id):
        """Return the tenant id owning a document, via its knowledge base.

        Only knowledge bases whose status is ``StatusEnum.VALID.value`` are
        considered.

        Args:
            doc_id: Id of the document to look up.

        Returns:
            The ``tenant_id`` of the first matching row, or ``None`` when
            no row matches.
        """
        docs = cls.model.select(Knowledgebase.tenant_id) \
            .join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)) \
            .where(
                cls.model.id == doc_id,
                Knowledgebase.status == StatusEnum.VALID.value)
        docs = docs.dicts()
        if not docs:
            return None
        return docs[0]["tenant_id"]