cecilia-uu committed
Commit 061aa4e · 1 Parent(s): 2653e84

API: created list_doc (#1327)


### What problem does this PR solve?

Adds the API for listing the documents in a dataset.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
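
A minimal sketch of the new API in use, assuming a running RAGFlow server, a valid API key, and that the SDK below is importable as `ragflow.ragflow` (the key and host values are placeholders):

```python
from ragflow.ragflow import RAGFlow

API_KEY = "your-api-key"                # placeholder credential
HOST_ADDRESS = "http://127.0.0.1:9380"  # placeholder server address

client = RAGFlow(API_KEY, HOST_ADDRESS)
dataset_id = client.create_dataset("demo")["data"]["dataset_id"]

# New in this PR: list the documents of a dataset.
response = client.list_files(dataset_id, offset=0, count=-1,
                             order_by="create_time", descend=True, keywords="")
print(response["data"]["total"], response["data"]["docs"])
```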

api/apps/dataset_api.py CHANGED
@@ -13,13 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import re
+import warnings
 
 from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
 from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileSource, StatusEnum
+from api.db import FileType, ParserType, FileSource
+from api.db import StatusEnum
 from api.db.db_models import File
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
@@ -29,8 +33,12 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
+from api.utils.api_utils import construct_json_result, construct_error_response
+from api.utils.api_utils import construct_result, validate_request
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
 
+MAXIMUM_OF_UPLOADING_FILES = 256
 
 # ------------------------------ create a dataset ---------------------------------------
 
@@ -253,3 +261,216 @@ def update_dataset(dataset_id):
         return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
     except Exception as e:
         return construct_error_response(e)
+
+# --------------------------------content management ----------------------------------------------
+
+# ----------------------------upload files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['POST'])
+@login_required
+def upload_documents(dataset_id):
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
+                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        # the content of the file
+        file_content = file_obj.read()
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")
+
+        # the content is empty, raising a warning
+        if file_content == b'':
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create '.knowledgebase' file
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # grab all the errs
+    err = []
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    uploaded_docs_json = []
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceed the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to the minio
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+            uploaded_docs_json.append(doc)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
+
+
+# ----------------------------delete a file-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/<document_id>', methods=['DELETE'])
+@login_required
+def delete_document(document_id, dataset_id):  # string
+    # get the root folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # parent file's id
+    parent_file_id = root_folder["id"]
+    # consider the new user
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # store all the errors that may occur
+    errors = ""
+    try:
+        # check whether this document exists
+        exist, doc = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
+        # check whether this tenant is authorized for this doc
+        tenant_id = DocumentService.get_tenant_id(document_id)
+        if not tenant_id:
+            return construct_json_result(
+                message=f"You cannot delete this document {document_id} due to the authorization"
+                        f" reason!", code=RetCode.AUTHENTICATION_ERROR)
+
+        # get the doc's id and location
+        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
+
+        if real_dataset_id != dataset_id:
+            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
+                                         f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
+
+        # there is an issue when removing
+        if not DocumentService.remove_document(doc, tenant_id):
+            return construct_json_result(
+                message="There was an error during the document removal process. Please check the status of the "
+                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
+
+        # fetch the File2Document record associated with the provided document ID.
+        file_to_doc = File2DocumentService.get_by_document_id(document_id)
+        # delete the associated File record.
+        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
+        # delete the File2Document record itself using the document ID. This removes the
+        # association between the document and the file after the File record has been deleted.
+        File2DocumentService.delete_by_document_id(document_id)
+
+        # delete it from minio
+        MINIO.rm(dataset_id, location)
+    except Exception as e:
+        errors += str(e)
+    if errors:
+        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
+
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+
+# ----------------------------list files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['GET'])
+@login_required
+def list_documents(dataset_id):
+    if not dataset_id:
+        return construct_json_result(
+            data=False, message='Lack of "dataset_id"', code=RetCode.ARGUMENT_ERROR)
+
+    # searching keywords
+    keywords = request.args.get("keywords", "")
+
+    offset = request.args.get("offset", 0)
+    count = request.args.get("count", -1)
+    order_by = request.args.get("order_by", "create_time")
+    descend = request.args.get("descend", True)
+    try:
+        docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by,
+                                                                descend, keywords)
+
+        return construct_json_result(data={"total": total, "docs": docs}, code=RetCode.SUCCESS)
+    except Exception as e:
+        return construct_error_response(e)
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
+
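
For reference, the endpoints added above can also be exercised directly over HTTP. A sketch with plain `requests`, assuming the blueprint is mounted under `/api/v1/dataset` (as the SDK changes below suggest); `base_url`, the key, and the IDs are placeholders, not part of the diff:

```python
import requests

base_url = "http://127.0.0.1:9380/api/v1"    # placeholder deployment URL
headers = {"Authorization": "your-api-key"}  # same header the SDK sends
dataset_id = "your-dataset-id"               # placeholder

# POST /dataset/<dataset_id>/documents/ uploads files (multipart field name: 'file').
with open("test.txt", "rb") as f:
    upload = requests.post(f"{base_url}/dataset/{dataset_id}/documents/",
                           files=[("file", f)], headers=headers)

# GET /dataset/<dataset_id>/documents/ lists documents; every query parameter is optional.
listing = requests.get(f"{base_url}/dataset/{dataset_id}/documents/",
                       params={"offset": 0, "count": -1, "keywords": ""},
                       headers=headers)
document_id = listing.json()["data"]["docs"][0]["id"]

# DELETE /dataset/<dataset_id>/documents/<document_id> removes one document.
deleted = requests.delete(f"{base_url}/dataset/{dataset_id}/documents/{document_id}",
                          headers=headers)
```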
api/apps/documents_api.py DELETED
@@ -1,228 +0,0 @@
-#
-# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-#
-
-import os
-import re
-import warnings
-
-from flask import request
-from flask_login import login_required, current_user
-
-from api.db import FileType, ParserType
-from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.settings import RetCode
-from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-from api.db.db_models import Task, File
-from api.db import FileType, TaskStatus, ParserType, FileSource
-
-
-MAXIMUM_OF_UPLOADING_FILES = 256
-
-
-# ----------------------------upload local files-----------------------------------------------------
-@manager.route('/<dataset_id>', methods=['POST'])
-@login_required
-def upload(dataset_id):
-    # no files
-    if not request.files:
-        return construct_json_result(
-            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
-
-    # the number of uploading files exceeds the limit
-    file_objs = request.files.getlist('file')
-    num_file_objs = len(file_objs)
-
-    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
-        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
-                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
-
-    for file_obj in file_objs:
-        # the content of the file
-        file_content = file_obj.read()
-        file_name = file_obj.filename
-        # no name
-        if not file_name:
-            return construct_json_result(
-                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
-
-        # TODO: support the remote files
-        if 'http' in file_name:
-            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
-
-        # the content is empty, raising a warning
-        if file_content == b'':
-            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
-
-    # no dataset
-    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
-    if not exist:
-        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
-
-    # get the root_folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # get the id of the root_folder
-    parent_file_id = root_folder["id"]  # document id
-    # this is for the new user, create '.knowledgebase' file
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # go inside this folder, get the kb_root_folder
-    kb_root_folder = FileService.get_kb_folder(current_user.id)
-    # link the file management to the kb_folder
-    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
-
-    # grab all the errs
-    err = []
-    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
-    uploaded_docs_json = []
-    for file in file_objs:
-        try:
-            # TODO: get this value from the database as some tenants have this limit while others don't
-            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="Exceed the maximum file number of a free user!")
-            # deal with the duplicate name
-            filename = duplicate_name(
-                DocumentService.query,
-                name=file.filename,
-                kb_id=dataset.id)
-
-            # deal with the unsupported type
-            filetype = filename_type(filename)
-            if filetype == FileType.OTHER.value:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="This type of file has not been supported yet!")
-
-            # upload to the minio
-            location = filename
-            while MINIO.obj_exist(dataset_id, location):
-                location += "_"
-            blob = file.read()
-            MINIO.put(dataset_id, location, blob)
-            doc = {
-                "id": get_uuid(),
-                "kb_id": dataset.id,
-                "parser_id": dataset.parser_id,
-                "parser_config": dataset.parser_config,
-                "created_by": current_user.id,
-                "type": filetype,
-                "name": filename,
-                "location": location,
-                "size": len(blob),
-                "thumbnail": thumbnail(filename, blob)
-            }
-            if doc["type"] == FileType.VISUAL:
-                doc["parser_id"] = ParserType.PICTURE.value
-            if re.search(r"\.(ppt|pptx|pages)$", filename):
-                doc["parser_id"] = ParserType.PRESENTATION.value
-            DocumentService.insert(doc)
-
-            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
-            uploaded_docs_json.append(doc)
-        except Exception as e:
-            err.append(file.filename + ": " + str(e))
-
-    if err:
-        # return all the errors
-        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
-    # success
-    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
-
-# ----------------------------delete a file-----------------------------------------------------
-@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
-@login_required
-def delete(document_id, dataset_id):  # string
-    # get the root folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # parent file's id
-    parent_file_id = root_folder["id"]
-    # consider the new user
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # store all the errors that may have
-    errors = ""
-    try:
-        # whether there is this document
-        exist, doc = DocumentService.get_by_id(document_id)
-        if not exist:
-            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
-        # whether this doc is authorized by this tenant
-        tenant_id = DocumentService.get_tenant_id(document_id)
-        if not tenant_id:
-            return construct_json_result(message=f"You cannot delete this document {document_id} due to the authorization"
-                                         f" reason!", code=RetCode.AUTHENTICATION_ERROR)
-
-        # get the doc's id and location
-        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
-
-        if real_dataset_id != dataset_id:
-            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
-                                         f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
-
-        # there is an issue when removing
-        if not DocumentService.remove_document(doc, tenant_id):
-            return construct_json_result(
-                message="There was an error during the document removal process. Please check the status of the "
-                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
-
-        # fetch the File2Document record associated with the provided document ID.
-        file_to_doc = File2DocumentService.get_by_document_id(document_id)
-        # delete the associated File record.
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
-        # delete the File2Document record itself using the document ID. This removes the
-        # association between the document and the file after the File record has been deleted.
-        File2DocumentService.delete_by_document_id(document_id)
-
-        # delete it from minio
-        MINIO.rm(dataset_id, location)
-    except Exception as e:
-        errors += str(e)
-    if errors:
-        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
-
-    return construct_json_result(data=True, code=RetCode.SUCCESS)
-
-# ----------------------------upload online files------------------------------------------------
-
-# ----------------------------download a file-----------------------------------------------------
-
-# ----------------------------enable rename-----------------------------------------------------
-
-# ----------------------------list files-----------------------------------------------------
-
-# ----------------------------start parsing-----------------------------------------------------
-
-# ----------------------------stop parsing-----------------------------------------------------
-
-# ----------------------------show the status of the file-----------------------------------------------------
-
-# ----------------------------list the chunks of the file-----------------------------------------------------
-
-# ----------------------------delete the chunk-----------------------------------------------------
-
-# ----------------------------edit the status of the chunk-----------------------------------------------------
-
-# ----------------------------insert a new chunk-----------------------------------------------------
-
-# ----------------------------upload a file-----------------------------------------------------
-
-# ----------------------------get a specific chunk-----------------------------------------------------
-
-# ----------------------------retrieval test-----------------------------------------------------
api/db/services/document_service.py CHANGED
@@ -59,6 +59,35 @@ class DocumentService(CommonService):
 
         return list(docs.dicts()), count
 
+    @classmethod
+    @DB.connection_context()
+    def list_documents_in_dataset(cls, dataset_id, offset, count, order_by, descend, keywords):
+        if keywords:
+            docs = cls.model.select().where(
+                (cls.model.kb_id == dataset_id),
+                (fn.LOWER(cls.model.name).contains(keywords.lower()))
+            )
+        else:
+            docs = cls.model.select().where(cls.model.kb_id == dataset_id)
+
+        total = docs.count()
+
+        if descend == 'True':
+            docs = docs.order_by(cls.model.getter_by(order_by).desc())
+        if descend == 'False':
+            docs = docs.order_by(cls.model.getter_by(order_by).asc())
+
+        docs = list(docs.dicts())
+        docs_length = len(docs)
+
+        if offset < 0 or offset > docs_length:
+            raise IndexError("Offset is out of the valid range.")
+
+        if count == -1:
+            return docs[offset:], total
+
+        return docs[offset:offset + count], total
+
     @classmethod
     @DB.connection_context()
     def insert(cls, doc):
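
The pagination contract of `list_documents_in_dataset` mirrors the `KnowledgebaseService` change below: `offset` must lie within the result list, and `count == -1` means "everything from the offset onward". A pure-Python sketch of that slicing rule, using a stand-in list instead of a real query:

```python
def paginate(docs, offset, count):
    # Same rule as list_documents_in_dataset: validate the offset first,
    # then treat count == -1 as "no upper bound".
    if offset < 0 or offset > len(docs):
        raise IndexError("Offset is out of the valid range.")
    if count == -1:
        return docs[offset:]
    return docs[offset:offset + count]

docs = list(range(10))
assert paginate(docs, 2, 3) == [2, 3, 4]            # a normal page
assert paginate(docs, 2, -1) == list(range(2, 10))  # the rest of the list
```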
api/db/services/knowledgebase_service.py CHANGED
@@ -60,6 +60,9 @@ class KnowledgebaseService(CommonService):
         if offset < 0 or offset > kbs_length:
             raise IndexError("Offset is out of the valid range.")
 
+        if count == -1:
+            return kbs[offset:]
+
         return kbs[offset:offset+count]
 
     @classmethod
docs/references/ragflow_api.md CHANGED
@@ -274,4 +274,6 @@ You are required to input at least one parameter.
     "code": 102,
     "message": "Please input at least one parameter that you want to update!"
 }
-```
+```
+
+
sdk/python/ragflow/ragflow.py CHANGED
@@ -26,12 +26,11 @@ class RAGFlow:
         '''
        api_url: http://<host_address>/api/v1
        dataset_url: http://<host_address>/api/v1/dataset
-       document_url: http://<host_address>/api/v1/documents
+       document_url: http://<host_address>/api/v1/dataset/{dataset_id}/documents
         '''
         self.user_key = user_key
         self.api_url = f"{base_url}/api/{version}"
         self.dataset_url = f"{self.api_url}/dataset"
-        self.document_url = f"{self.api_url}/documents"
         self.authorization_header = {"Authorization": "{}".format(self.user_key)}
 
     def create_dataset(self, dataset_name):
@@ -79,7 +78,7 @@
         response = requests.put(endpoint, json=params, headers=self.authorization_header)
         return response.json()
 
-# -------------------- content management -----------------------------------------------------
+    # -------------------- content management -----------------------------------------------------
 
     # ----------------------------upload local files-----------------------------------------------------
     def upload_local_file(self, dataset_id, file_paths):
@@ -95,7 +94,7 @@
             else:
                 return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
 
-        res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+        res = requests.request('POST', url=f"{self.dataset_url}/{dataset_id}/documents", files=files,
                                headers=self.authorization_header)
 
         result_dict = json.loads(res.text)
@@ -103,16 +102,27 @@
 
     # ----------------------------delete a file-----------------------------------------------------
     def delete_files(self, document_id, dataset_id):
-        endpoint = f"{self.document_url}/{dataset_id}/{document_id}"
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
         res = requests.delete(endpoint, headers=self.authorization_header)
         return res.json()
 
+    # ----------------------------list files-----------------------------------------------------
+    def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", descend=True, keywords=""):
+        params = {
+            "offset": offset,
+            "count": count,
+            "order_by": order_by,
+            "descend": descend,
+            "keywords": keywords
+        }
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/"
+        res = requests.get(endpoint, params=params, headers=self.authorization_header)
+        return res.json()
+
     # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
-    # ----------------------------list files-----------------------------------------------------
-
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
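
A hedged sketch of the new `list_files` options (placeholder key and host; note that `descend` travels as a query parameter, so the server receives the strings "True"/"False" that `requests` produces for booleans, which is what the service layer matches on):

```python
from ragflow.ragflow import RAGFlow

API_KEY = "your-api-key"                # placeholder
HOST_ADDRESS = "http://127.0.0.1:9380"  # placeholder

client = RAGFlow(API_KEY, HOST_ADDRESS)
dataset_id = client.create_dataset("sdk-demo")["data"]["dataset_id"]

# Oldest documents first: descend=False is serialized to the string "False",
# which list_documents_in_dataset matches to apply ascending order.
oldest_first = client.list_files(dataset_id, order_by="create_time", descend=False)

# keywords performs a case-insensitive substring match on the document name.
matches = client.list_files(dataset_id, keywords="report")
```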
sdk/python/test/test_document.py CHANGED
@@ -37,7 +37,7 @@ class TestFile(TestSdk):
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt", "test_data/test1.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_one_file(self):
         """
@@ -48,7 +48,7 @@ class TestFile(TestSdk):
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_nonexistent_files(self):
         """
@@ -237,12 +237,143 @@ class TestFile(TestSdk):
         assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                 f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.')
 
+    # ----------------------------list files-----------------------------------------------------
+    def test_list_documents_with_success(self):
+        """
+        Test listing documents with a successful outcome.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload a document
+        created_res = ragflow.create_dataset("test_list_documents_with_success")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_documents_with_checking_size(self):
+        """
+        Test listing documents and verify the size and names of the documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 10 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_checking_size")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_documents_with_getting_empty_result(self):
+        """
+        Test listing documents that should be empty.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 0 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_getting_empty_result")
+        created_res_id = created_res['data']['dataset_id']
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 0
+
+    def test_list_documents_with_creating_100_documents(self):
+        """
+        Test listing 100 documents and verify the size of these documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 100 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_creating_100_documents")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 100
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 100
+
+    def test_list_document_with_failure(self):
+        """
+        Test listing documents with IndexError.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_failure")
+        created_res_id = created_res['data']['dataset_id']
+        response = ragflow.list_files(created_res_id, offset=-1, count=-1)
+        assert "IndexError" in response['message'] and response['code'] == RetCode.EXCEPTION_ERROR
+
+    def test_list_document_with_verifying_offset_and_count(self):
+        """
+        Test listing documents with verifying the functionalities of offset and count.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_offset_and_count")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, offset=2, count=10)
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_document_with_verifying_keywords(self):
+        """
+        Test listing documents with verifying the functionality of searching keywords.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_keywords")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, keywords="empty")
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_document_with_verifying_order_by_and_descend(self):
+        """
+        Test listing documents with verifying the functionality of order_by and descend.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_descend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 2
+        docs = response['data']['docs']
+        # reverse
+        i = 1
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i -= 1
+
+    def test_list_document_with_verifying_order_by_and_ascend(self):
+        """
+        Test listing documents with verifying the functionality of order_by and ascend.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_ascend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, descend=False)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 3
+
+        docs = response['data']['docs']
+
+        i = 0
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i += 1
+
+    # TODO: have to set the limitation of the number of documents
     # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
-    # ----------------------------list files-----------------------------------------------------
-
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
@@ -257,8 +388,6 @@ class TestFile(TestSdk):
 
     # ----------------------------insert a new chunk-----------------------------------------------------
 
-    # ----------------------------upload a file-----------------------------------------------------
-
     # ----------------------------get a specific chunk-----------------------------------------------------
 
     # ----------------------------retrieval test-----------------------------------------------------