cecilia-uu committed
Commit 061aa4e · 1 Parent(s): 2653e84

API: created list_doc (#1327)


### What problem does this PR solve?

Adds the API for listing the documents in a dataset.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
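
A minimal sketch of the new API in use, assuming a running RAGFlow server, a valid API key, and that the SDK below is importable as `ragflow.ragflow` (the key and host values are placeholders):

```python
from ragflow.ragflow import RAGFlow

API_KEY = "your-api-key"                # placeholder credential
HOST_ADDRESS = "http://127.0.0.1:9380"  # placeholder server address

client = RAGFlow(API_KEY, HOST_ADDRESS)
dataset_id = client.create_dataset("demo")["data"]["dataset_id"]

# New in this PR: list the documents of a dataset.
response = client.list_files(dataset_id, offset=0, count=-1,
                             order_by="create_time", descend=True, keywords="")
print(response["data"]["total"], response["data"]["docs"])
```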

api/apps/dataset_api.py CHANGED
@@ -13,13 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import re
+import warnings
 
 from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
 from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileSource, StatusEnum
+from api.db import FileType, ParserType, FileSource
+from api.db import StatusEnum
 from api.db.db_models import File
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
@@ -29,8 +33,12 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
+from api.utils.api_utils import construct_json_result, construct_error_response
+from api.utils.api_utils import construct_result, validate_request
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
 
+MAXIMUM_OF_UPLOADING_FILES = 256
 
 # ------------------------------ create a dataset ---------------------------------------
 
@@ -253,3 +261,216 @@ def update_dataset(dataset_id):
         return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
     except Exception as e:
         return construct_error_response(e)
+
+# --------------------------------content management ----------------------------------------------
+
+# ----------------------------upload files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['POST'])
+@login_required
+def upload_documents(dataset_id):
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
+                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        # the content of the file
+        file_content = file_obj.read()
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")
+
+        # the content is empty, raising a warning
+        if file_content == b'':
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create '.knowledgebase' file
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # grab all the errs
+    err = []
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    uploaded_docs_json = []
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceed the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to the minio
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+            uploaded_docs_json.append(doc)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
+
+
+# ----------------------------delete a file-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/<document_id>', methods=['DELETE'])
+@login_required
+def delete_document(document_id, dataset_id):  # string
+    # get the root folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # parent file's id
+    parent_file_id = root_folder["id"]
+    # consider the new user
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # store all the errors that may occur
+    errors = ""
+    try:
+        # check whether this document exists
+        exist, doc = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
+        # check whether this tenant is authorized for this doc
+        tenant_id = DocumentService.get_tenant_id(document_id)
+        if not tenant_id:
+            return construct_json_result(
+                message=f"You cannot delete this document {document_id} due to the authorization"
+                        f" reason!", code=RetCode.AUTHENTICATION_ERROR)
+
+        # get the doc's id and location
+        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
+
+        if real_dataset_id != dataset_id:
+            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
+                                         f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
+
+        # there is an issue when removing
+        if not DocumentService.remove_document(doc, tenant_id):
+            return construct_json_result(
+                message="There was an error during the document removal process. Please check the status of the "
+                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
+
+        # fetch the File2Document record associated with the provided document ID.
+        file_to_doc = File2DocumentService.get_by_document_id(document_id)
+        # delete the associated File record.
+        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
+        # delete the File2Document record itself using the document ID. This removes the
+        # association between the document and the file after the File record has been deleted.
+        File2DocumentService.delete_by_document_id(document_id)
+
+        # delete it from minio
+        MINIO.rm(dataset_id, location)
+    except Exception as e:
+        errors += str(e)
+    if errors:
+        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
+
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+
+# ----------------------------list files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['GET'])
+@login_required
+def list_documents(dataset_id):
+    if not dataset_id:
+        return construct_json_result(
+            data=False, message='Lack of "dataset_id"', code=RetCode.ARGUMENT_ERROR)
+
+    # searching keywords
+    keywords = request.args.get("keywords", "")
+
+    offset = request.args.get("offset", 0)
+    count = request.args.get("count", -1)
+    order_by = request.args.get("order_by", "create_time")
+    descend = request.args.get("descend", True)
+    try:
+        docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by,
+                                                                descend, keywords)
+
+        return construct_json_result(data={"total": total, "docs": docs}, code=RetCode.SUCCESS)
+    except Exception as e:
+        return construct_error_response(e)
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
+
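
For reference, the endpoints added above can also be exercised directly over HTTP. A sketch with plain `requests`, assuming the blueprint is mounted under `/api/v1/dataset` (as the SDK changes below suggest); `base_url`, the key, and the IDs are placeholders, not part of the diff:

```python
import requests

base_url = "http://127.0.0.1:9380/api/v1"    # placeholder deployment URL
headers = {"Authorization": "your-api-key"}  # same header the SDK sends
dataset_id = "your-dataset-id"               # placeholder

# POST /dataset/<dataset_id>/documents/ uploads files (multipart field name: 'file').
with open("test.txt", "rb") as f:
    upload = requests.post(f"{base_url}/dataset/{dataset_id}/documents/",
                           files=[("file", f)], headers=headers)

# GET /dataset/<dataset_id>/documents/ lists documents; every query parameter is optional.
listing = requests.get(f"{base_url}/dataset/{dataset_id}/documents/",
                       params={"offset": 0, "count": -1, "keywords": ""},
                       headers=headers)
document_id = listing.json()["data"]["docs"][0]["id"]

# DELETE /dataset/<dataset_id>/documents/<document_id> removes one document.
deleted = requests.delete(f"{base_url}/dataset/{dataset_id}/documents/{document_id}",
                          headers=headers)
```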
api/apps/documents_api.py DELETED
@@ -1,228 +0,0 @@
-#
-# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-#
-
-import os
-import re
-import warnings
-
-from flask import request
-from flask_login import login_required, current_user
-
-from api.db import FileType, ParserType
-from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.settings import RetCode
-from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-from api.db.db_models import Task, File
-from api.db import FileType, TaskStatus, ParserType, FileSource
-
-
-MAXIMUM_OF_UPLOADING_FILES = 256
-
-
-# ----------------------------upload local files-----------------------------------------------------
-@manager.route('/<dataset_id>', methods=['POST'])
-@login_required
-def upload(dataset_id):
-    # no files
-    if not request.files:
-        return construct_json_result(
-            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
-
-    # the number of uploading files exceeds the limit
-    file_objs = request.files.getlist('file')
-    num_file_objs = len(file_objs)
-
-    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
-        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
-                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
-
-    for file_obj in file_objs:
-        # the content of the file
-        file_content = file_obj.read()
-        file_name = file_obj.filename
-        # no name
-        if not file_name:
-            return construct_json_result(
-                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
-
-        # TODO: support the remote files
-        if 'http' in file_name:
-            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
-
-        # the content is empty, raising a warning
-        if file_content == b'':
-            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
-
-    # no dataset
-    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
-    if not exist:
-        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
-
-    # get the root_folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # get the id of the root_folder
-    parent_file_id = root_folder["id"]  # document id
-    # this is for the new user, create '.knowledgebase' file
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # go inside this folder, get the kb_root_folder
-    kb_root_folder = FileService.get_kb_folder(current_user.id)
-    # link the file management to the kb_folder
-    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
-
-    # grab all the errs
-    err = []
-    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
-    uploaded_docs_json = []
-    for file in file_objs:
-        try:
-            # TODO: get this value from the database as some tenants have this limit while others don't
-            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="Exceed the maximum file number of a free user!")
-            # deal with the duplicate name
-            filename = duplicate_name(
-                DocumentService.query,
-                name=file.filename,
-                kb_id=dataset.id)
-
-            # deal with the unsupported type
-            filetype = filename_type(filename)
-            if filetype == FileType.OTHER.value:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="This type of file has not been supported yet!")
-
-            # upload to the minio
-            location = filename
-            while MINIO.obj_exist(dataset_id, location):
-                location += "_"
-            blob = file.read()
-            MINIO.put(dataset_id, location, blob)
-            doc = {
-                "id": get_uuid(),
-                "kb_id": dataset.id,
-                "parser_id": dataset.parser_id,
-                "parser_config": dataset.parser_config,
-                "created_by": current_user.id,
-                "type": filetype,
-                "name": filename,
-                "location": location,
-                "size": len(blob),
-                "thumbnail": thumbnail(filename, blob)
-            }
-            if doc["type"] == FileType.VISUAL:
-                doc["parser_id"] = ParserType.PICTURE.value
-            if re.search(r"\.(ppt|pptx|pages)$", filename):
-                doc["parser_id"] = ParserType.PRESENTATION.value
-            DocumentService.insert(doc)
-
-            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
-            uploaded_docs_json.append(doc)
-        except Exception as e:
-            err.append(file.filename + ": " + str(e))
-
-    if err:
-        # return all the errors
-        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
-    # success
-    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
-
-# ----------------------------delete a file-----------------------------------------------------
-@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
-@login_required
-def delete(document_id, dataset_id):  # string
-    # get the root folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # parent file's id
-    parent_file_id = root_folder["id"]
-    # consider the new user
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # store all the errors that may have
-    errors = ""
-    try:
-        # whether there is this document
-        exist, doc = DocumentService.get_by_id(document_id)
-        if not exist:
-            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
-        # whether this doc is authorized by this tenant
-        tenant_id = DocumentService.get_tenant_id(document_id)
-        if not tenant_id:
-            return construct_json_result(message=f"You cannot delete this document {document_id} due to the authorization"
-                                         f" reason!", code=RetCode.AUTHENTICATION_ERROR)
-
-        # get the doc's id and location
-        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
-
-        if real_dataset_id != dataset_id:
-            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
-                                         f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
-
-        # there is an issue when removing
-        if not DocumentService.remove_document(doc, tenant_id):
-            return construct_json_result(
-                message="There was an error during the document removal process. Please check the status of the "
-                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
-
-        # fetch the File2Document record associated with the provided document ID.
-        file_to_doc = File2DocumentService.get_by_document_id(document_id)
-        # delete the associated File record.
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
-        # delete the File2Document record itself using the document ID. This removes the
-        # association between the document and the file after the File record has been deleted.
-        File2DocumentService.delete_by_document_id(document_id)
-
-        # delete it from minio
-        MINIO.rm(dataset_id, location)
-    except Exception as e:
-        errors += str(e)
-    if errors:
-        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
-
-    return construct_json_result(data=True, code=RetCode.SUCCESS)
-
-# ----------------------------upload online files------------------------------------------------
-
-# ----------------------------download a file-----------------------------------------------------
-
-# ----------------------------enable rename-----------------------------------------------------
-
-# ----------------------------list files-----------------------------------------------------
-
-# ----------------------------start parsing-----------------------------------------------------
-
-# ----------------------------stop parsing-----------------------------------------------------
-
-# ----------------------------show the status of the file-----------------------------------------------------
-
-# ----------------------------list the chunks of the file-----------------------------------------------------
-
-# ----------------------------delete the chunk-----------------------------------------------------
-
-# ----------------------------edit the status of the chunk-----------------------------------------------------
-
-# ----------------------------insert a new chunk-----------------------------------------------------
-
-# ----------------------------upload a file-----------------------------------------------------
-
-# ----------------------------get a specific chunk-----------------------------------------------------
-
-# ----------------------------retrieval test-----------------------------------------------------
api/db/services/document_service.py CHANGED
@@ -59,6 +59,35 @@ class DocumentService(CommonService):
 
         return list(docs.dicts()), count
 
+    @classmethod
+    @DB.connection_context()
+    def list_documents_in_dataset(cls, dataset_id, offset, count, order_by, descend, keywords):
+        if keywords:
+            docs = cls.model.select().where(
+                (cls.model.kb_id == dataset_id),
+                (fn.LOWER(cls.model.name).contains(keywords.lower()))
+            )
+        else:
+            docs = cls.model.select().where(cls.model.kb_id == dataset_id)
+
+        total = docs.count()
+
+        if descend == 'True':
+            docs = docs.order_by(cls.model.getter_by(order_by).desc())
+        if descend == 'False':
+            docs = docs.order_by(cls.model.getter_by(order_by).asc())
+
+        docs = list(docs.dicts())
+        docs_length = len(docs)
+
+        if offset < 0 or offset > docs_length:
+            raise IndexError("Offset is out of the valid range.")
+
+        if count == -1:
+            return docs[offset:], total
+
+        return docs[offset:offset + count], total
+
     @classmethod
     @DB.connection_context()
     def insert(cls, doc):
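
The pagination contract of `list_documents_in_dataset` mirrors the `KnowledgebaseService` change below: `offset` must lie within the result list, and `count == -1` means "everything from the offset onward". A pure-Python sketch of that slicing rule, using a stand-in list instead of a real query:

```python
def paginate(docs, offset, count):
    # Same rule as list_documents_in_dataset: validate the offset first,
    # then treat count == -1 as "no upper bound".
    if offset < 0 or offset > len(docs):
        raise IndexError("Offset is out of the valid range.")
    if count == -1:
        return docs[offset:]
    return docs[offset:offset + count]

docs = list(range(10))
assert paginate(docs, 2, 3) == [2, 3, 4]            # a normal page
assert paginate(docs, 2, -1) == list(range(2, 10))  # the rest of the list
```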
api/db/services/knowledgebase_service.py CHANGED
@@ -60,6 +60,9 @@ class KnowledgebaseService(CommonService):
         if offset < 0 or offset > kbs_length:
             raise IndexError("Offset is out of the valid range.")
 
+        if count == -1:
+            return kbs[offset:]
+
         return kbs[offset:offset+count]
 
     @classmethod
docs/references/ragflow_api.md CHANGED
@@ -274,4 +274,6 @@ You are required to input at least one parameter.
     "code": 102,
     "message": "Please input at least one parameter that you want to update!"
 }
-```
+```
+
+
sdk/python/ragflow/ragflow.py CHANGED
@@ -26,12 +26,11 @@ class RAGFlow:
         '''
        api_url: http://<host_address>/api/v1
        dataset_url: http://<host_address>/api/v1/dataset
-       document_url: http://<host_address>/api/v1/documents
+       document_url: http://<host_address>/api/v1/dataset/{dataset_id}/documents
         '''
         self.user_key = user_key
         self.api_url = f"{base_url}/api/{version}"
         self.dataset_url = f"{self.api_url}/dataset"
-        self.document_url = f"{self.api_url}/documents"
         self.authorization_header = {"Authorization": "{}".format(self.user_key)}
 
     def create_dataset(self, dataset_name):
@@ -79,7 +78,7 @@
         response = requests.put(endpoint, json=params, headers=self.authorization_header)
         return response.json()
 
-# -------------------- content management -----------------------------------------------------
+    # -------------------- content management -----------------------------------------------------
 
     # ----------------------------upload local files-----------------------------------------------------
     def upload_local_file(self, dataset_id, file_paths):
@@ -95,7 +94,7 @@
             else:
                 return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
 
-        res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+        res = requests.request('POST', url=f"{self.dataset_url}/{dataset_id}/documents", files=files,
                                headers=self.authorization_header)
 
         result_dict = json.loads(res.text)
@@ -103,16 +102,27 @@
 
     # ----------------------------delete a file-----------------------------------------------------
     def delete_files(self, document_id, dataset_id):
-        endpoint = f"{self.document_url}/{dataset_id}/{document_id}"
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
         res = requests.delete(endpoint, headers=self.authorization_header)
         return res.json()
 
+    # ----------------------------list files-----------------------------------------------------
+    def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", descend=True, keywords=""):
+        params = {
+            "offset": offset,
+            "count": count,
+            "order_by": order_by,
+            "descend": descend,
+            "keywords": keywords
+        }
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/"
+        res = requests.get(endpoint, params=params, headers=self.authorization_header)
+        return res.json()
+
     # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
-    # ----------------------------list files-----------------------------------------------------
-
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
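
A hedged sketch of the new `list_files` options (placeholder key and host; note that `descend` travels as a query parameter, so the server receives the strings "True"/"False" that `requests` produces for booleans, which is what the service layer matches on):

```python
from ragflow.ragflow import RAGFlow

API_KEY = "your-api-key"                # placeholder
HOST_ADDRESS = "http://127.0.0.1:9380"  # placeholder

client = RAGFlow(API_KEY, HOST_ADDRESS)
dataset_id = client.create_dataset("sdk-demo")["data"]["dataset_id"]

# Oldest documents first: descend=False is serialized to the string "False",
# which list_documents_in_dataset matches to apply ascending order.
oldest_first = client.list_files(dataset_id, order_by="create_time", descend=False)

# keywords performs a case-insensitive substring match on the document name.
matches = client.list_files(dataset_id, keywords="report")
```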
sdk/python/test/test_document.py CHANGED
@@ -37,7 +37,7 @@ class TestFile(TestSdk):
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt", "test_data/test1.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_one_file(self):
         """
@@ -48,7 +48,7 @@ class TestFile(TestSdk):
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_nonexistent_files(self):
         """
@@ -237,12 +237,143 @@ class TestFile(TestSdk):
         assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                 f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.')
 
+    # ----------------------------list files-----------------------------------------------------
+    def test_list_documents_with_success(self):
+        """
+        Test listing documents with a successful outcome.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload a document
+        created_res = ragflow.create_dataset("test_list_documents_with_success")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_documents_with_checking_size(self):
+        """
+        Test listing documents and verify the size and names of the documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 10 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_checking_size")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_documents_with_getting_empty_result(self):
+        """
+        Test listing documents that should be empty.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 0 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_getting_empty_result")
+        created_res_id = created_res['data']['dataset_id']
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 0
+
+    def test_list_documents_with_creating_100_documents(self):
+        """
+        Test listing 100 documents and verify the size of these documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 100 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_creating_100_documents")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 100
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 100
+
+    def test_list_document_with_failure(self):
+        """
+        Test listing documents with IndexError.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_failure")
+        created_res_id = created_res['data']['dataset_id']
+        response = ragflow.list_files(created_res_id, offset=-1, count=-1)
+        assert "IndexError" in response['message'] and response['code'] == RetCode.EXCEPTION_ERROR
+
+    def test_list_document_with_verifying_offset_and_count(self):
+        """
+        Test listing documents with verifying the functionalities of offset and count.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_offset_and_count")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, offset=2, count=10)
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_document_with_verifying_keywords(self):
+        """
+        Test listing documents with verifying the functionality of searching keywords.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_keywords")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, keywords="empty")
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_document_with_verifying_order_by_and_descend(self):
+        """
+        Test listing documents with verifying the functionality of order_by and descend.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_descend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 2
+        docs = response['data']['docs']
+        # reverse
+        i = 1
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i -= 1
+
+    def test_list_document_with_verifying_order_by_and_ascend(self):
+        """
+        Test listing documents with verifying the functionality of order_by and ascend.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_ascend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # Call the list_document method
+        response = ragflow.list_files(created_res_id, descend=False)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 3
+
+        docs = response['data']['docs']
+
+        i = 0
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i += 1
+
+    # TODO: have to set the limitation of the number of documents
     # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
-    # ----------------------------list files-----------------------------------------------------
-
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
@@ -257,8 +388,6 @@ class TestFile(TestSdk):
 
     # ----------------------------insert a new chunk-----------------------------------------------------
 
-    # ----------------------------upload a file-----------------------------------------------------
-
     # ----------------------------get a specific chunk-----------------------------------------------------
 
     # ----------------------------retrieval test-----------------------------------------------------