cecilia-uu committed on
Commit
12defec
·
1 Parent(s): 67bae62

API: completed delete_doc api (#1290)

Browse files

### What problem does this PR solve?

Adds an API endpoint, an SDK method, and tests for deleting a document from a dataset.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/documents_api.py CHANGED
@@ -24,6 +24,7 @@ from flask_login import login_required, current_user
24
  from api.db import FileType, ParserType
25
  from api.db.services import duplicate_name
26
  from api.db.services.document_service import DocumentService
 
27
  from api.db.services.file_service import FileService
28
  from api.db.services.knowledgebase_service import KnowledgebaseService
29
  from api.settings import RetCode
@@ -31,6 +32,8 @@ from api.utils import get_uuid
31
  from api.utils.api_utils import construct_json_result
32
  from api.utils.file_utils import filename_type, thumbnail
33
  from rag.utils.minio_conn import MINIO
 
 
34
 
35
 
36
  MAXIMUM_OF_UPLOADING_FILES = 256
@@ -89,6 +92,7 @@ def upload(dataset_id):
89
  # grab all the errs
90
  err = []
91
  MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
 
92
  for file in file_objs:
93
  try:
94
  # TODO: get this value from the database as some tenants have this limit while others don't
@@ -132,6 +136,7 @@ def upload(dataset_id):
132
  DocumentService.insert(doc)
133
 
134
  FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
 
135
  except Exception as e:
136
  err.append(file.filename + ": " + str(e))
137
 
@@ -139,14 +144,65 @@ def upload(dataset_id):
139
  # return all the errors
140
  return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
141
  # success
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  return construct_json_result(data=True, code=RetCode.SUCCESS)
143
 
144
  # ----------------------------upload online files------------------------------------------------
145
 
146
  # ----------------------------download a file-----------------------------------------------------
147
 
148
- # ----------------------------delete a file-----------------------------------------------------
149
-
150
  # ----------------------------enable rename-----------------------------------------------------
151
 
152
  # ----------------------------list files-----------------------------------------------------
 
24
  from api.db import FileType, ParserType
25
  from api.db.services import duplicate_name
26
  from api.db.services.document_service import DocumentService
27
+ from api.db.services.file2document_service import File2DocumentService
28
  from api.db.services.file_service import FileService
29
  from api.db.services.knowledgebase_service import KnowledgebaseService
30
  from api.settings import RetCode
 
32
  from api.utils.api_utils import construct_json_result
33
  from api.utils.file_utils import filename_type, thumbnail
34
  from rag.utils.minio_conn import MINIO
35
+ from api.db.db_models import Task, File
36
+ from api.db import FileType, TaskStatus, ParserType, FileSource
37
 
38
 
39
  MAXIMUM_OF_UPLOADING_FILES = 256
 
92
  # grab all the errs
93
  err = []
94
  MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
95
+ uploaded_docs_json = []
96
  for file in file_objs:
97
  try:
98
  # TODO: get this value from the database as some tenants have this limit while others don't
 
136
  DocumentService.insert(doc)
137
 
138
  FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
139
+ uploaded_docs_json.append(doc)
140
  except Exception as e:
141
  err.append(file.filename + ": " + str(e))
142
 
 
144
  # return all the errors
145
  return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
146
  # success
147
+ return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
148
+
149
+ # ----------------------------delete a file-----------------------------------------------------
150
@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
@login_required
def delete(document_id, dataset_id):  # string
    """Delete one document from a dataset.

    The document row, its File / File2Document records and its stored
    object are removed, in that order.

    Args:
        document_id: id of the document to delete (URL path segment).
        dataset_id: id of the dataset the caller claims the document is in
            (URL path segment).

    Returns:
        The value of ``construct_json_result``: ``data=True`` with
        ``RetCode.SUCCESS`` on success, otherwise a message with
        ``DATA_ERROR`` (unknown document), ``AUTHENTICATION_ERROR`` (no
        tenant resolved), ``ARGUMENT_ERROR`` (document is in a different
        dataset), ``OPERATING_ERROR`` (removal reported failure) or
        ``SERVER_ERROR`` (any exception).
    """
    # Initialise the knowledge-base folder under the user's root folder;
    # the original comment says this is to "consider the new user".
    root_folder = FileService.get_root_folder(current_user.id)
    FileService.init_knowledgebase_docs(root_folder["id"], current_user.id)

    try:
        # whether there is this document
        exist, doc = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)

        # NOTE(review): this only checks that a tenant id resolves for the
        # document; it is never compared with current_user here. Confirm
        # that ownership is enforced elsewhere.
        tenant_id = DocumentService.get_tenant_id(document_id)
        if not tenant_id:
            return construct_json_result(message=f"You cannot delete this document {document_id} due to the authorization"
                                                 f" reason!", code=RetCode.AUTHENTICATION_ERROR)

        # the dataset that really holds the document, and its storage location
        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
        if real_dataset_id != dataset_id:
            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)

        if not DocumentService.remove_document(doc, tenant_id):
            return construct_json_result(
                message="There was an error during the document removal process. Please check the status of the "
                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)

        # Delete the File record first, then the File2Document association
        # that points at it. Guard against an empty lookup so a missing
        # association does not abort the remaining cleanup with IndexError.
        file_to_doc = File2DocumentService.get_by_document_id(document_id)
        if file_to_doc:
            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE,
                                       File.id == file_to_doc[0].file_id])
        File2DocumentService.delete_by_document_id(document_id)

        # delete it from minio
        MINIO.rm(dataset_id, location)
    except Exception as e:
        # Always report a failure: an exception whose str() is empty must not
        # be mistaken for success, so fall back to repr() for the message.
        return construct_json_result(data=False, message=str(e) or repr(e), code=RetCode.SERVER_ERROR)

    return construct_json_result(data=True, code=RetCode.SUCCESS)
201
 
202
  # ----------------------------upload online files------------------------------------------------
203
 
204
  # ----------------------------download a file-----------------------------------------------------
205
 
 
 
206
  # ----------------------------enable rename-----------------------------------------------------
207
 
208
  # ----------------------------list files-----------------------------------------------------
sdk/python/ragflow/ragflow.py CHANGED
@@ -101,10 +101,13 @@ class RAGFlow:
101
  result_dict = json.loads(res.text)
102
  return result_dict
103
 
104
- # ----------------------------upload remote files-----------------------------------------------------
105
- # ----------------------------download a file-----------------------------------------------------
106
-
107
  # ----------------------------delete a file-----------------------------------------------------
 
 
 
 
 
 
108
 
109
  # ----------------------------enable rename-----------------------------------------------------
110
 
 
101
  result_dict = json.loads(res.text)
102
  return result_dict
103
 
 
 
 
104
  # ----------------------------delete a file-----------------------------------------------------
105
def delete_files(self, document_id, dataset_id):
    """Send a DELETE request for one document of a dataset.

    The URL is ``<document_url>/<dataset_id>/<document_id>`` and the
    request carries ``self.authorization_header``. Returns the response
    body decoded as JSON.
    """
    response = requests.delete(
        f"{self.document_url}/{dataset_id}/{document_id}",
        headers=self.authorization_header,
    )
    return response.json()
109
+
110
+ # ----------------------------download a file-----------------------------------------------------
111
 
112
  # ----------------------------enable rename-----------------------------------------------------
113
 
sdk/python/test/test_document.py CHANGED
@@ -149,11 +149,95 @@ class TestFile(TestSdk):
149
  res = ragflow.upload_local_file(dataset_id, file_paths)
150
  assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'
151
 
152
- # ----------------------------upload remote files-----------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
- # ----------------------------download a file-----------------------------------------------------
 
 
 
 
 
 
 
 
155
 
156
- # ----------------------------delete a file-----------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  # ----------------------------enable rename-----------------------------------------------------
159
 
 
149
  res = ragflow.upload_local_file(dataset_id, file_paths)
150
  assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'
151
 
152
+ # ----------------------------delete a file-----------------------------------------------------
153
def test_delete_one_file(self):
    """
    Upload a single file, delete it, and expect a successful response.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    dataset_id = client.create_dataset("test_delete_one_file")['data']['dataset_id']
    upload_res = client.upload_local_file(dataset_id, ["test_data/test.txt"])
    # the only uploaded document carries the id to delete
    doc_id = upload_res['data'][0]['id']
    deleted_res = client.delete_files(doc_id, dataset_id)
    assert deleted_res['code'] == RetCode.SUCCESS
    assert deleted_res['data'] is True
169
 
170
def test_delete_document_with_not_existing_document(self):
    """
    Deleting an unknown document id must fail with DATA_ERROR.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    dataset = client.create_dataset("test_delete_document_with_not_existing_document")
    dataset_id = dataset['data']['dataset_id']
    # "111" is an id that was never uploaded
    res = client.delete_files("111", dataset_id)
    assert res['code'] == RetCode.DATA_ERROR
    assert res['message'] == 'Document 111 not found!'
179
 
180
def test_delete_document_with_creating_100_documents_and_deleting_100_documents(self):
    """
    Test deleting documents when uploading 100 docs and deleting 100 docs.
    """
    ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
    # Use this test's own dataset name, as the sibling tests do, instead of
    # the copy-pasted "test_delete_one_file".
    created_res = ragflow.create_dataset(
        "test_delete_document_with_creating_100_documents_and_deleting_100_documents")
    dataset_id = created_res['data']['dataset_id']
    # upload 100 docs
    file_paths = ["test_data/test.txt"] * 100
    res = ragflow.upload_local_file(dataset_id, file_paths)

    # delete every uploaded document and check each response
    for d in res['data']:
        deleted_res = ragflow.delete_files(d['id'], dataset_id)
        assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True
199
+
200
def test_delete_document_from_nonexistent_dataset(self):
    """
    Test deleting a document while naming a dataset id that does not exist.
    """
    ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
    # Use this test's own dataset name, as the sibling tests do, instead of
    # the copy-pasted "test_delete_one_file".
    created_res = ragflow.create_dataset("test_delete_document_from_nonexistent_dataset")
    dataset_id = created_res['data']['dataset_id']
    file_paths = ["test_data/test.txt"]
    res = ragflow.upload_local_file(dataset_id, file_paths)
    # get the doc_id
    doc_id = res['data'][0]['id']
    # "000" is not the dataset the document was uploaded to
    deleted_res = ragflow.delete_files(doc_id, "000")
    # the server reports the dataset that really holds the document
    assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
            f'The document {doc_id} is not in the dataset: 000, but in the dataset: {dataset_id}.')
217
+
218
def test_delete_document_which_is_located_in_other_dataset(self):
    """
    Deleting a document through a dataset that does not hold it must fail.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    # dataset that actually receives the document
    owner = client.create_dataset("test_delete_document_which_is_located_in_other_dataset")
    created_res_id = owner['data']['dataset_id']
    upload_res = client.upload_local_file(created_res_id, ["test_data/test.txt"])
    # a second, unrelated dataset
    other_dataset_id = client.create_dataset("other_dataset")['data']['dataset_id']
    doc_id = upload_res['data'][0]['id']
    # try to delete the document via the unrelated dataset
    deleted_res = client.delete_files(doc_id, other_dataset_id)
    expected = (f'The document {doc_id} is not in the dataset: {other_dataset_id}, '
                f'but in the dataset: {created_res_id}.')
    assert deleted_res['code'] == RetCode.ARGUMENT_ERROR
    assert deleted_res['message'] == expected
239
+
240
+ # ----------------------------download a file-----------------------------------------------------
241
 
242
  # ----------------------------enable rename-----------------------------------------------------
243