Storage: Support the s3, azure blob as the object storage of ragflow. (#2278)
Browse files### What problem does this PR solve?
issue: https://github.com/infiniflow/ragflow/issues/2277
_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
Co-authored-by: Kevin Hu <[email protected]>
- api/apps/api_app.py +5 -5
 - api/apps/dataset_api.py +6 -6
 - api/apps/document_app.py +6 -6
 - api/apps/file_app.py +4 -4
 - api/apps/system_app.py +2 -2
 - api/db/services/document_service.py +2 -2
 - api/db/services/file_service.py +3 -3
 - api/db/services/task_service.py +3 -3
 - conf/service_conf.yaml +16 -0
 - rag/settings.py +4 -0
 - rag/svr/cache_file_svr.py +2 -2
 - rag/svr/task_executor.py +3 -3
 - rag/utils/azure_sas_conn.py +80 -0
 - rag/utils/azure_spn_conn.py +90 -0
 - rag/utils/s3_conn.py +135 -0
 - rag/utils/storage_factory.py +30 -0
 - requirements.txt +4 -2
 
    	
        api/apps/api_app.py
    CHANGED
    
    | 
         @@ -39,7 +39,7 @@ from itsdangerous import URLSafeTimedSerializer 
     | 
|
| 39 | 
         | 
| 40 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail
         
     | 
| 41 | 
         
             
            from rag.nlp import keyword_extraction
         
     | 
| 42 | 
         
            -
            from rag.utils. 
     | 
| 43 | 
         | 
| 44 | 
         
             
            from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
         
     | 
| 45 | 
         
             
            from agent.canvas import Canvas
         
     | 
| 
         @@ -427,10 +427,10 @@ def upload(): 
     | 
|
| 427 | 
         
             
                            retmsg="This type of file has not been supported yet!")
         
     | 
| 428 | 
         | 
| 429 | 
         
             
                    location = filename
         
     | 
| 430 | 
         
            -
                    while  
     | 
| 431 | 
         
             
                        location += "_"
         
     | 
| 432 | 
         
             
                    blob = request.files['file'].read()
         
     | 
| 433 | 
         
            -
                     
     | 
| 434 | 
         
             
                    doc = {
         
     | 
| 435 | 
         
             
                        "id": get_uuid(),
         
     | 
| 436 | 
         
             
                        "kb_id": kb.id,
         
     | 
| 
         @@ -650,7 +650,7 @@ def document_rm(): 
     | 
|
| 650 | 
         
             
                        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
         
     | 
| 651 | 
         
             
                        File2DocumentService.delete_by_document_id(doc_id)
         
     | 
| 652 | 
         | 
| 653 | 
         
            -
                         
     | 
| 654 | 
         
             
                    except Exception as e:
         
     | 
| 655 | 
         
             
                        errors += str(e)
         
     | 
| 656 | 
         | 
| 
         @@ -723,7 +723,7 @@ def completion_faq(): 
     | 
|
| 723 | 
         
             
                        if ans["reference"]["chunks"][chunk_idx]["img_id"]:
         
     | 
| 724 | 
         
             
                            try:
         
     | 
| 725 | 
         
             
                                bkt, nm = ans["reference"]["chunks"][chunk_idx]["img_id"].split("-")
         
     | 
| 726 | 
         
            -
                                response =  
     | 
| 727 | 
         
             
                                data_type_picture["url"] = base64.b64encode(response).decode('utf-8')
         
     | 
| 728 | 
         
             
                                data.append(data_type_picture)
         
     | 
| 729 | 
         
             
                                break
         
     | 
| 
         | 
|
| 39 | 
         | 
| 40 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail
         
     | 
| 41 | 
         
             
            from rag.nlp import keyword_extraction
         
     | 
| 42 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 43 | 
         | 
| 44 | 
         
             
            from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
         
     | 
| 45 | 
         
             
            from agent.canvas import Canvas
         
     | 
| 
         | 
|
| 427 | 
         
             
                            retmsg="This type of file has not been supported yet!")
         
     | 
| 428 | 
         | 
| 429 | 
         
             
                    location = filename
         
     | 
| 430 | 
         
            +
                    while STORAGE_IMPL.obj_exist(kb_id, location):
         
     | 
| 431 | 
         
             
                        location += "_"
         
     | 
| 432 | 
         
             
                    blob = request.files['file'].read()
         
     | 
| 433 | 
         
            +
                    STORAGE_IMPL.put(kb_id, location, blob)
         
     | 
| 434 | 
         
             
                    doc = {
         
     | 
| 435 | 
         
             
                        "id": get_uuid(),
         
     | 
| 436 | 
         
             
                        "kb_id": kb.id,
         
     | 
| 
         | 
|
| 650 | 
         
             
                        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
         
     | 
| 651 | 
         
             
                        File2DocumentService.delete_by_document_id(doc_id)
         
     | 
| 652 | 
         | 
| 653 | 
         
            +
                        STORAGE_IMPL.rm(b, n)
         
     | 
| 654 | 
         
             
                    except Exception as e:
         
     | 
| 655 | 
         
             
                        errors += str(e)
         
     | 
| 656 | 
         | 
| 
         | 
|
| 723 | 
         
             
                        if ans["reference"]["chunks"][chunk_idx]["img_id"]:
         
     | 
| 724 | 
         
             
                            try:
         
     | 
| 725 | 
         
             
                                bkt, nm = ans["reference"]["chunks"][chunk_idx]["img_id"].split("-")
         
     | 
| 726 | 
         
            +
                                response = STORAGE_IMPL.get(bkt, nm)
         
     | 
| 727 | 
         
             
                                data_type_picture["url"] = base64.b64encode(response).decode('utf-8')
         
     | 
| 728 | 
         
             
                                data.append(data_type_picture)
         
     | 
| 729 | 
         
             
                                break
         
     | 
    	
        api/apps/dataset_api.py
    CHANGED
    
    | 
         @@ -42,7 +42,7 @@ from api.utils.file_utils import filename_type, thumbnail 
     | 
|
| 42 | 
         
             
            from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
         
     | 
| 43 | 
         
             
            from rag.nlp import search
         
     | 
| 44 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 45 | 
         
            -
            from rag.utils. 
     | 
| 46 | 
         | 
| 47 | 
         
             
            MAXIMUM_OF_UPLOADING_FILES = 256
         
     | 
| 48 | 
         | 
| 
         @@ -352,7 +352,7 @@ def upload_documents(dataset_id): 
     | 
|
| 352 | 
         | 
| 353 | 
         
             
                        # upload to the minio
         
     | 
| 354 | 
         
             
                        location = filename
         
     | 
| 355 | 
         
            -
                        while  
     | 
| 356 | 
         
             
                            location += "_"
         
     | 
| 357 | 
         | 
| 358 | 
         
             
                        blob = file.read()
         
     | 
| 
         @@ -361,7 +361,7 @@ def upload_documents(dataset_id): 
     | 
|
| 361 | 
         
             
                        if blob == b'':
         
     | 
| 362 | 
         
             
                            warnings.warn(f"[WARNING]: The content of the file {filename} is empty.")
         
     | 
| 363 | 
         | 
| 364 | 
         
            -
                         
     | 
| 365 | 
         | 
| 366 | 
         
             
                        doc = {
         
     | 
| 367 | 
         
             
                            "id": get_uuid(),
         
     | 
| 
         @@ -441,7 +441,7 @@ def delete_document(document_id, dataset_id):  # string 
     | 
|
| 441 | 
         
             
                    File2DocumentService.delete_by_document_id(document_id)
         
     | 
| 442 | 
         | 
| 443 | 
         
             
                    # delete it from minio
         
     | 
| 444 | 
         
            -
                     
     | 
| 445 | 
         
             
                except Exception as e:
         
     | 
| 446 | 
         
             
                    errors += str(e)
         
     | 
| 447 | 
         
             
                if errors:
         
     | 
| 
         @@ -596,7 +596,7 @@ def download_document(dataset_id, document_id): 
     | 
|
| 596 | 
         | 
| 597 | 
         
             
                    # The process of downloading
         
     | 
| 598 | 
         
             
                    doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
         
     | 
| 599 | 
         
            -
                    file_stream =  
     | 
| 600 | 
         
             
                    if not file_stream:
         
     | 
| 601 | 
         
             
                        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
         
     | 
| 602 | 
         | 
| 
         @@ -737,7 +737,7 @@ def parsing_document_internal(id): 
     | 
|
| 737 | 
         
             
                    doc_id = doc_attributes["id"]
         
     | 
| 738 | 
         | 
| 739 | 
         
             
                    bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id)
         
     | 
| 740 | 
         
            -
                    binary =  
     | 
| 741 | 
         
             
                    parser_name = doc_attributes["parser_id"]
         
     | 
| 742 | 
         
             
                    if binary:
         
     | 
| 743 | 
         
             
                        res = doc_parse(binary, doc_name, parser_name, tenant_id, doc_id)
         
     | 
| 
         | 
|
| 42 | 
         
             
            from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
         
     | 
| 43 | 
         
             
            from rag.nlp import search
         
     | 
| 44 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 45 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 46 | 
         | 
| 47 | 
         
             
            MAXIMUM_OF_UPLOADING_FILES = 256
         
     | 
| 48 | 
         | 
| 
         | 
|
| 352 | 
         | 
| 353 | 
         
             
                        # upload to the minio
         
     | 
| 354 | 
         
             
                        location = filename
         
     | 
| 355 | 
         
            +
                        while STORAGE_IMPL.obj_exist(dataset_id, location):
         
     | 
| 356 | 
         
             
                            location += "_"
         
     | 
| 357 | 
         | 
| 358 | 
         
             
                        blob = file.read()
         
     | 
| 
         | 
|
| 361 | 
         
             
                        if blob == b'':
         
     | 
| 362 | 
         
             
                            warnings.warn(f"[WARNING]: The content of the file {filename} is empty.")
         
     | 
| 363 | 
         | 
| 364 | 
         
            +
                        STORAGE_IMPL.put(dataset_id, location, blob)
         
     | 
| 365 | 
         | 
| 366 | 
         
             
                        doc = {
         
     | 
| 367 | 
         
             
                            "id": get_uuid(),
         
     | 
| 
         | 
|
| 441 | 
         
             
                    File2DocumentService.delete_by_document_id(document_id)
         
     | 
| 442 | 
         | 
| 443 | 
         
             
                    # delete it from minio
         
     | 
| 444 | 
         
            +
                    STORAGE_IMPL.rm(dataset_id, location)
         
     | 
| 445 | 
         
             
                except Exception as e:
         
     | 
| 446 | 
         
             
                    errors += str(e)
         
     | 
| 447 | 
         
             
                if errors:
         
     | 
| 
         | 
|
| 596 | 
         | 
| 597 | 
         
             
                    # The process of downloading
         
     | 
| 598 | 
         
             
                    doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
         
     | 
| 599 | 
         
            +
                    file_stream = STORAGE_IMPL.get(doc_id, doc_location)
         
     | 
| 600 | 
         
             
                    if not file_stream:
         
     | 
| 601 | 
         
             
                        return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
         
     | 
| 602 | 
         | 
| 
         | 
|
| 737 | 
         
             
                    doc_id = doc_attributes["id"]
         
     | 
| 738 | 
         | 
| 739 | 
         
             
                    bucket, doc_name = File2DocumentService.get_minio_address(doc_id=doc_id)
         
     | 
| 740 | 
         
            +
                    binary = STORAGE_IMPL.get(bucket, doc_name)
         
     | 
| 741 | 
         
             
                    parser_name = doc_attributes["parser_id"]
         
     | 
| 742 | 
         
             
                    if binary:
         
     | 
| 743 | 
         
             
                        res = doc_parse(binary, doc_name, parser_name, tenant_id, doc_id)
         
     | 
    	
        api/apps/document_app.py
    CHANGED
    
    | 
         @@ -48,7 +48,7 @@ from api.db import FileType, TaskStatus, ParserType, FileSource, LLMType 
     | 
|
| 48 | 
         
             
            from api.db.services.document_service import DocumentService, doc_upload_and_parse
         
     | 
| 49 | 
         
             
            from api.settings import RetCode, stat_logger
         
     | 
| 50 | 
         
             
            from api.utils.api_utils import get_json_result
         
     | 
| 51 | 
         
            -
            from rag.utils. 
     | 
| 52 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
         
     | 
| 53 | 
         
             
            from api.utils.web_utils import html2pdf, is_valid_url
         
     | 
| 54 | 
         | 
| 
         @@ -118,9 +118,9 @@ def web_crawl(): 
     | 
|
| 118 | 
         
             
                        raise RuntimeError("This type of file has not been supported yet!")
         
     | 
| 119 | 
         | 
| 120 | 
         
             
                    location = filename
         
     | 
| 121 | 
         
            -
                    while  
     | 
| 122 | 
         
             
                        location += "_"
         
     | 
| 123 | 
         
            -
                     
     | 
| 124 | 
         
             
                    doc = {
         
     | 
| 125 | 
         
             
                        "id": get_uuid(),
         
     | 
| 126 | 
         
             
                        "kb_id": kb.id,
         
     | 
| 
         @@ -307,7 +307,7 @@ def rm(): 
     | 
|
| 307 | 
         
             
                        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
         
     | 
| 308 | 
         
             
                        File2DocumentService.delete_by_document_id(doc_id)
         
     | 
| 309 | 
         | 
| 310 | 
         
            -
                         
     | 
| 311 | 
         
             
                    except Exception as e:
         
     | 
| 312 | 
         
             
                        errors += str(e)
         
     | 
| 313 | 
         | 
| 
         @@ -394,7 +394,7 @@ def get(doc_id): 
     | 
|
| 394 | 
         
             
                        return get_data_error_result(retmsg="Document not found!")
         
     | 
| 395 | 
         | 
| 396 | 
         
             
                    b, n = File2DocumentService.get_minio_address(doc_id=doc_id)
         
     | 
| 397 | 
         
            -
                    response = flask.make_response( 
     | 
| 398 | 
         | 
| 399 | 
         
             
                    ext = re.search(r"\.([^.]+)$", doc.name)
         
     | 
| 400 | 
         
             
                    if ext:
         
     | 
| 
         @@ -458,7 +458,7 @@ def change_parser(): 
     | 
|
| 458 | 
         
             
            def get_image(image_id):
         
     | 
| 459 | 
         
             
                try:
         
     | 
| 460 | 
         
             
                    bkt, nm = image_id.split("-")
         
     | 
| 461 | 
         
            -
                    response = flask.make_response( 
     | 
| 462 | 
         
             
                    response.headers.set('Content-Type', 'image/JPEG')
         
     | 
| 463 | 
         
             
                    return response
         
     | 
| 464 | 
         
             
                except Exception as e:
         
     | 
| 
         | 
|
| 48 | 
         
             
            from api.db.services.document_service import DocumentService, doc_upload_and_parse
         
     | 
| 49 | 
         
             
            from api.settings import RetCode, stat_logger
         
     | 
| 50 | 
         
             
            from api.utils.api_utils import get_json_result
         
     | 
| 51 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 52 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
         
     | 
| 53 | 
         
             
            from api.utils.web_utils import html2pdf, is_valid_url
         
     | 
| 54 | 
         | 
| 
         | 
|
| 118 | 
         
             
                        raise RuntimeError("This type of file has not been supported yet!")
         
     | 
| 119 | 
         | 
| 120 | 
         
             
                    location = filename
         
     | 
| 121 | 
         
            +
                    while STORAGE_IMPL.obj_exist(kb_id, location):
         
     | 
| 122 | 
         
             
                        location += "_"
         
     | 
| 123 | 
         
            +
                    STORAGE_IMPL.put(kb_id, location, blob)
         
     | 
| 124 | 
         
             
                    doc = {
         
     | 
| 125 | 
         
             
                        "id": get_uuid(),
         
     | 
| 126 | 
         
             
                        "kb_id": kb.id,
         
     | 
| 
         | 
|
| 307 | 
         
             
                        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
         
     | 
| 308 | 
         
             
                        File2DocumentService.delete_by_document_id(doc_id)
         
     | 
| 309 | 
         | 
| 310 | 
         
            +
                        STORAGE_IMPL.rm(b, n)
         
     | 
| 311 | 
         
             
                    except Exception as e:
         
     | 
| 312 | 
         
             
                        errors += str(e)
         
     | 
| 313 | 
         | 
| 
         | 
|
| 394 | 
         
             
                        return get_data_error_result(retmsg="Document not found!")
         
     | 
| 395 | 
         | 
| 396 | 
         
             
                    b, n = File2DocumentService.get_minio_address(doc_id=doc_id)
         
     | 
| 397 | 
         
            +
                    response = flask.make_response(STORAGE_IMPL.get(b, n))
         
     | 
| 398 | 
         | 
| 399 | 
         
             
                    ext = re.search(r"\.([^.]+)$", doc.name)
         
     | 
| 400 | 
         
             
                    if ext:
         
     | 
| 
         | 
|
| 458 | 
         
             
            def get_image(image_id):
         
     | 
| 459 | 
         
             
                try:
         
     | 
| 460 | 
         
             
                    bkt, nm = image_id.split("-")
         
     | 
| 461 | 
         
            +
                    response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
         
     | 
| 462 | 
         
             
                    response.headers.set('Content-Type', 'image/JPEG')
         
     | 
| 463 | 
         
             
                    return response
         
     | 
| 464 | 
         
             
                except Exception as e:
         
     | 
    	
        api/apps/file_app.py
    CHANGED
    
    | 
         @@ -34,7 +34,7 @@ from api.utils.api_utils import get_json_result 
     | 
|
| 34 | 
         
             
            from api.utils.file_utils import filename_type
         
     | 
| 35 | 
         
             
            from rag.nlp import search
         
     | 
| 36 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 37 | 
         
            -
            from rag.utils. 
     | 
| 38 | 
         | 
| 39 | 
         | 
| 40 | 
         
             
            @manager.route('/upload', methods=['POST'])
         
     | 
| 
         @@ -98,7 +98,7 @@ def upload(): 
     | 
|
| 98 | 
         
             
                        # file type
         
     | 
| 99 | 
         
             
                        filetype = filename_type(file_obj_names[file_len - 1])
         
     | 
| 100 | 
         
             
                        location = file_obj_names[file_len - 1]
         
     | 
| 101 | 
         
            -
                        while  
     | 
| 102 | 
         
             
                            location += "_"
         
     | 
| 103 | 
         
             
                        blob = file_obj.read()
         
     | 
| 104 | 
         
             
                        filename = duplicate_name(
         
     | 
| 
         @@ -260,7 +260,7 @@ def rm(): 
     | 
|
| 260 | 
         
             
                                e, file = FileService.get_by_id(inner_file_id)
         
     | 
| 261 | 
         
             
                                if not e:
         
     | 
| 262 | 
         
             
                                    return get_data_error_result(retmsg="File not found!")
         
     | 
| 263 | 
         
            -
                                 
     | 
| 264 | 
         
             
                            FileService.delete_folder_by_pf_id(current_user.id, file_id)
         
     | 
| 265 | 
         
             
                        else:
         
     | 
| 266 | 
         
             
                            if not FileService.delete(file):
         
     | 
| 
         @@ -333,7 +333,7 @@ def get(file_id): 
     | 
|
| 333 | 
         
             
                    if not e:
         
     | 
| 334 | 
         
             
                        return get_data_error_result(retmsg="Document not found!")
         
     | 
| 335 | 
         
             
                    b, n = File2DocumentService.get_minio_address(file_id=file_id)
         
     | 
| 336 | 
         
            -
                    response = flask.make_response( 
     | 
| 337 | 
         
             
                    ext = re.search(r"\.([^.]+)$", file.name)
         
     | 
| 338 | 
         
             
                    if ext:
         
     | 
| 339 | 
         
             
                        if file.type == FileType.VISUAL.value:
         
     | 
| 
         | 
|
| 34 | 
         
             
            from api.utils.file_utils import filename_type
         
     | 
| 35 | 
         
             
            from rag.nlp import search
         
     | 
| 36 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 37 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 38 | 
         | 
| 39 | 
         | 
| 40 | 
         
             
            @manager.route('/upload', methods=['POST'])
         
     | 
| 
         | 
|
| 98 | 
         
             
                        # file type
         
     | 
| 99 | 
         
             
                        filetype = filename_type(file_obj_names[file_len - 1])
         
     | 
| 100 | 
         
             
                        location = file_obj_names[file_len - 1]
         
     | 
| 101 | 
         
            +
                        while STORAGE_IMPL.obj_exist(last_folder.id, location):
         
     | 
| 102 | 
         
             
                            location += "_"
         
     | 
| 103 | 
         
             
                        blob = file_obj.read()
         
     | 
| 104 | 
         
             
                        filename = duplicate_name(
         
     | 
| 
         | 
|
| 260 | 
         
             
                                e, file = FileService.get_by_id(inner_file_id)
         
     | 
| 261 | 
         
             
                                if not e:
         
     | 
| 262 | 
         
             
                                    return get_data_error_result(retmsg="File not found!")
         
     | 
| 263 | 
         
            +
                                STORAGE_IMPL.rm(file.parent_id, file.location)
         
     | 
| 264 | 
         
             
                            FileService.delete_folder_by_pf_id(current_user.id, file_id)
         
     | 
| 265 | 
         
             
                        else:
         
     | 
| 266 | 
         
             
                            if not FileService.delete(file):
         
     | 
| 
         | 
|
| 333 | 
         
             
                    if not e:
         
     | 
| 334 | 
         
             
                        return get_data_error_result(retmsg="Document not found!")
         
     | 
| 335 | 
         
             
                    b, n = File2DocumentService.get_minio_address(file_id=file_id)
         
     | 
| 336 | 
         
            +
                    response = flask.make_response(STORAGE_IMPL.get(b, n))
         
     | 
| 337 | 
         
             
                    ext = re.search(r"\.([^.]+)$", file.name)
         
     | 
| 338 | 
         
             
                    if ext:
         
     | 
| 339 | 
         
             
                        if file.type == FileType.VISUAL.value:
         
     | 
    	
        api/apps/system_app.py
    CHANGED
    
    | 
         @@ -22,7 +22,7 @@ from api.utils.api_utils import get_json_result 
     | 
|
| 22 | 
         
             
            from api.versions import get_rag_version
         
     | 
| 23 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 24 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 25 | 
         
            -
            from rag.utils. 
     | 
| 26 | 
         
             
            from timeit import default_timer as timer
         
     | 
| 27 | 
         | 
| 28 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 
         @@ -47,7 +47,7 @@ def status(): 
     | 
|
| 47 | 
         | 
| 48 | 
         
             
                st = timer()
         
     | 
| 49 | 
         
             
                try:
         
     | 
| 50 | 
         
            -
                     
     | 
| 51 | 
         
             
                    res["minio"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
         
     | 
| 52 | 
         
             
                except Exception as e:
         
     | 
| 53 | 
         
             
                    res["minio"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
         
     | 
| 
         | 
|
| 22 | 
         
             
            from api.versions import get_rag_version
         
     | 
| 23 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 24 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 25 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 26 | 
         
             
            from timeit import default_timer as timer
         
     | 
| 27 | 
         | 
| 28 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 
         | 
|
| 47 | 
         | 
| 48 | 
         
             
                st = timer()
         
     | 
| 49 | 
         
             
                try:
         
     | 
| 50 | 
         
            +
                    STORAGE_IMPL.health()
         
     | 
| 51 | 
         
             
                    res["minio"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
         
     | 
| 52 | 
         
             
                except Exception as e:
         
     | 
| 53 | 
         
             
                    res["minio"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}
         
     | 
    	
        api/db/services/document_service.py
    CHANGED
    
    | 
         @@ -34,7 +34,7 @@ from api.utils.file_utils import get_project_base_directory 
     | 
|
| 34 | 
         
             
            from graphrag.mind_map_extractor import MindMapExtractor
         
     | 
| 35 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 36 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 37 | 
         
            -
            from rag.utils. 
     | 
| 38 | 
         
             
            from rag.nlp import search, rag_tokenizer
         
     | 
| 39 | 
         | 
| 40 | 
         
             
            from api.db import FileType, TaskStatus, ParserType, LLMType
         
     | 
| 
         @@ -473,7 +473,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id): 
     | 
|
| 473 | 
         
             
                        else:
         
     | 
| 474 | 
         
             
                            d["image"].save(output_buffer, format='JPEG')
         
     | 
| 475 | 
         | 
| 476 | 
         
            -
                         
     | 
| 477 | 
         
             
                        d["img_id"] = "{}-{}".format(kb.id, d["_id"])
         
     | 
| 478 | 
         
             
                        del d["image"]
         
     | 
| 479 | 
         
             
                        docs.append(d)
         
     | 
| 
         | 
|
| 34 | 
         
             
            from graphrag.mind_map_extractor import MindMapExtractor
         
     | 
| 35 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 36 | 
         
             
            from rag.utils.es_conn import ELASTICSEARCH
         
     | 
| 37 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 38 | 
         
             
            from rag.nlp import search, rag_tokenizer
         
     | 
| 39 | 
         | 
| 40 | 
         
             
            from api.db import FileType, TaskStatus, ParserType, LLMType
         
     | 
| 
         | 
|
| 473 | 
         
             
                        else:
         
     | 
| 474 | 
         
             
                            d["image"].save(output_buffer, format='JPEG')
         
     | 
| 475 | 
         | 
| 476 | 
         
            +
                        STORAGE_IMPL.put(kb.id, d["_id"], output_buffer.getvalue())
         
     | 
| 477 | 
         
             
                        d["img_id"] = "{}-{}".format(kb.id, d["_id"])
         
     | 
| 478 | 
         
             
                        del d["image"]
         
     | 
| 479 | 
         
             
                        docs.append(d)
         
     | 
    	
        api/db/services/file_service.py
    CHANGED
    
    | 
         @@ -27,7 +27,7 @@ from api.db.services.document_service import DocumentService 
     | 
|
| 27 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 28 | 
         
             
            from api.utils import get_uuid
         
     | 
| 29 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail
         
     | 
| 30 | 
         
            -
            from rag.utils. 
     | 
| 31 | 
         | 
| 32 | 
         | 
| 33 | 
         
             
            class FileService(CommonService):
         
     | 
| 
         @@ -350,10 +350,10 @@ class FileService(CommonService): 
     | 
|
| 350 | 
         
             
                                raise RuntimeError("This type of file has not been supported yet!")
         
     | 
| 351 | 
         | 
| 352 | 
         
             
                            location = filename
         
     | 
| 353 | 
         
            -
                            while  
     | 
| 354 | 
         
             
                                location += "_"
         
     | 
| 355 | 
         
             
                            blob = file.read()
         
     | 
| 356 | 
         
            -
                             
     | 
| 357 | 
         
             
                            doc = {
         
     | 
| 358 | 
         
             
                                "id": get_uuid(),
         
     | 
| 359 | 
         
             
                                "kb_id": kb.id,
         
     | 
| 
         | 
|
| 27 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 28 | 
         
             
            from api.utils import get_uuid
         
     | 
| 29 | 
         
             
            from api.utils.file_utils import filename_type, thumbnail
         
     | 
| 30 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 31 | 
         | 
| 32 | 
         | 
| 33 | 
         
             
            class FileService(CommonService):
         
     | 
| 
         | 
|
| 350 | 
         
             
                                raise RuntimeError("This type of file has not been supported yet!")
         
     | 
| 351 | 
         | 
| 352 | 
         
             
                            location = filename
         
     | 
| 353 | 
         
            +
                            while STORAGE_IMPL.obj_exist(kb.id, location):
         
     | 
| 354 | 
         
             
                                location += "_"
         
     | 
| 355 | 
         
             
                            blob = file.read()
         
     | 
| 356 | 
         
            +
                            STORAGE_IMPL.put(kb.id, location, blob)
         
     | 
| 357 | 
         
             
                            doc = {
         
     | 
| 358 | 
         
             
                                "id": get_uuid(),
         
     | 
| 359 | 
         
             
                                "kb_id": kb.id,
         
     | 
    	
        api/db/services/task_service.py
    CHANGED
    
    | 
         @@ -27,7 +27,7 @@ from api.db.services.document_service import DocumentService 
     | 
|
| 27 | 
         
             
            from api.utils import current_timestamp, get_uuid
         
     | 
| 28 | 
         
             
            from deepdoc.parser.excel_parser import RAGFlowExcelParser
         
     | 
| 29 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 30 | 
         
            -
            from rag.utils. 
     | 
| 31 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 32 | 
         | 
| 33 | 
         | 
| 
         @@ -143,7 +143,7 @@ def queue_tasks(doc, bucket, name): 
     | 
|
| 143 | 
         
             
                tsks = []
         
     | 
| 144 | 
         | 
| 145 | 
         
             
                if doc["type"] == FileType.PDF.value:
         
     | 
| 146 | 
         
            -
                    file_bin =  
     | 
| 147 | 
         
             
                    do_layout = doc["parser_config"].get("layout_recognize", True)
         
     | 
| 148 | 
         
             
                    pages = PdfParser.total_page_number(doc["name"], file_bin)
         
     | 
| 149 | 
         
             
                    page_size = doc["parser_config"].get("task_page_size", 12)
         
     | 
| 
         @@ -169,7 +169,7 @@ def queue_tasks(doc, bucket, name): 
     | 
|
| 169 | 
         
             
                            tsks.append(task)
         
     | 
| 170 | 
         | 
| 171 | 
         
             
                elif doc["parser_id"] == "table":
         
     | 
| 172 | 
         
            -
                    file_bin =  
     | 
| 173 | 
         
             
                    rn = RAGFlowExcelParser.row_number(
         
     | 
| 174 | 
         
             
                        doc["name"], file_bin)
         
     | 
| 175 | 
         
             
                    for i in range(0, rn, 3000):
         
     | 
| 
         | 
|
| 27 | 
         
             
            from api.utils import current_timestamp, get_uuid
         
     | 
| 28 | 
         
             
            from deepdoc.parser.excel_parser import RAGFlowExcelParser
         
     | 
| 29 | 
         
             
            from rag.settings import SVR_QUEUE_NAME
         
     | 
| 30 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 31 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 32 | 
         | 
| 33 | 
         | 
| 
         | 
|
| 143 | 
         
             
                tsks = []
         
     | 
| 144 | 
         | 
| 145 | 
         
             
                if doc["type"] == FileType.PDF.value:
         
     | 
| 146 | 
         
            +
                    file_bin = STORAGE_IMPL.get(bucket, name)
         
     | 
| 147 | 
         
             
                    do_layout = doc["parser_config"].get("layout_recognize", True)
         
     | 
| 148 | 
         
             
                    pages = PdfParser.total_page_number(doc["name"], file_bin)
         
     | 
| 149 | 
         
             
                    page_size = doc["parser_config"].get("task_page_size", 12)
         
     | 
| 
         | 
|
| 169 | 
         
             
                            tsks.append(task)
         
     | 
| 170 | 
         | 
| 171 | 
         
             
                elif doc["parser_id"] == "table":
         
     | 
| 172 | 
         
            +
                    file_bin = STORAGE_IMPL.get(bucket, name)
         
     | 
| 173 | 
         
             
                    rn = RAGFlowExcelParser.row_number(
         
     | 
| 174 | 
         
             
                        doc["name"], file_bin)
         
     | 
| 175 | 
         
             
                    for i in range(0, rn, 3000):
         
     | 
    	
        conf/service_conf.yaml
    CHANGED
    
    | 
         @@ -13,6 +13,22 @@ minio: 
     | 
|
| 13 | 
         
             
              user: 'rag_flow'
         
     | 
| 14 | 
         
             
              password: 'infini_rag_flow'
         
     | 
| 15 | 
         
             
              host: 'minio:9000'
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 16 | 
         
             
            es:
         
     | 
| 17 | 
         
             
              hosts: 'http://es01:9200'
         
     | 
| 18 | 
         
             
              username: 'elastic'
         
     | 
| 
         | 
|
| 13 | 
         
             
              user: 'rag_flow'
         
     | 
| 14 | 
         
             
              password: 'infini_rag_flow'
         
     | 
| 15 | 
         
             
              host: 'minio:9000'
         
     | 
| 16 | 
         
            +
            azure:
         
     | 
| 17 | 
         
            +
              auth_type: 'sas'
         
     | 
| 18 | 
         
            +
              container_url: 'container_url'
         
     | 
| 19 | 
         
            +
              sas_token: 'sas_token'
         
     | 
| 20 | 
         
            +
            #azure:
         
     | 
| 21 | 
         
            +
            #  auth_type: 'spn'
         
     | 
| 22 | 
         
            +
            #  account_url: 'account_url'
         
     | 
| 23 | 
         
            +
            #  client_id: 'client_id'
         
     | 
| 24 | 
         
            +
            #  secret: 'secret'
         
     | 
| 25 | 
         
            +
            #  tenant_id: 'tenant_id'
         
     | 
| 26 | 
         
            +
            #  container_name: 'container_name'
         
     | 
| 27 | 
         
            +
            s3:
         
     | 
| 28 | 
         
            +
              endpoint: 'endpoint'
         
     | 
| 29 | 
         
            +
              access_key: 'access_key'
         
     | 
| 30 | 
         
            +
              secret_key: 'secret_key'
         
     | 
| 31 | 
         
            +
              region: 'region'
         
     | 
| 32 | 
         
             
            es:
         
     | 
| 33 | 
         
             
              hosts: 'http://es01:9200'
         
     | 
| 34 | 
         
             
              username: 'elastic'
         
     | 
    	
        rag/settings.py
    CHANGED
    
    | 
         @@ -24,6 +24,8 @@ RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf") 
     | 
|
| 24 | 
         
             
            SUBPROCESS_STD_LOG_NAME = "std.log"
         
     | 
| 25 | 
         | 
| 26 | 
         
             
            ES = get_base_config("es", {})
         
     | 
| 
         | 
|
| 
         | 
|
| 27 | 
         
             
            MINIO = decrypt_database_config(name="minio")
         
     | 
| 28 | 
         
             
            try:
         
     | 
| 29 | 
         
             
                REDIS = decrypt_database_config(name="redis")
         
     | 
| 
         @@ -43,6 +45,8 @@ LoggerFactory.LEVEL = 30 
     | 
|
| 43 | 
         | 
| 44 | 
         
             
            es_logger = getLogger("es")
         
     | 
| 45 | 
         
             
            minio_logger = getLogger("minio")
         
     | 
| 
         | 
|
| 
         | 
|
| 46 | 
         
             
            cron_logger = getLogger("cron_logger")
         
     | 
| 47 | 
         
             
            cron_logger.setLevel(20)
         
     | 
| 48 | 
         
             
            chunk_logger = getLogger("chunk_logger")
         
     | 
| 
         | 
|
| 24 | 
         
             
            SUBPROCESS_STD_LOG_NAME = "std.log"
         
     | 
| 25 | 
         | 
| 26 | 
         
             
            ES = get_base_config("es", {})
         
     | 
| 27 | 
         
            +
            AZURE = get_base_config("azure", {})
         
     | 
| 28 | 
         
            +
            S3 = get_base_config("s3", {})
         
     | 
| 29 | 
         
             
            MINIO = decrypt_database_config(name="minio")
         
     | 
| 30 | 
         
             
            try:
         
     | 
| 31 | 
         
             
                REDIS = decrypt_database_config(name="redis")
         
     | 
| 
         | 
|
| 45 | 
         | 
| 46 | 
         
             
            es_logger = getLogger("es")
         
     | 
| 47 | 
         
             
            minio_logger = getLogger("minio")
         
     | 
| 48 | 
         
            +
            s3_logger = getLogger("s3")
         
     | 
| 49 | 
         
            +
            azure_logger = getLogger("azure")
         
     | 
| 50 | 
         
             
            cron_logger = getLogger("cron_logger")
         
     | 
| 51 | 
         
             
            cron_logger.setLevel(20)
         
     | 
| 52 | 
         
             
            chunk_logger = getLogger("chunk_logger")
         
     | 
    	
        rag/svr/cache_file_svr.py
    CHANGED
    
    | 
         @@ -20,7 +20,7 @@ import traceback 
     | 
|
| 20 | 
         
             
            from api.db.db_models import close_connection
         
     | 
| 21 | 
         
             
            from api.db.services.task_service import TaskService
         
     | 
| 22 | 
         
             
            from rag.settings import cron_logger
         
     | 
| 23 | 
         
            -
            from rag.utils. 
     | 
| 24 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 25 | 
         | 
| 26 | 
         | 
| 
         @@ -42,7 +42,7 @@ def main(): 
     | 
|
| 42 | 
         
             
                            try:
         
     | 
| 43 | 
         
             
                                key = "{}/{}".format(kb_id, loc)
         
     | 
| 44 | 
         
             
                                if REDIS_CONN.exist(key):continue
         
     | 
| 45 | 
         
            -
                                file_bin =  
     | 
| 46 | 
         
             
                                REDIS_CONN.transaction(key, file_bin, 12 * 60)
         
     | 
| 47 | 
         
             
                                cron_logger.info("CACHE: {}".format(loc))
         
     | 
| 48 | 
         
             
                            except Exception as e:
         
     | 
| 
         | 
|
| 20 | 
         
             
            from api.db.db_models import close_connection
         
     | 
| 21 | 
         
             
            from api.db.services.task_service import TaskService
         
     | 
| 22 | 
         
             
            from rag.settings import cron_logger
         
     | 
| 23 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 24 | 
         
             
            from rag.utils.redis_conn import REDIS_CONN
         
     | 
| 25 | 
         | 
| 26 | 
         | 
| 
         | 
|
| 42 | 
         
             
                            try:
         
     | 
| 43 | 
         
             
                                key = "{}/{}".format(kb_id, loc)
         
     | 
| 44 | 
         
             
                                if REDIS_CONN.exist(key):continue
         
     | 
| 45 | 
         
            +
                                file_bin = STORAGE_IMPL.get(kb_id, loc)
         
     | 
| 46 | 
         
             
                                REDIS_CONN.transaction(key, file_bin, 12 * 60)
         
     | 
| 47 | 
         
             
                                cron_logger.info("CACHE: {}".format(loc))
         
     | 
| 48 | 
         
             
                            except Exception as e:
         
     | 
    	
        rag/svr/task_executor.py
    CHANGED
    
    | 
         @@ -29,7 +29,7 @@ from functools import partial 
     | 
|
| 29 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 30 | 
         
             
            from api.settings import retrievaler
         
     | 
| 31 | 
         
             
            from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
         
     | 
| 32 | 
         
            -
            from rag.utils. 
     | 
| 33 | 
         
             
            from api.db.db_models import close_connection
         
     | 
| 34 | 
         
             
            from rag.settings import database_logger, SVR_QUEUE_NAME
         
     | 
| 35 | 
         
             
            from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
         
     | 
| 
         @@ -138,7 +138,7 @@ def collect(): 
     | 
|
| 138 | 
         | 
| 139 | 
         | 
| 140 | 
         
             
            def get_minio_binary(bucket, name):
         
     | 
| 141 | 
         
            -
                return  
     | 
| 142 | 
         | 
| 143 | 
         | 
| 144 | 
         
             
            def build(row):
         
     | 
| 
         @@ -214,7 +214,7 @@ def build(row): 
     | 
|
| 214 | 
         
             
                            d["image"].save(output_buffer, format='JPEG')
         
     | 
| 215 | 
         | 
| 216 | 
         
             
                        st = timer()
         
     | 
| 217 | 
         
            -
                         
     | 
| 218 | 
         
             
                        el += timer() - st
         
     | 
| 219 | 
         
             
                    except Exception as e:
         
     | 
| 220 | 
         
             
                        cron_logger.error(str(e))
         
     | 
| 
         | 
|
| 29 | 
         
             
            from api.db.services.file2document_service import File2DocumentService
         
     | 
| 30 | 
         
             
            from api.settings import retrievaler
         
     | 
| 31 | 
         
             
            from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
         
     | 
| 32 | 
         
            +
            from rag.utils.storage_factory import STORAGE_IMPL
         
     | 
| 33 | 
         
             
            from api.db.db_models import close_connection
         
     | 
| 34 | 
         
             
            from rag.settings import database_logger, SVR_QUEUE_NAME
         
     | 
| 35 | 
         
             
            from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
         
     | 
| 
         | 
|
| 138 | 
         | 
| 139 | 
         | 
| 140 | 
         
             
            def get_minio_binary(bucket, name):
         
     | 
| 141 | 
         
            +
                return STORAGE_IMPL.get(bucket, name)
         
     | 
| 142 | 
         | 
| 143 | 
         | 
| 144 | 
         
             
            def build(row):
         
     | 
| 
         | 
|
| 214 | 
         
             
                            d["image"].save(output_buffer, format='JPEG')
         
     | 
| 215 | 
         | 
| 216 | 
         
             
                        st = timer()
         
     | 
| 217 | 
         
            +
                        STORAGE_IMPL.put(row["kb_id"], d["_id"], output_buffer.getvalue())
         
     | 
| 218 | 
         
             
                        el += timer() - st
         
     | 
| 219 | 
         
             
                    except Exception as e:
         
     | 
| 220 | 
         
             
                        cron_logger.error(str(e))
         
     | 
    	
        rag/utils/azure_sas_conn.py
    ADDED
    
    | 
         @@ -0,0 +1,80 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            import time
         
     | 
| 3 | 
         
            +
            from io import BytesIO
         
     | 
| 4 | 
         
            +
            from rag import settings
         
     | 
| 5 | 
         
            +
            from rag.settings import azure_logger
         
     | 
| 6 | 
         
            +
            from rag.utils import singleton
         
     | 
| 7 | 
         
            +
            from azure.storage.blob import ContainerClient
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            @singleton
         
     | 
| 11 | 
         
            +
            class RAGFlowAzureSasBlob(object):
         
     | 
| 12 | 
         
            +
                def __init__(self):
         
     | 
| 13 | 
         
            +
                    self.conn = None
         
     | 
| 14 | 
         
            +
                    self.container_url = os.getenv('CONTAINER_URL', settings.AZURE["container_url"])
         
     | 
| 15 | 
         
            +
                    self.sas_token = os.getenv('SAS_TOKEN', settings.AZURE["sas_token"])
         
     | 
| 16 | 
         
            +
                    self.__open__()
         
     | 
| 17 | 
         
            +
             
     | 
| 18 | 
         
            +
                def __open__(self):
         
     | 
| 19 | 
         
            +
                    try:
         
     | 
| 20 | 
         
            +
                        if self.conn:
         
     | 
| 21 | 
         
            +
                            self.__close__()
         
     | 
| 22 | 
         
            +
                    except Exception as e:
         
     | 
| 23 | 
         
            +
                        pass
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
                    try:
         
     | 
| 26 | 
         
            +
                        self.conn = ContainerClient.from_container_url(self.account_url + "?" + self.sas_token)
         
     | 
| 27 | 
         
            +
                    except Exception as e:
         
     | 
| 28 | 
         
            +
                        azure_logger.error(
         
     | 
| 29 | 
         
            +
                            "Fail to connect %s " % self.account_url + str(e))
         
     | 
| 30 | 
         
            +
             
     | 
| 31 | 
         
            +
                def __close__(self):
         
     | 
| 32 | 
         
            +
                    del self.conn
         
     | 
| 33 | 
         
            +
                    self.conn = None
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
                def health(self):
         
     | 
| 36 | 
         
            +
                    bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
         
     | 
| 37 | 
         
            +
                    return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
         
     | 
| 38 | 
         
            +
             
     | 
| 39 | 
         
            +
                def put(self, bucket, fnm, binary):
         
     | 
| 40 | 
         
            +
                    for _ in range(3):
         
     | 
| 41 | 
         
            +
                        try:
         
     | 
| 42 | 
         
            +
                            return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
         
     | 
| 43 | 
         
            +
                        except Exception as e:
         
     | 
| 44 | 
         
            +
                            azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
         
     | 
| 45 | 
         
            +
                            self.__open__()
         
     | 
| 46 | 
         
            +
                            time.sleep(1)
         
     | 
| 47 | 
         
            +
             
     | 
| 48 | 
         
            +
                def rm(self, bucket, fnm):
         
     | 
| 49 | 
         
            +
                    try:
         
     | 
| 50 | 
         
            +
                        self.conn.delete_blob(fnm)
         
     | 
| 51 | 
         
            +
                    except Exception as e:
         
     | 
| 52 | 
         
            +
                        azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
         
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
            +
                def get(self, bucket, fnm):
         
     | 
| 55 | 
         
            +
                    for _ in range(1):
         
     | 
| 56 | 
         
            +
                        try:
         
     | 
| 57 | 
         
            +
                            r = self.conn.download_blob(fnm)
         
     | 
| 58 | 
         
            +
                            return r.read()
         
     | 
| 59 | 
         
            +
                        except Exception as e:
         
     | 
| 60 | 
         
            +
                            azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
         
     | 
| 61 | 
         
            +
                            self.__open__()
         
     | 
| 62 | 
         
            +
                            time.sleep(1)
         
     | 
| 63 | 
         
            +
                    return
         
     | 
| 64 | 
         
            +
             
     | 
| 65 | 
         
            +
                def obj_exist(self, bucket, fnm):
         
     | 
| 66 | 
         
            +
                    try:
         
     | 
| 67 | 
         
            +
                        return self.conn.get_blob_client(fnm).exists()
         
     | 
| 68 | 
         
            +
                    except Exception as e:
         
     | 
| 69 | 
         
            +
                        azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
         
     | 
| 70 | 
         
            +
                    return False
         
     | 
| 71 | 
         
            +
             
     | 
| 72 | 
         
            +
                def get_presigned_url(self, bucket, fnm, expires):
         
     | 
| 73 | 
         
            +
                    for _ in range(10):
         
     | 
| 74 | 
         
            +
                        try:
         
     | 
| 75 | 
         
            +
                            return self.conn.get_presigned_url("GET", bucket, fnm, expires)
         
     | 
| 76 | 
         
            +
                        except Exception as e:
         
     | 
| 77 | 
         
            +
                            azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
         
     | 
| 78 | 
         
            +
                            self.__open__()
         
     | 
| 79 | 
         
            +
                            time.sleep(1)
         
     | 
| 80 | 
         
            +
                    return
         
     | 
    	
        rag/utils/azure_spn_conn.py
    ADDED
    
    | 
         @@ -0,0 +1,90 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            import time
         
     | 
| 3 | 
         
            +
            from rag import settings
         
     | 
| 4 | 
         
            +
            from rag.settings import azure_logger
         
     | 
| 5 | 
         
            +
            from rag.utils import singleton
         
     | 
| 6 | 
         
            +
            from azure.identity import ClientSecretCredential, AzureAuthorityHosts
         
     | 
| 7 | 
         
            +
            from azure.storage.filedatalake import FileSystemClient
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            @singleton
         
     | 
| 11 | 
         
            +
            class RAGFlowAzureSpnBlob(object):
         
     | 
| 12 | 
         
            +
                def __init__(self):
         
     | 
| 13 | 
         
            +
                    self.conn = None
         
     | 
| 14 | 
         
            +
                    self.account_url = os.getenv('ACCOUNT_URL', settings.AZURE["account_url"])
         
     | 
| 15 | 
         
            +
                    self.client_id = os.getenv('CLIENT_ID', settings.AZURE["client_id"])
         
     | 
| 16 | 
         
            +
                    self.secret = os.getenv('SECRET', settings.AZURE["secret"])
         
     | 
| 17 | 
         
            +
                    self.tenant_id = os.getenv('TENANT_ID', settings.AZURE["tenant_id"])
         
     | 
| 18 | 
         
            +
                    self.container_name = os.getenv('CONTAINER_NAME', settings.AZURE["container_name"])
         
     | 
| 19 | 
         
            +
                    self.__open__()
         
     | 
| 20 | 
         
            +
             
     | 
| 21 | 
         
            +
                def __open__(self):
         
     | 
| 22 | 
         
            +
                    try:
         
     | 
| 23 | 
         
            +
                        if self.conn:
         
     | 
| 24 | 
         
            +
                            self.__close__()
         
     | 
| 25 | 
         
            +
                    except Exception as e:
         
     | 
| 26 | 
         
            +
                        pass
         
     | 
| 27 | 
         
            +
             
     | 
| 28 | 
         
            +
                    try:
         
     | 
| 29 | 
         
            +
                        credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
         
     | 
| 30 | 
         
            +
                        self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
         
     | 
| 31 | 
         
            +
                    except Exception as e:
         
     | 
| 32 | 
         
            +
                        azure_logger.error(
         
     | 
| 33 | 
         
            +
                            "Fail to connect %s " % self.account_url + str(e))
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
                def __close__(self):
         
     | 
| 36 | 
         
            +
                    del self.conn
         
     | 
| 37 | 
         
            +
                    self.conn = None
         
     | 
| 38 | 
         
            +
             
     | 
| 39 | 
         
            +
                def health(self):
         
     | 
| 40 | 
         
            +
                    bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
         
     | 
| 41 | 
         
            +
                    f = self.conn.create_file(fnm)
         
     | 
| 42 | 
         
            +
                    f.append_data(binary, offset=0, length=len(binary))
         
     | 
| 43 | 
         
            +
                    return f.flush_data(len(binary))
         
     | 
| 44 | 
         
            +
             
     | 
| 45 | 
         
            +
                def put(self, bucket, fnm, binary):
         
     | 
| 46 | 
         
            +
                    for _ in range(3):
         
     | 
| 47 | 
         
            +
                        try:
         
     | 
| 48 | 
         
            +
                            f = self.conn.create_file(fnm)
         
     | 
| 49 | 
         
            +
                            f.append_data(binary, offset=0, length=len(binary))
         
     | 
| 50 | 
         
            +
                            return f.flush_data(len(binary))
         
     | 
| 51 | 
         
            +
                        except Exception as e:
         
     | 
| 52 | 
         
            +
                            azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
         
     | 
| 53 | 
         
            +
                            self.__open__()
         
     | 
| 54 | 
         
            +
                            time.sleep(1)
         
     | 
| 55 | 
         
            +
             
     | 
| 56 | 
         
            +
                def rm(self, bucket, fnm):
         
     | 
| 57 | 
         
            +
                    try:
         
     | 
| 58 | 
         
            +
                        self.conn.delete_file(fnm)
         
     | 
| 59 | 
         
            +
                    except Exception as e:
         
     | 
| 60 | 
         
            +
                        azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
         
     | 
| 61 | 
         
            +
             
     | 
| 62 | 
         
            +
                def get(self, bucket, fnm):
         
     | 
| 63 | 
         
            +
                    for _ in range(1):
         
     | 
| 64 | 
         
            +
                        try:
         
     | 
| 65 | 
         
            +
                            client = self.conn.get_file_client(fnm)
         
     | 
| 66 | 
         
            +
                            r = client.download_file()
         
     | 
| 67 | 
         
            +
                            return r.read()
         
     | 
| 68 | 
         
            +
                        except Exception as e:
         
     | 
| 69 | 
         
            +
                            azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
         
     | 
| 70 | 
         
            +
                            self.__open__()
         
     | 
| 71 | 
         
            +
                            time.sleep(1)
         
     | 
| 72 | 
         
            +
                    return
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
                def obj_exist(self, bucket, fnm):
         
     | 
| 75 | 
         
            +
                    try:
         
     | 
| 76 | 
         
            +
                        client = self.conn.get_file_client(fnm)
         
     | 
| 77 | 
         
            +
                        return client.exists()
         
     | 
| 78 | 
         
            +
                    except Exception as e:
         
     | 
| 79 | 
         
            +
                        azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
         
     | 
| 80 | 
         
            +
                    return False
         
     | 
| 81 | 
         
            +
             
     | 
| 82 | 
         
            +
                def get_presigned_url(self, bucket, fnm, expires):
         
     | 
| 83 | 
         
            +
                    for _ in range(10):
         
     | 
| 84 | 
         
            +
                        try:
         
     | 
| 85 | 
         
            +
                            return self.conn.get_presigned_url("GET", bucket, fnm, expires)
         
     | 
| 86 | 
         
            +
                        except Exception as e:
         
     | 
| 87 | 
         
            +
                            azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
         
     | 
| 88 | 
         
            +
                            self.__open__()
         
     | 
| 89 | 
         
            +
                            time.sleep(1)
         
     | 
| 90 | 
         
            +
                    return
         
     | 
    	
        rag/utils/s3_conn.py
    ADDED
    
    | 
         @@ -0,0 +1,135 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import boto3
         
     | 
| 2 | 
         
            +
            import os
         
     | 
| 3 | 
         
            +
            from botocore.exceptions import ClientError
         
     | 
| 4 | 
         
            +
            from botocore.client import Config
         
     | 
| 5 | 
         
            +
            import time
         
     | 
| 6 | 
         
            +
            from io import BytesIO
         
     | 
| 7 | 
         
            +
            from rag.settings import s3_logger
         
     | 
| 8 | 
         
            +
            from rag.utils import singleton
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            @singleton
         
     | 
| 11 | 
         
            +
            class RAGFlowS3(object):
         
     | 
| 12 | 
         
            +
                def __init__(self):
         
     | 
| 13 | 
         
            +
                    self.conn = None
         
     | 
| 14 | 
         
            +
                    self.endpoint = os.getenv('ENDPOINT', None)
         
     | 
| 15 | 
         
            +
                    self.access_key = os.getenv('ACCESS_KEY', None)
         
     | 
| 16 | 
         
            +
                    self.secret_key = os.getenv('SECRET_KEY', None)
         
     | 
| 17 | 
         
            +
                    self.region = os.getenv('REGION', None)
         
     | 
| 18 | 
         
            +
                    self.__open__()
         
     | 
| 19 | 
         
            +
             
     | 
| 20 | 
         
            +
                def __open__(self):
         
     | 
| 21 | 
         
            +
                    try:
         
     | 
| 22 | 
         
            +
                        if self.conn:
         
     | 
| 23 | 
         
            +
                            self.__close__()
         
     | 
| 24 | 
         
            +
                    except Exception as e:
         
     | 
| 25 | 
         
            +
                        pass
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
                    try:
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
                        config = Config(
         
     | 
| 30 | 
         
            +
                            s3={
         
     | 
| 31 | 
         
            +
                                'addressing_style': 'virtual'
         
     | 
| 32 | 
         
            +
                            }
         
     | 
| 33 | 
         
            +
                        )
         
     | 
| 34 | 
         
            +
             
     | 
| 35 | 
         
            +
                        self.conn = boto3.client(
         
     | 
| 36 | 
         
            +
                            's3',
         
     | 
| 37 | 
         
            +
                            endpoint_url=self.endpoint,
         
     | 
| 38 | 
         
            +
                            region_name=self.region,
         
     | 
| 39 | 
         
            +
                            aws_access_key_id=self.access_key,
         
     | 
| 40 | 
         
            +
                            aws_secret_access_key=self.secret_key,
         
     | 
| 41 | 
         
            +
                            config=config
         
     | 
| 42 | 
         
            +
                        )
         
     | 
| 43 | 
         
            +
                    except Exception as e:
         
     | 
| 44 | 
         
            +
                        s3_logger.error(
         
     | 
| 45 | 
         
            +
                            "Fail to connect %s " % self.endpoint + str(e))
         
     | 
| 46 | 
         
            +
             
     | 
| 47 | 
         
            +
                def __close__(self):
         
     | 
| 48 | 
         
            +
                    del self.conn
         
     | 
| 49 | 
         
            +
                    self.conn = None
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         
            +
                def bucket_exists(self, bucket):
         
     | 
| 52 | 
         
            +
                    try:
         
     | 
| 53 | 
         
            +
                        s3_logger.error(f"head_bucket bucketname  {bucket}")
         
     | 
| 54 | 
         
            +
                        self.conn.head_bucket(Bucket=bucket)
         
     | 
| 55 | 
         
            +
                        exists = True
         
     | 
| 56 | 
         
            +
                    except ClientError as e:
         
     | 
| 57 | 
         
            +
                        s3_logger.error(f"head_bucket error {bucket}: " + str(e))
         
     | 
| 58 | 
         
            +
                        exists = False
         
     | 
| 59 | 
         
            +
                    return exists
         
     | 
| 60 | 
         
            +
             
     | 
| 61 | 
         
            +
                def health(self):
         
     | 
| 62 | 
         
            +
                    bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
         
     | 
| 63 | 
         
            +
             
     | 
| 64 | 
         
            +
                    if not self.bucket_exists(bucket):
         
     | 
| 65 | 
         
            +
                        self.conn.create_bucket(Bucket=bucket)
         
     | 
| 66 | 
         
            +
                        s3_logger.error(f"create bucket {bucket} ********")
         
     | 
| 67 | 
         
            +
             
     | 
| 68 | 
         
            +
                    r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
         
     | 
| 69 | 
         
            +
                    return r
         
     | 
| 70 | 
         
            +
             
     | 
| 71 | 
         
            +
                def get_properties(self, bucket, key):
         
     | 
| 72 | 
         
            +
                    return {}
         
     | 
| 73 | 
         
            +
             
     | 
| 74 | 
         
            +
                def list(self, bucket, dir, recursive=True):
         
     | 
| 75 | 
         
            +
                    return []
         
     | 
| 76 | 
         
            +
             
     | 
| 77 | 
         
            +
                def put(self, bucket, fnm, binary):
         
     | 
| 78 | 
         
            +
                    s3_logger.error(f"bucket name {bucket}; filename :{fnm}:")
         
     | 
| 79 | 
         
            +
                    for _ in range(1):
         
     | 
| 80 | 
         
            +
                        try:
         
     | 
| 81 | 
         
            +
                            if not self.bucket_exists(bucket):
         
     | 
| 82 | 
         
            +
                                self.conn.create_bucket(Bucket=bucket)
         
     | 
| 83 | 
         
            +
                                s3_logger.error(f"create bucket {bucket} ********")
         
     | 
| 84 | 
         
            +
                            r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
         
     | 
| 85 | 
         
            +
             
     | 
| 86 | 
         
            +
                            return r
         
     | 
| 87 | 
         
            +
                        except Exception as e:
         
     | 
| 88 | 
         
            +
                            s3_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
         
     | 
| 89 | 
         
            +
                            self.__open__()
         
     | 
| 90 | 
         
            +
                            time.sleep(1)
         
     | 
| 91 | 
         
            +
             
     | 
| 92 | 
         
            +
                def rm(self, bucket, fnm):
         
     | 
| 93 | 
         
            +
                    try:
         
     | 
| 94 | 
         
            +
                        self.conn.delete_object(Bucket=bucket, Key=fnm)
         
     | 
| 95 | 
         
            +
                    except Exception as e:
         
     | 
| 96 | 
         
            +
                        s3_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         
            +
                def get(self, bucket, fnm):
         
     | 
| 99 | 
         
            +
                    for _ in range(1):
         
     | 
| 100 | 
         
            +
                        try:
         
     | 
| 101 | 
         
            +
                            r = self.conn.get_object(Bucket=bucket, Key=fnm)
         
     | 
| 102 | 
         
            +
                            object_data = r['Body'].read()
         
     | 
| 103 | 
         
            +
                            return object_data
         
     | 
| 104 | 
         
            +
                        except Exception as e:
         
     | 
| 105 | 
         
            +
                            s3_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
         
     | 
| 106 | 
         
            +
                            self.__open__()
         
     | 
| 107 | 
         
            +
                            time.sleep(1)
         
     | 
| 108 | 
         
            +
                    return
         
     | 
| 109 | 
         
            +
             
     | 
| 110 | 
         
            +
                def obj_exist(self, bucket, fnm):
         
     | 
| 111 | 
         
            +
                    try:
         
     | 
| 112 | 
         
            +
             
     | 
| 113 | 
         
            +
                        if self.conn.head_object(Bucket=bucket, Key=fnm):
         
     | 
| 114 | 
         
            +
                            return True
         
     | 
| 115 | 
         
            +
                    except ClientError as e:
         
     | 
| 116 | 
         
            +
                        if e.response['Error']['Code'] == '404':
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
                            return False
         
     | 
| 119 | 
         
            +
                        else:
         
     | 
| 120 | 
         
            +
                            raise
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
                def get_presigned_url(self, bucket, fnm, expires):
         
     | 
| 123 | 
         
            +
                    for _ in range(10):
         
     | 
| 124 | 
         
            +
                        try:
         
     | 
| 125 | 
         
            +
                            r = self.conn.generate_presigned_url('get_object',
         
     | 
| 126 | 
         
            +
                                                                 Params={'Bucket': bucket,
         
     | 
| 127 | 
         
            +
                                                                         'Key': fnm},
         
     | 
| 128 | 
         
            +
                                                                 ExpiresIn=expires)
         
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
            +
                            return r
         
     | 
| 131 | 
         
            +
                        except Exception as e:
         
     | 
| 132 | 
         
            +
                            s3_logger.error(f"fail get url {bucket}/{fnm}: " + str(e))
         
     | 
| 133 | 
         
            +
                            self.__open__()
         
     | 
| 134 | 
         
            +
                            time.sleep(1)
         
     | 
| 135 | 
         
            +
                    return
         
     | 
    	
        rag/utils/storage_factory.py
    ADDED
    
    | 
         @@ -0,0 +1,30 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import os
         
     | 
| 2 | 
         
            +
            from enum import Enum
         
     | 
| 3 | 
         
            +
             
     | 
| 4 | 
         
            +
            from rag.utils.azure_sas_conn import RAGFlowAzureSasBlob
         
     | 
| 5 | 
         
            +
            from rag.utils.azure_spn_conn import RAGFlowAzureSpnBlob
         
     | 
| 6 | 
         
            +
            from rag.utils.minio_conn import RAGFlowMinio
         
     | 
| 7 | 
         
            +
            from rag.utils.s3_conn import RAGFlowS3
         
     | 
| 8 | 
         
            +
             
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            class Storage(Enum):
         
     | 
| 11 | 
         
            +
                MINIO = 1
         
     | 
| 12 | 
         
            +
                AZURE_SPN = 2
         
     | 
| 13 | 
         
            +
                AZURE_SAS = 3
         
     | 
| 14 | 
         
            +
                AWS_S3 = 4
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
             
     | 
| 17 | 
         
            +
            class StorageFactory:
         
     | 
| 18 | 
         
            +
                storage_mapping = {
         
     | 
| 19 | 
         
            +
                    Storage.MINIO: RAGFlowMinio,
         
     | 
| 20 | 
         
            +
                    Storage.AZURE_SPN: RAGFlowAzureSpnBlob,
         
     | 
| 21 | 
         
            +
                    Storage.AZURE_SAS: RAGFlowAzureSasBlob,
         
     | 
| 22 | 
         
            +
                    Storage.AWS_S3: RAGFlowS3,
         
     | 
| 23 | 
         
            +
                }
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
                @classmethod
         
     | 
| 26 | 
         
            +
                def create(cls, storage: Storage):
         
     | 
| 27 | 
         
            +
                    return cls.storage_mapping[storage]()
         
     | 
| 28 | 
         
            +
             
     | 
| 29 | 
         
            +
             
     | 
| 30 | 
         
            +
            STORAGE_IMPL = StorageFactory.create(Storage[os.getenv('STORAGE_IMPL', 'MINIO')])
         
     | 
    	
        requirements.txt
    CHANGED
    
    | 
         @@ -1,5 +1,7 @@ 
     | 
|
| 1 | 
         
            -
             
     | 
| 2 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 3 | 
         
             
            arxiv==2.1.3
         
     | 
| 4 | 
         
             
            Aspose.Slides==24.2.0
         
     | 
| 5 | 
         
             
            BCEmbedding==0.1.3
         
     | 
| 
         | 
|
| 1 | 
         
            +
            azure-storage-blob==12.22.0
         
     | 
| 2 | 
         
            +
            azure-identity==1.17.1
         
     | 
| 3 | 
         
            +
            azure-storage-file-datalake==12.16.0
         
     | 
| 4 | 
         
            +
            anthropic===0.34.1
         
     | 
| 5 | 
         
             
            arxiv==2.1.3
         
     | 
| 6 | 
         
             
            Aspose.Slides==24.2.0
         
     | 
| 7 | 
         
             
            BCEmbedding==0.1.3
         
     |