File size: 3,997 Bytes
8e29341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

#app_to_vectorstore.py

import os
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from modules import common_utils,file_utils
from modules import app_logger
# Assuming all necessary loader classes are imported

from modules import app_constants

app_logger = app_logger.app_logger

TEMP_DIR = app_constants.WORKSPACE_DIRECTORY + "tmp"
DB_DIR = app_constants.WORKSPACE_DIRECTORY + "db"

processed_files_record = os.path.join(app_constants.WORKSPACE_DIRECTORY, app_constants.PROCESSED_DOCS)

def load_documents_from_jsonl(file_path, loader_class):
    """Load every record of a JSON Lines file through ``loader_class``.

    The loader is built in JSONL mode: one JSON object per line, raw
    objects rather than extracted text, and an identity jq schema.

    Returns:
        The document list produced by ``loader.load()``, or ``None`` when
        construction or loading raises (the error is logged, not raised).
    """
    try:
        jsonl_loader = loader_class(
            file_path,
            json_lines=True,
            text_content=False,
            jq_schema='.',
        )
        return jsonl_loader.load()
    except Exception as err:
        app_logger.error(f"Error loading documents from JSONL file {file_path}: {err}")
        return None

def update_processed_files_record(file_md5, module, file_path):
    """Append one ``md5,module,path`` row to the processed-files record.

    Opening in append mode creates the record file on first use.
    Failures are logged rather than raised, so callers never see an
    exception from here.
    """
    try:
        with open(processed_files_record, 'a') as record:
            record.write(f"{file_md5},{module},{file_path}\n")
    except Exception as err:
        app_logger.error(f"Error updating processed files record: {err}")

def is_file_processed(file_md5, record_path=None):
    """Return True if ``file_md5`` appears in the processed-files record.

    Each record line is ``md5,module,path``; only the leading MD5 field
    is compared.

    Args:
        file_md5: MD5 digest to look up.
        record_path: Optional override of the record file location;
            defaults to the module-level ``processed_files_record``.

    Returns:
        True when a line's leading MD5 field equals ``file_md5``; False
        when there is no match or the record file does not exist.
    """
    path = processed_files_record if record_path is None else record_path
    if not os.path.exists(path):
        return False
    with open(path, 'r') as record:
        for line in record:
            # partition() never raises, so blank or comma-less lines are
            # tolerated instead of crashing with ValueError as the old
            # split-and-unpack did.
            md5 = line.strip().partition(',')[0]
            if md5 == file_md5:
                return True
    return False

def get_chroma_index(file_path, current_page="nav_playbooks", is_persistent=True):
    """Build and persist a Chroma vector index for ``file_path``.

    Picks a loader by file extension, loads and chunk-splits the
    document, embeds the chunks with the configured HuggingFace model,
    and writes the resulting Chroma DB to disk. Files whose MD5 already
    appears in the processed-files record are skipped.

    Args:
        file_path: Path to the source document.
        current_page: Logical page/module name; used for the persistent
            DB directory name and recorded next to the file hash.
        is_persistent: When True the index lives under ``DB_DIR`` with a
            page-based name; otherwise under ``TEMP_DIR`` with a name
            derived from the file itself.

    Returns:
        True when an index was created and saved; False otherwise
        (already processed, no suitable loader, empty document, or any
        error — which is logged).
    """
    app_logger.info(f"Starting get_chroma_index for {file_path}")
    file_md5 = file_utils.compute_md5(file_path)
    if is_file_processed(file_md5):
        app_logger.info(f"File {file_path} has already been processed. Skipping.")
        return False

    _, file_extension = os.path.splitext(file_path)
    loader_class = app_constants.DOCUMENT_MAP.get(file_extension.lower(), None)

    if not loader_class:
        app_logger.error(f"No suitable loader found for file type {file_extension}")
        # BUG FIX: this path previously returned the tuple (None, False)
        # while every other exit returns a bare bool; a caller
        # truth-testing the result saw a truthy tuple on this failure.
        return False

    embedding_model = app_constants.EMBEDDING_MODEL_NAME
    chunk_size = app_constants.CHUNK_SIZE
    chunk_overlap = app_constants.CHUNK_OVERLAP

    # Persistent indexes are shared per page; temporary ones are per file.
    storage_dir = DB_DIR if is_persistent else TEMP_DIR
    base_filename = f"{current_page}_chroma_db" if is_persistent else f"{os.path.splitext(os.path.basename(file_path))[0]}_chroma_db"
    sanitized_base_filename = file_utils.sanitize_filename(base_filename)
    chroma_persist_directory = os.path.join(storage_dir, sanitized_base_filename)

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    try:
        if file_extension.lower() == '.jsonl':
            documents = load_documents_from_jsonl(file_path, loader_class)
        else:
            loader = loader_class(file_path)
            documents = loader.load()

        if not documents:
            app_logger.error(f"No documents loaded from {file_path}.")
            return False

        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        docs = text_splitter.split_documents(documents)

        if not docs:
            app_logger.error(f"No documents to process after splitting from {file_path}.")
            return False

        db = Chroma.from_documents(docs, embeddings, persist_directory=chroma_persist_directory, client_settings=app_constants.CHROMA_SETTINGS)
        # Record the file only after the index was actually built.
        update_processed_files_record(file_md5, current_page, file_path)
        app_logger.info("Created index and saved to disk")
        db.persist()
    except Exception as e:
        app_logger.error(f"Error in get_chroma_index for {file_path}: {e}")
        return False
    app_logger.info("Completed get_chroma_index operation")
    return True