from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings  # for embedding the text chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter  # for splitting large documents into smaller chunks
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
import os
import shutil


# Configurations
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)


def load_document(data_path):

    # Load every file in the directory (DirectoryLoader parses them via Unstructured)
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()
    print(f"Loaded {len(documents)} documents from {data_path}")
    return documents

# Split the loaded documents into smaller, overlapping chunks
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,  # store each chunk's start offset in its metadata
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    return chunks

# Chroma builds the vector DB that we will later use to search for relevant data.
def save_to_chroma(chunks: list[Document], name: str):
    CHROMA_PATH = f"./VectorDB/chroma_{name}"

    # Clear out any existing database first so the store is rebuilt from scratch
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    try:
        # Initialize the SBERT embedding function
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

        # Add documents and persist the database to disk
        print("Adding documents to the database...")
        db.add_documents(chunks)
        print("Persisting the database...")
        db.persist()
        print("Database successfully saved.")

        return db
    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
 
def get_unique_sources(chroma_path):
    # Load the Chroma database
    db = Chroma(persist_directory=chroma_path)
    
    # Retrieve all metadata from the database
    metadata_list = db.get()['metadatas']
    
    # Extract unique sources from metadata
    unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
    return list(unique_sources)
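
# Example usage (the store name below is hypothetical): list which source files
# have already been indexed into an existing database.
# print(get_unique_sources("./VectorDB/chroma_Product_data"))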
  
def generate_data_store(file_path, db_name):
    print(f"file_path ===> {file_path}  db_name ===> {db_name}")
    try:
        documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        db = save_to_chroma(chunks, db_name)
        if db is not None:
            print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return

# def main():
#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
#     db_name = "Product_data"
#     generate_data_store(data_path,db_name)

# if __name__ == "__main__":
#     main()