from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings  # for the embedding step
from langchain.text_splitter import RecursiveCharacterTextSplitter  # for splitting large documents into smaller chunks
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
import os
import shutil
# Configuration
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
def load_document(data_path):
    # Load every file in the directory, regardless of extension.
    # DirectoryLoader uses UnstructuredFileLoader by default, so the
    # `unstructured` package must be installed.
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()
    return documents
# Split the knowledge-base documents into overlapping chunks
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
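# Quick sanity check of what split_text produces -- a minimal sketch assuming
# a populated "./uploads" folder. "source" comes from DirectoryLoader and
# "start_index" from add_start_index=True above.
# docs = load_document("./uploads")
# chunks = split_text(docs)
# print(chunks[0].metadata)           # e.g. {'source': './uploads/file.txt', 'start_index': 0}
# print(chunks[0].page_content[:200]) # first 200 characters of the first chunk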
# Chroma for creating the vector DB which we will use for searching relevant data.
def save_to_chroma(chunks: list[Document], name: str):
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    # Clear out any existing database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    try:
        # Initialize the SBERT embedding function
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        # Add the chunks and persist the database to disk
        print("Adding documents to the database...")
        db.add_documents(chunks)
        print("Persisting the database...")
        db.persist()
        print("Database successfully saved.")
        return db
    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
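# Minimal retrieval sketch for a saved store: reload it with the same
# embedding model and run a similarity search. The store path, query string,
# and k value are illustrative, not part of this module.
# db = Chroma(
#     persist_directory="./VectorDB/chroma_Product_data",
#     embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
# )
# results = db.similarity_search("What does the product cost?", k=3)
# for doc in results:
#     print(doc.metadata.get("source"), doc.page_content[:100])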
def get_unique_sources(chroma_path):
    # Load the Chroma database (no embedding function is needed for a
    # metadata-only read)
    db = Chroma(persist_directory=chroma_path)
    # Retrieve all metadata from the database
    metadata_list = db.get()['metadatas']
    # Extract the unique source paths from the metadata
    unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
    return list(unique_sources)
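# Example call, assuming a "Product_data" store has already been built:
# print(get_unique_sources("./VectorDB/chroma_Product_data"))
# The output is the list of source file paths, which depends on your data.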
def generate_data_store(file_path, db_name):
    print(f"file_path ===> {file_path}  db_name ===> {db_name}")
    try:
        documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return
    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return
    try:
        # save_to_chroma is synchronous, so call it directly; wrapping it in
        # asyncio.run() would raise ValueError ("a coroutine was expected")
        save_to_chroma(chunks, db_name)
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
# def main():
#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
#     db_name = "Product_data"
#     generate_data_store(data_path, db_name)
# if __name__ == "__main__":
#     main()