# RAG_AI / retrival.py
# (HuggingFace Space page residue, preserved as comments so the module parses)
# WebashalarForML's picture
# Update retrival.py
# 894171e verified
from langchain_community.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings,HuggingFaceEmbeddings # for embedding task
from langchain.text_splitter import RecursiveCharacterTextSplitter # for converting the large documents into smaller chunks
from langchain.schema import Document
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import openai
import openai
import os
import shutil
import uuid
import asyncio # async
# Configurations
UPLOAD_FOLDER = "./uploads"        # destination for user-uploaded source files
VECTOR_DB_FOLDER = "./VectorDB"    # root directory holding per-name Chroma stores ("chroma_<name>")
# Ensure both working directories exist at import time (idempotent).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
def load_document(data_path):
    """Load every file in *data_path* (glob ``*.*``) as LangChain documents.

    Returns the list of loaded ``Document`` objects.
    """
    directory_loader = DirectoryLoader(data_path, glob="*.*")
    print("loader", directory_loader)
    return directory_loader.load()
# Creating the chunks of Data from the knowledge
def split_text(documents: list[Document]):
    """Split *documents* into overlapping chunks for embedding.

    Uses a recursive character splitter with 1000-char chunks, 500-char
    overlap, and start-index metadata on each chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
# # Chroma for creating the vector DB which we will use for searching relevant data.
# def save_to_chroma(chunks: list[Document],name: str):
# print
# CHROMA_PATH = f"./VectorDB/chroma_{name}"
# # Clear out the database first.
# if os.path.exists(CHROMA_PATH):
# shutil.rmtree(CHROMA_PATH)
# # Initialize SBERT embedding function
# embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
# # Add documents and persist the database
# db.add_documents(chunks)
# db.persist()
# # Return the database instance or a success status
# return db
def save_to_chroma(chunks: list[Document], name: str):
    """Embed *chunks* with SBERT and persist them under ./VectorDB/chroma_<name>.

    Any pre-existing store at that path is deleted first. Best-effort:
    returns the Chroma instance on success, or None (after printing the
    error) if embedding/persisting fails.
    """
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    # Wipe any stale database so we always build from scratch.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    try:
        # SBERT sentence-embedding model used for all chunks.
        embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        store = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedder)
        print("Adding documents to the database...")
        store.add_documents(chunks)
        print("Persisting the database...")
        store.persist()
        print("Database successfully saved.")
        return store
    except Exception as e:
        # Deliberately broad: callers treat None as "save failed".
        print("Error while saving to Chroma:", e)
        return None
def get_unique_sources(chroma_path):
    """Return the distinct 'source' metadata values stored in a Chroma DB.

    Opens the persisted store at *chroma_path* (no embedding function is
    needed just to read metadata) and collects each unique 'source' entry.
    """
    store = Chroma(persist_directory=chroma_path)
    seen = set()
    for meta in store.get()['metadatas']:
        if 'source' in meta:
            seen.add(meta['source'])
    return list(seen)
def generate_data_store(file_path, db_name):
    """Build a Chroma vector store named *db_name* from the files in *file_path*.

    Pipeline: load documents -> split into chunks -> embed & persist.
    Each stage prints progress; on any failure the error is printed and
    None is returned. On success the Chroma instance is returned.
    """
    print(f"filepath===>{file_path} db_name =====>{db_name}")
    try:
        documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return None
    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return None
    try:
        # BUG FIX: save_to_chroma is a plain synchronous function, so the
        # previous asyncio.run(save_to_chroma(...)) raised
        # "ValueError: a coroutine was expected" — call it directly.
        db = save_to_chroma(chunks, db_name)
        print(f"Data saved to Chroma for database {db_name}.")
        return db
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return None
# def main():
# data_path = "H:\\DEV PATEL\\RAG Project\\data1"
# db_name = "Product_data"
# generate_data_store(data_path,db_name)
# if __name__ == "__main__":
# main()