from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings  # for the embedding step
from langchain.text_splitter import RecursiveCharacterTextSplitter  # for splitting large documents into smaller chunks
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
import os
import shutil
# Configuration
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
def load_document(data_path):
    # Load every file in the directory, regardless of extension.
    # DirectoryLoader uses UnstructuredFileLoader by default, so the
    # `unstructured` package must be installed.
    loader = DirectoryLoader(data_path, glob="*.*")
    documents = loader.load()
    return documents
# Split the knowledge-base documents into overlapping chunks
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
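# Quick sanity check of what split_text produces -- a minimal sketch assuming
# a populated "./uploads" folder. "source" comes from DirectoryLoader and
# "start_index" from add_start_index=True above.
# docs = load_document("./uploads")
# chunks = split_text(docs)
# print(chunks[0].metadata)           # e.g. {'source': './uploads/file.txt', 'start_index': 0}
# print(chunks[0].page_content[:200]) # first 200 characters of the first chunk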
# Chroma for creating the vector DB which we will use for searching relevant data.
def save_to_chroma(chunks: list[Document], name: str):
    CHROMA_PATH = f"./VectorDB/chroma_{name}"
    # Clear out any existing database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    try:
        # Initialize the SBERT embedding function
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
        # Add the chunks and persist the database to disk
        print("Adding documents to the database...")
        db.add_documents(chunks)
        print("Persisting the database...")
        db.persist()
        print("Database successfully saved.")
        return db
    except Exception as e:
        print("Error while saving to Chroma:", e)
        return None
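# Minimal retrieval sketch for a saved store: reload it with the same
# embedding model and run a similarity search. The store path, query string,
# and k value are illustrative, not part of this module.
# db = Chroma(
#     persist_directory="./VectorDB/chroma_Product_data",
#     embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
# )
# results = db.similarity_search("What does the product cost?", k=3)
# for doc in results:
#     print(doc.metadata.get("source"), doc.page_content[:100])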
def get_unique_sources(chroma_path):
    # Load the Chroma database (no embedding function is needed for a
    # metadata-only read)
    db = Chroma(persist_directory=chroma_path)
    # Retrieve all metadata from the database
    metadata_list = db.get()['metadatas']
    # Extract the unique source paths from the metadata
    unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
    return list(unique_sources)
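# Example call, assuming a "Product_data" store has already been built:
# print(get_unique_sources("./VectorDB/chroma_Product_data"))
# The output is the list of source file paths, which depends on your data.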
def generate_data_store(file_path, db_name):
    print(f"file_path ===> {file_path}  db_name ===> {db_name}")
    try:
        documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return
    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return
    try:
        # save_to_chroma is synchronous, so call it directly; wrapping it in
        # asyncio.run() would raise ValueError ("a coroutine was expected")
        save_to_chroma(chunks, db_name)
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return
# def main():
#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
#     db_name = "Product_data"
#     generate_data_store(data_path, db_name)
# if __name__ == "__main__":
#     main()