"""Build and query Chroma vector stores for a RAG pipeline.

Pipeline: load files from a directory -> split into overlapping chunks ->
embed with SBERT (all-MiniLM-L6-v2) -> persist to ./VectorDB/chroma_<name>.
"""

import asyncio  # retained: may be used by other parts of the project
import os
import shutil
import uuid  # retained: may be used by other parts of the project

import openai
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import (  # for the embedding task
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configuration: where uploaded source files and persisted vector DBs live.
UPLOAD_FOLDER = "./uploads"
VECTOR_DB_FOLDER = "./VectorDB"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)


def load_document(data_path):
    """Load every file under *data_path* (glob ``*.*``) as LangChain documents.

    :param data_path: directory containing the knowledge-base files.
    :return: list of loaded ``Document`` objects.
    """
    loader = DirectoryLoader(data_path, glob="*.*")
    print("loader", loader)
    document = loader.load()
    return document


def split_text(documents: list[Document]):
    """Split documents into overlapping chunks for embedding.

    Chunks are 1000 characters with 500 characters of overlap so context
    is preserved across chunk boundaries; ``add_start_index`` records each
    chunk's offset in its source document.

    :param documents: documents returned by :func:`load_document`.
    :return: list of chunked ``Document`` objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks


def save_to_chroma(chunks: list[Document], name: str):
    """Embed *chunks* with SBERT and persist them to ``./VectorDB/chroma_<name>``.

    Any existing database at that path is deleted first so the store
    reflects only this ingestion run.

    :param chunks: chunked documents from :func:`split_text`.
    :param name: logical database name (used in the on-disk path).
    :return: the ``Chroma`` instance on success, ``None`` on failure.
    """
    CHROMA_PATH = f"./VectorDB/chroma_{name}"

    # Clear out the previous database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    try:
        # Initialize SBERT embedding function.
        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

        # Add documents and persist the database.
        print("Adding documents to the database...")
        db.add_documents(chunks)
        print("Persisting the database...")
        db.persist()
        print("Database successfully saved.")
        return db
    except Exception as e:
        # Best-effort: report the failure and signal it with None rather
        # than crashing the caller's ingestion loop.
        print("Error while saving to Chroma:", e)
        return None


def get_unique_sources(chroma_path):
    """Return the distinct ``source`` metadata values stored in a Chroma DB.

    :param chroma_path: persist directory of an existing Chroma database.
    :return: list of unique source identifiers (order unspecified).
    """
    # No embedding function is needed: we only read stored metadata.
    db = Chroma(persist_directory=chroma_path)
    metadata_list = db.get()['metadatas']
    unique_sources = {metadata['source'] for metadata in metadata_list if 'source' in metadata}
    return list(unique_sources)


def generate_data_store(file_path, db_name):
    """Run the full ingestion pipeline: load -> split -> embed/persist.

    Each stage is wrapped separately so a failure is reported with its
    stage and aborts the pipeline early.

    :param file_path: directory of source files to ingest.
    :param db_name: logical name for the resulting Chroma database.
    """
    print(f"filepath===>{file_path} db_name =====>{db_name}")
    try:
        documents = load_document(file_path)
        print("Documents loaded successfully.")
    except Exception as e:
        print(f"Error loading documents: {e}")
        return

    try:
        chunks = split_text(documents)
        print(f"Text split into {len(chunks)} chunks.")
    except Exception as e:
        print(f"Error splitting text: {e}")
        return

    try:
        # BUG FIX: save_to_chroma is a regular function, not a coroutine.
        # The original asyncio.run(save_to_chroma(...)) raised
        # "ValueError: a coroutine was expected", so nothing was ever
        # persisted through this path. Call it directly instead.
        save_to_chroma(chunks, db_name)
        print(f"Data saved to Chroma for database {db_name}.")
    except Exception as e:
        print(f"Error saving to Chroma: {e}")
        return


# Example usage (kept for reference; enable to run as a script):
# def main():
#     data_path = "H:\\DEV PATEL\\RAG Project\\data1"
#     db_name = "Product_data"
#     generate_data_store(data_path, db_name)
#
# if __name__ == "__main__":
#     main()