import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader, DirectoryLoader, UnstructuredPDFLoader, UnstructuredWordDocumentLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import json
from google.oauth2 import service_account
load_dotenv()
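
# A .env of roughly this shape is assumed (values are illustrative):
#   GEMINI_API_KEY=your-gemini-key
#   GOOGLE_APPLICATION_CREDENTIALS={"type": "service_account", ...}
#   UPDATE_DB=false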


GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY is None:
    # os.getenv and os.environ.get perform the same lookup, so retrying the
    # read was a no-op; fail fast instead.
    raise ValueError("GEMINI_API_KEY is not set; add it to the environment or a .env file.")

# GOOGLE_APPLICATION_CREDENTIALS is expected to contain the service-account
# JSON itself (not a file path here).
conf = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
if conf is None:
    raise ValueError("GOOGLE_APPLICATION_CREDENTIALS is not set.")

service_account_info = json.loads(conf)
if isinstance(service_account_info, str):
    # Handle double-encoded JSON without resorting to eval(), which is
    # unsafe on untrusted input.
    service_account_info = json.loads(service_account_info)

credentials = service_account.Credentials.from_service_account_info(service_account_info)

DOCUMENT_DIR = 'document/'
COLLECTION_NAME = "health_documents"

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=GEMINI_API_KEY,  # the keyword is google_api_key, not GEMINI_API_KEY
    temperature=0.7,
    credentials=credentials,
)


print("Models initialized successfully.")

# Only the imports not already present above are needed here; the duplicated
# os/dotenv/loader/splitter imports and the unused HuggingFacePipeline,
# RetrievalQA, filter_complex_metadata, and torch imports are dropped.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from constants import CHROMA_PATH
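
# constants.py is assumed to define the ChromaDB persistence path; a minimal
# sketch (the exact path is illustrative):
#   CHROMA_PATH = "chroma_db/"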

# Initialize Hugging Face Embeddings
# You can change the model to any suitable embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/gtr-t5-large",
    model_kwargs={'device': 'cpu'}
)
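
# Optional sanity check (illustrative): embed a short string and inspect the
# vector size before indexing a large corpus.
#   dim = len(embeddings.embed_query("hello"))  # gtr-t5-large -> 768 dims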


def load_documents(directory):
    """Load documents from multiple file types."""
    documents = []
    
    # Load text files
    text_loader = DirectoryLoader(
        directory,
        glob="**/*.txt",
        loader_cls=TextLoader
    )
    documents.extend(text_loader.load())

    # Load Word documents
    docx_loader = DirectoryLoader(
        directory,
        glob="**/*.docx",
        loader_cls=UnstructuredWordDocumentLoader,
        loader_kwargs={"mode": "elements"}
    )
    documents.extend(docx_loader.load())

    # Load PDF files
    pdf_loader = DirectoryLoader(
        directory,
        glob="**/*.pdf",
        loader_cls=UnstructuredPDFLoader
    )
    documents.extend(pdf_loader.load())
    
    print(f"Loaded {len(documents)} documents.")
    return documents
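
# Note: the Unstructured-based loaders above require the `unstructured`
# package (with its docx/pdf extras) to be installed at runtime.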

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    return chunks
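
# With the defaults above, consecutive chunks overlap by roughly 200
# characters, so sentences near a chunk boundary survive intact in at
# least one chunk.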

def create_and_store_embeddings(chunks):
    """Create or load ChromaDB vector store."""
    # Ensure the Chroma path exists
    os.makedirs(CHROMA_PATH, exist_ok=True)
    print(f"unfiltered chunks: {len(chunks)}")
    filtered_chunks = []
    for chunk in chunks:
        # Create a new document with filtered metadata
        filtered_metadata = {k: v for k, v in chunk.metadata.items() 
                             if isinstance(v, (str, int, float, bool))}
        chunk.metadata = filtered_metadata
        filtered_chunks.append(chunk)

    print(f"Filtered metadata for {len(filtered_chunks)} chunks.")
    # Create or load the vector store
    vector_store = Chroma.from_documents(
        documents=filtered_chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH
    )
    
    print("Created ChromaDB vector store.")
    return vector_store
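
# Illustrative one-off index build (assumes DOCUMENT_DIR is populated):
#   docs = load_documents(DOCUMENT_DIR)
#   chunks = split_documents(docs)
#   store = create_and_store_embeddings(chunks)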

def load_vectordb(path: str = CHROMA_PATH):
    """Load an existing ChromaDB vector store from disk."""
    if os.path.exists(path):
        vector_store = Chroma(persist_directory=path, embedding_function=embeddings)
        print("Loaded ChromaDB vector store.")
        return vector_store
    else:
        raise ValueError(f"ChromaDB path {path} does not exist.")

from langchain.chains import LLMChain
from langchain.chains.base import Chain

def create_health_agent(vector_store):
    """Create a custom retrieval QA chain for health-related queries."""
    prompt_template = """You are a helpful health assistant. Who will talk to the user as human and resolve their queries.

    Use Previous_Conversation to maintain consistency in the conversation.
    These are Previous_Conversation between you and user.
    Previous_Conversation: \n{previous_conversation}

    These are info about the person.
    User_Data: \n{user_data}
    Thoroughly analyze the Context, and also use context to answer the questions, aside of your knowledge.

    Points to Adhere:

    1. Only tell the schemes if user specifically asked, otherwise don't share schemes information.
    2. If the user asks about schemes, Ask about what state they belong to first.
    2. You can act as a mental-health counselor if needed. 
    3. Give precautions and natural-remedies for the diseases, if user asked or it's needed, only for Common diseases include the common cold, flu etc.
    4. Also Use Information from Context to answer the questions.
    6. Ask the preferred language of the user, In the starting of the conversation.
    
    Context: {context}\n
    Question: {question}
    Answer:"""
    
    PROMPT = PromptTemplate(
        template=prompt_template, 
        input_variables=["context", "question", "previous_conversation", "user_data"]
    )

    if llm is None:
        raise ValueError("No language model initialized. Please check the model initialization.")

    # Create a retriever
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})

    class CustomRetrievalQA(Chain):
        retriever: object
        llm_chain: LLMChain

        @property
        def input_keys(self):
            return ['query', 'previous_conversation', 'user_data']

        @property
        def output_keys(self):
            return ['result']

        def _call(self, inputs):
            query = inputs['query']
            previous_conversation = inputs.get('previous_conversation', '')
            user_data = inputs.get('user_data', '')
            
            # Retrieve relevant documents
            # Use the chain's own retriever field rather than the enclosing closure.
            docs = self.retriever.get_relevant_documents(query)
            context = "\n".join([doc.page_content for doc in docs])

            # Prepare inputs for the LLM chain
            llm_inputs = {
                'context': context,
                'question': query,
                'previous_conversation': previous_conversation,
                'user_data': user_data
            }

            # Generate response
            result = self.llm_chain.invoke(llm_inputs)
            return {'result': result['text']}

    # Create the LLM chain
    llm_chain = LLMChain(llm=llm, prompt=PROMPT)

    # Create and return the custom chain
    # user_data is supplied per call through _call's inputs; it is not a
    # field on the chain, so passing user_data=None here would be rejected.
    return CustomRetrievalQA(retriever=retriever, llm_chain=llm_chain)


def agent_with_db():
    """Load or rebuild the vector store, then build the health agent."""
    # Rebuild the index when UPDATE_DB=true, or when no persisted store
    # exists yet. The original .lower() call crashed when UPDATE_DB was unset.
    update_db = os.getenv("UPDATE_DB", "false").lower() == "true"

    vector_store = None
    if os.path.exists(CHROMA_PATH) and not update_db:
        vector_store = load_vectordb(CHROMA_PATH)

    if vector_store is None:
        print("Loading documents...")
        
        documents = load_documents(DOCUMENT_DIR)

        print("Splitting documents into chunks...")
        chunks = split_documents(documents)
        print(f"Split into {len(chunks)} chunks.")

        print("Creating and storing embeddings in ChromaDB...")
        try:
            vector_store = create_and_store_embeddings(chunks)
            print("Embeddings stored successfully in ChromaDB.")
        except Exception as e:
            print(f"An error occurred while creating or storing embeddings: {e}")
            return

    print("Creating the health agent...")
    health_agent = create_health_agent(vector_store)

    return health_agent
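

if __name__ == "__main__":
    # Illustrative usage sketch: the question is an example, and the input
    # keys mirror CustomRetrievalQA.input_keys defined above.
    agent = agent_with_db()
    if agent is not None:
        response = agent.invoke({
            "query": "What are some home remedies for the common cold?",
            "previous_conversation": "",
            "user_data": "",
        })
        print(response["result"])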