RAG_AI_V2

Build error

App Files Files Community

WebashalarForML committed on Jan 30

Commit

5c6971a

verified ·

1 Parent(s): 1203483

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -116

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from flask import Flask, render_template, request, redirect, url_for, session
 import os
 from werkzeug.utils import secure_filename
 #from retrival import generate_data_store
@@ -13,6 +13,7 @@ from langchain.schema import Document
 from langchain_core.documents import Document
 from dotenv import load_dotenv
 import re
 import glob
 import shutil
 from werkzeug.utils import secure_filename
@@ -32,23 +33,25 @@ app.secret_key = os.urandom(24)
 # Configurations
 UPLOAD_FOLDER = "uploads/"
 VECTOR_DB_FOLDER = "VectorDB/"
-#TABLE_DB_FOLDER = "TableDB/"
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
-#os.makedirs(TABLE_DB_FOLDER, exist_ok=True)
 # Global variables
 CHROMA_PATH = None
-TEMP_PATH = None
-#TABLE_PATH = None
-#System prompt
-'''PROMPT_TEMPLATE = """
-You are working with a retrieval-augmented generation (RAG) setup. Your task is to generate a response based on the context provided and the question asked. Consider only the following context strictly, and use it to answer the question. If the question cannot be answered using the context, respond with: "The information requested is not mentioned in the context."
 Context:
 {context}
@@ -59,28 +62,38 @@ Question:
 {question}
 Response:
-"""
-'''
-PROMPT_TEMPLATE = """
-You are working as a retrieval-augmented generation (RAG) assistant specializing in providing precise and accurate responses. Generate a response based only on the provided context and question, following these concrete instructions:
-- **Adhere strictly to the context:** Use only the information in the context to answer the question. Do not add any external details or assumptions.
-- **Handle multiple chunks:** The context is divided into chunks, separated by "###". Query-related information may be present in any chunk.
-- **Focus on relevance:** Identify and prioritize chunks relevant to the question while ignoring unrelated chunks.
-- **Answer concisely and factually:** Provide clear, direct, and structured responses based on the retrieved information.
 Context:
 {context}
 ---
 Question:
 {question}
 Response:
 """
 #HFT = os.getenv('HF_TOKEN')
 #client = InferenceClient(api_key=HFT)
@@ -96,59 +109,88 @@ def chat():
     print("sessionhist1",session['history'])
     global CHROMA_PATH
-    #global TABLE_PATH
-    #old_db = session.get('old_db', None)
-    #print(f"Selected DB: {CHROMA_PATH}")
-    #if TEMP_PATH is not None and TEMP_PATH != CHROMA_PATH:
-    #    session['history'] = []
-    #TEMP_PATH = CHROMA_PATH
     if request.method == 'POST':
         query_text = request.form['query_text']
         if CHROMA_PATH is None:
-            return render_template('chat.html', error="No vector database selected!", history=[])
         # Load the selected Document Database
         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
-        results_document = db.similarity_search_with_relevance_scores(query_text, k=3)
-        print("results------------------->",results_document)
-        context_text_document = "\n\n---\n\n".join([doc.page_content for doc, _score in results_document])
-        # # Load the selected Table Database
-        # #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-        # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
-        # tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
-        # results_table = tdb.similarity_search_with_relevance_scores(query_text, k=2)
-        # print("results------------------->",results_table)
-        # context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
-        # Prepare the prompt and query the model
-        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
-        prompt = prompt_template.format(context=context_text_document,question=query_text)
-        #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
-        print("results------------------->",prompt)
         #Model Defining and its use
         repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
         HFT = os.environ["HF_TOKEN"]
         llm = HuggingFaceEndpoint(
             repo_id=repo_id,
-            max_tokens=3000,
             temperature=0.8,
             huggingfacehub_api_token=HFT,
         )
         data= llm(prompt)
         #data = response.choices[0].message.content
-        print("LLM response------------------>",data)
         # Filtering the unnecessary context.
         if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
             data = "We do not have information related to your query on our end."
@@ -160,90 +202,65 @@ def chat():
         session.modified = True
         print("sessionhist2",session['history'])
-        return render_template('chat.html', query_text=query_text, answer=data, history=session['history'])
-    return render_template('chat.html', history=session['history'])
-'''
 @app.route('/create-db', methods=['GET', 'POST'])
 def create_db():
     if request.method == 'POST':
-        db_name = request.form['db_name']
-        # Get all files from the uploaded folder
-        files = request.files.getlist('folder')
-        if not files:
             return "No files uploaded", 400
-        # if not exist
-        os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-        # Define the base upload path
         upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
-        #upload_base_path = upload_base_path.replace("\\", "/")
         print(f"Base Upload Path: {upload_base_path}")
         os.makedirs(upload_base_path, exist_ok=True)
-        # Save each file and recreate folder structure
-        for file in files:
-            print("file , files",files,file)
-            #relative_path = file.filename  # This should contain the subfolder structure
-            file_path = os.path.join(upload_base_path)
-            #file_path = file_path.replace("\\", "/")
-            # Ensure the directory exists before saving the file
-            print(f"Saving to: {file_path}")
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            # Get the file path and save it
-            file_path = os.path.join(upload_base_path, secure_filename(file.filename))
-            file.save(file_path)
-        # Generate datastore
-        generate_data_store(upload_base_path, db_name)
-        # # Clean up uploaded files (if needed)
-        #if os.path.exists(app.config['UPLOAD_FOLDER']):
-        #    shutil.rmtree(app.config['UPLOAD_FOLDER'])
-        return redirect(url_for('list_dbs'))
-    return render_template('create_db.html')
-'''
-@app.route('/create-db', methods=['GET', 'POST'])
-def create_db():
-    if request.method == 'POST':
-        db_name = request.form['db_name']
-        # Ensure the upload folder exists
-        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
-        # Define the base upload path
-        upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
-        os.makedirs(upload_base_path, exist_ok=True)
-        # Check for uploaded folder or files
-        folder_files = request.files.getlist('folder')
-        single_files = request.files.getlist('file')
-        if folder_files and any(file.filename for file in folder_files):
-            # Process folder files
-            for file in folder_files:
-                file_path = os.path.join(upload_base_path, secure_filename(file.filename))
                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
-                file.save(file_path)
-        elif single_files and any(file.filename for file in single_files):
-            # Process single files
-            for file in single_files:
-                file_path = os.path.join(upload_base_path, secure_filename(file.filename))
                 file.save(file_path)
-        else:
-            return "No files uploaded", 400
-        # Generate datastore
-        generate_data_store(upload_base_path, db_name)
         return redirect(url_for('list_dbs'))
@@ -256,21 +273,22 @@ def list_dbs():
 @app.route('/select-db/<db_name>', methods=['POST'])
 def select_db(db_name):
     # Selecting the Document Vector DB
     global CHROMA_PATH
     print(f"Selected DB: {CHROMA_PATH}")
     CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
     CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
     print(f"Selected DB: {CHROMA_PATH}")
-    #Selecting the Table Vector DB
-    # global TABLE_PATH
-    # print(f"Selected DB: {TABLE_PATH}")
-    # TABLE_PATH = os.path.join(TABLE_DB_FOLDER, db_name)
-    # TABLE_PATH = TABLE_PATH.replace("\\", "/")
-    # print(f"Selected DB: {TABLE_PATH}")
     return redirect(url_for('chat'))

+from flask import Flask, render_template, request, redirect, url_for, session, flash
 import os
 from werkzeug.utils import secure_filename
 #from retrival import generate_data_store
 from langchain_core.documents import Document
 from dotenv import load_dotenv
 import re
+import numpy as np
 import glob
 import shutil
 from werkzeug.utils import secure_filename
 # Configurations
 UPLOAD_FOLDER = "uploads/"
 VECTOR_DB_FOLDER = "VectorDB/"
+TABLE_DB_FOLDER = "TableDB/"
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
+os.makedirs(TABLE_DB_FOLDER, exist_ok=True)
 # Global variables
 CHROMA_PATH = None
+TABLE_PATH = None
+PROMPT_TEMPLATE_DOC = """
+<s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
+- Use only the provided context; do not add external information.
+- The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
+- If available, use the provided source information to support the response.
+- Answer concisely and factually.
 Context:
 {context}
 {question}
 Response:
+[/INST]
+"""
+# Prompt used when the document contains tables
+PROMPT_TEMPLATE_TAB = """
+<s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
+- Use only the provided context; do not add external information.
+- The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
+- If available, use the provided source information to support the response.
+- If a table is provided as html, incorporate its relevant details into the response while maintaining a structured format.
+- Answer concisely and factually.
 Context:
 {context}
 ---
+Table:
+{table}
+---
 Question:
 {question}
 Response:
+[/INST]
 """
 #HFT = os.getenv('HF_TOKEN')
 #client = InferenceClient(api_key=HFT)
     print("sessionhist1",session['history'])
     global CHROMA_PATH
+    global TABLE_PATH
+    old_db = session.get('old_db', None)
+    print(f"Selected DB: {CHROMA_PATH}")
+    # if old_db != None:
+    #     if CHROMA_PATH != old_db:
+    #         session['history'] = []
+    #print("sessionhist1",session['history'])
     if request.method == 'POST':
         query_text = request.form['query_text']
         if CHROMA_PATH is None:
+            flash("Please select a database first!", "error")
+            return redirect(url_for('list_dbs'))
         # Load the selected Document Database
         embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
         #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
         db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
+        # Convert the query to its embedding vector
+        query_embedding = embedding_function.embed_query(query_text)
+        if isinstance(query_embedding, float):
+            query_embedding = [query_embedding]
+        # print(f"Query embedding: {query_embedding}")
+        # print(f"Type of query embedding: {type(query_embedding)}")
+        # print(f"Length of query embedding: {len(query_embedding) if isinstance(query_embedding, (list, np.ndarray)) else 'Not applicable'}")
+        results_document = db.similarity_search_by_vector_with_relevance_scores(
+            embedding=query_embedding,  # Pass the query embedding
+            k=3,
+            #filter=filter_condition         # Pass the filter condition
+        )
+        print("results------------------->",results_document)
+        print("============================================")
+        print("============================================")
+        context_text_document = "   \n\n###\n\n   ".join(
+            [f"Source: {doc.metadata.get('source', '')} Page_content:{doc.page_content}\n" for doc, _score in results_document]
+        )
+        # Loading Table Database only if available
+        if TABLE_PATH is not None:
+            #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+            embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+            tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
+            results_table = tdb.similarity_search_by_vector_with_relevance_scores(
+                embedding=query_embedding,  # Pass the query embedding
+                k=2
+                #filter=filter_condition         # Pass the filter condition
+            )
+            print("results------------------->",results_table)
+            context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
+            # Prepare the prompt and query the model
+            prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_TAB)
+            prompt = prompt_template.format(context=context_text_document,table=context_text_table,question=query_text)
+            #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
+            print("results------------------->",prompt)
+        else:
+            # Prepare the prompt and query the model
+            prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_DOC)
+            prompt = prompt_template.format(context=context_text_document,question=query_text)
+            #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
+            print("results------------------->",prompt)
         #Model Defining and its use
         repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
         HFT = os.environ["HF_TOKEN"]
         llm = HuggingFaceEndpoint(
             repo_id=repo_id,
+            #max_tokens=3000,
+            max_new_tokens=2000,
             temperature=0.8,
             huggingfacehub_api_token=HFT,
         )
         data= llm(prompt)
         #data = response.choices[0].message.content
         # Filtering the unnecessary context.
         if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
             data = "We do not have information related to your query on our end."
         session.modified = True
         print("sessionhist2",session['history'])
+        return render_template('chat.html', query_text=query_text, answer=data, history=session['history'],old_db=CHROMA_PATH)
+    return render_template('chat.html', history=session['history'], old_db=CHROMA_PATH)
 @app.route('/create-db', methods=['GET', 'POST'])
 def create_db():
     if request.method == 'POST':
+        db_name = request.form.get('db_name', '').strip()
+        if not db_name:
+            return "Database name is required", 400
+        # Get uploaded files
+        files = request.files.getlist('folder')  # Folder uploads (multiple files)
+        single_files = request.files.getlist('file')  # Single file uploads
+        # Check if any file is uploaded
+        if not files and not single_files:
             return "No files uploaded", 400
+        # Create upload directory
         upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
         print(f"Base Upload Path: {upload_base_path}")
         os.makedirs(upload_base_path, exist_ok=True)
+        # Process folder files (if any)
+        if files:
+            for file in files:
+                file_name = secure_filename(file.filename)  # Ensure the file name is safe
+                file_path = os.path.join(upload_base_path, file_name)
+                # Ensure the directory exists before saving the file
+                print(f"Saving to: {file_path}")
+                os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                # Save the file
+                file.save(file_path)
+        # Process single files (if any)
+        if single_files:
+            for file in single_files:
+                if file.filename == '':
+                    print("Skipping empty single file")
+                    continue  # Skip empty uploads
+                # Create full file path for single file upload
+                file_name = secure_filename(file.filename)
+                file_path = os.path.join(upload_base_path, file_name)
+                # Ensure the directory exists before saving the file
+                print(f"Saving single file to: {file_path}")
                 os.makedirs(os.path.dirname(file_path), exist_ok=True)
+                # Save the file
                 file.save(file_path)
+                print("file------------->",file)
+                print("file_path------------->",file_path)
+        # Generate datastore (example task, depending on your logic)
+        asyncio.run(generate_data_store(upload_base_path, db_name))
         return redirect(url_for('list_dbs'))
 @app.route('/select-db/<db_name>', methods=['POST'])
 def select_db(db_name):
+    flash(f"{db_name} Database has been selected", "table_selected")
     # Selecting the Document Vector DB
     global CHROMA_PATH
+    global TABLE_PATH
     print(f"Selected DB: {CHROMA_PATH}")
+    print("-----------------------------------------------------1----")
     CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
     CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
     print(f"Selected DB: {CHROMA_PATH}")
+    print("-----------------------------------------------------2----")
+    # Selecting the Table Vector DB
+    table_db_path = os.path.join(TABLE_DB_FOLDER, db_name)
+    table_db_path = table_db_path.replace("\\", "/")
+    TABLE_PATH = table_db_path if os.path.exists(table_db_path) else None
+    print(f"Selected Table DB: {TABLE_PATH}")
     return redirect(url_for('chat'))