WebashalarForML committed on
Commit
5c6971a
·
verified ·
1 Parent(s): 1203483

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -116
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from flask import Flask, render_template, request, redirect, url_for, session
2
  import os
3
  from werkzeug.utils import secure_filename
4
  #from retrival import generate_data_store
@@ -13,6 +13,7 @@ from langchain.schema import Document
13
  from langchain_core.documents import Document
14
  from dotenv import load_dotenv
15
  import re
 
16
  import glob
17
  import shutil
18
  from werkzeug.utils import secure_filename
@@ -32,23 +33,25 @@ app.secret_key = os.urandom(24)
32
  # Configurations
33
  UPLOAD_FOLDER = "uploads/"
34
  VECTOR_DB_FOLDER = "VectorDB/"
35
- #TABLE_DB_FOLDER = "TableDB/"
36
 
37
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
38
 
39
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
40
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
41
- #os.makedirs(TABLE_DB_FOLDER, exist_ok=True)
42
 
43
  # Global variables
44
  CHROMA_PATH = None
45
- TEMP_PATH = None
46
- #TABLE_PATH = None
47
 
48
- #System prompt
 
49
 
50
- '''PROMPT_TEMPLATE = """
51
- You are working with a retrieval-augmented generation (RAG) setup. Your task is to generate a response based on the context provided and the question asked. Consider only the following context strictly, and use it to answer the question. If the question cannot be answered using the context, respond with: "The information requested is not mentioned in the context."
 
 
52
 
53
  Context:
54
  {context}
@@ -59,28 +62,38 @@ Question:
59
  {question}
60
 
61
  Response:
62
- """
63
- '''
64
 
65
- PROMPT_TEMPLATE = """
66
- You are working as a retrieval-augmented generation (RAG) assistant specializing in providing precise and accurate responses. Generate a response based only on the provided context and question, following these concrete instructions:
 
 
67
 
68
- - **Adhere strictly to the context:** Use only the information in the context to answer the question. Do not add any external details or assumptions.
69
- - **Handle multiple chunks:** The context is divided into chunks, separated by "###". Query-related information may be present in any chunk.
70
- - **Focus on relevance:** Identify and prioritize chunks relevant to the question while ignoring unrelated chunks.
71
- - **Answer concisely and factually:** Provide clear, direct, and structured responses based on the retrieved information.
 
72
 
73
  Context:
74
  {context}
75
 
76
  ---
77
 
 
 
 
 
 
78
  Question:
79
  {question}
80
 
81
  Response:
 
 
82
  """
83
 
 
84
  #HFT = os.getenv('HF_TOKEN')
85
  #client = InferenceClient(api_key=HFT)
86
 
@@ -96,59 +109,88 @@ def chat():
96
  print("sessionhist1",session['history'])
97
 
98
  global CHROMA_PATH
99
- #global TABLE_PATH
100
 
101
- #old_db = session.get('old_db', None)
102
- #print(f"Selected DB: {CHROMA_PATH}")
103
 
104
- #if TEMP_PATH is not None and TEMP_PATH != CHROMA_PATH:
105
- # session['history'] = []
106
- #TEMP_PATH = CHROMA_PATH
 
 
107
 
108
  if request.method == 'POST':
109
  query_text = request.form['query_text']
110
  if CHROMA_PATH is None:
111
- return render_template('chat.html', error="No vector database selected!", history=[])
 
112
 
 
113
  # Load the selected Document Database
114
  embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
115
  #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
116
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
117
- results_document = db.similarity_search_with_relevance_scores(query_text, k=3)
118
-
119
- print("results------------------->",results_document)
120
- context_text_document = "\n\n---\n\n".join([doc.page_content for doc, _score in results_document])
121
-
122
-
123
- # # Load the selected Table Database
124
- # #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
125
- # embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
126
- # tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
127
- # results_table = tdb.similarity_search_with_relevance_scores(query_text, k=2)
128
-
129
- # print("results------------------->",results_table)
130
- # context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
131
-
132
- # Prepare the prompt and query the model
133
- prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
134
- prompt = prompt_template.format(context=context_text_document,question=query_text)
135
- #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
136
- print("results------------------->",prompt)
137
 
 
 
 
 
 
 
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  #Model Defining and its use
140
  repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
141
  HFT = os.environ["HF_TOKEN"]
142
  llm = HuggingFaceEndpoint(
143
  repo_id=repo_id,
144
- max_tokens=3000,
 
145
  temperature=0.8,
146
  huggingfacehub_api_token=HFT,
147
  )
148
 
149
  data= llm(prompt)
150
  #data = response.choices[0].message.content
151
- print("LLM response------------------>",data)
152
  # Filtering the unnecessary context.
153
  if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
154
  data = "We do not have information related to your query on our end."
@@ -160,90 +202,65 @@ def chat():
160
  session.modified = True
161
  print("sessionhist2",session['history'])
162
 
163
- return render_template('chat.html', query_text=query_text, answer=data, history=session['history'])
164
 
165
- return render_template('chat.html', history=session['history'])
166
 
167
- '''
168
  @app.route('/create-db', methods=['GET', 'POST'])
169
  def create_db():
170
  if request.method == 'POST':
171
- db_name = request.form['db_name']
 
 
172
 
173
- # Get all files from the uploaded folder
174
- files = request.files.getlist('folder')
175
- if not files:
 
 
 
176
  return "No files uploaded", 400
177
 
178
- # if not exist
179
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
180
- # Define the base upload path
181
  upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
182
- #upload_base_path = upload_base_path.replace("\\", "/")
183
  print(f"Base Upload Path: {upload_base_path}")
184
  os.makedirs(upload_base_path, exist_ok=True)
185
 
186
- # Save each file and recreate folder structure
187
- for file in files:
188
- print("file , files",files,file)
189
- #relative_path = file.filename # This should contain the subfolder structure
190
- file_path = os.path.join(upload_base_path)
191
- #file_path = file_path.replace("\\", "/")
192
 
193
- # Ensure the directory exists before saving the file
194
- print(f"Saving to: {file_path}")
195
- os.makedirs(os.path.dirname(file_path), exist_ok=True)
196
-
197
-
198
- # Get the file path and save it
199
- file_path = os.path.join(upload_base_path, secure_filename(file.filename))
200
- file.save(file_path)
201
-
202
- # Generate datastore
203
- generate_data_store(upload_base_path, db_name)
204
-
205
- # # Clean up uploaded files (if needed)
206
- #if os.path.exists(app.config['UPLOAD_FOLDER']):
207
- # shutil.rmtree(app.config['UPLOAD_FOLDER'])
208
-
209
- return redirect(url_for('list_dbs'))
210
-
211
- return render_template('create_db.html')
212
- '''
213
- @app.route('/create-db', methods=['GET', 'POST'])
214
- def create_db():
215
- if request.method == 'POST':
216
- db_name = request.form['db_name']
217
 
218
- # Ensure the upload folder exists
219
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
220
 
221
- # Define the base upload path
222
- upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
223
- os.makedirs(upload_base_path, exist_ok=True)
 
 
 
224
 
225
- # Check for uploaded folder or files
226
- folder_files = request.files.getlist('folder')
227
- single_files = request.files.getlist('file')
228
 
229
- if folder_files and any(file.filename for file in folder_files):
230
- # Process folder files
231
- for file in folder_files:
232
- file_path = os.path.join(upload_base_path, secure_filename(file.filename))
233
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
234
- file.save(file_path)
235
 
236
- elif single_files and any(file.filename for file in single_files):
237
- # Process single files
238
- for file in single_files:
239
- file_path = os.path.join(upload_base_path, secure_filename(file.filename))
240
  file.save(file_path)
 
 
241
 
242
- else:
243
- return "No files uploaded", 400
244
-
245
- # Generate datastore
246
- generate_data_store(upload_base_path, db_name)
247
 
248
  return redirect(url_for('list_dbs'))
249
 
@@ -256,21 +273,22 @@ def list_dbs():
256
 
257
  @app.route('/select-db/<db_name>', methods=['POST'])
258
  def select_db(db_name):
259
-
260
  # Selecting the Document Vector DB
261
  global CHROMA_PATH
 
262
  print(f"Selected DB: {CHROMA_PATH}")
 
263
  CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
264
  CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
265
  print(f"Selected DB: {CHROMA_PATH}")
 
266
 
267
- #Selecting the Table Vector DB
268
- # global TABLE_PATH
269
- # print(f"Selected DB: {TABLE_PATH}")
270
- # TABLE_PATH = os.path.join(TABLE_DB_FOLDER, db_name)
271
- # TABLE_PATH = TABLE_PATH.replace("\\", "/")
272
- # print(f"Selected DB: {TABLE_PATH}")
273
-
274
 
275
  return redirect(url_for('chat'))
276
 
 
1
+ from flask import Flask, render_template, request, redirect, url_for, session, flash
2
  import os
3
  from werkzeug.utils import secure_filename
4
  #from retrival import generate_data_store
 
13
  from langchain_core.documents import Document
14
  from dotenv import load_dotenv
15
  import re
16
+ import numpy as np
17
  import glob
18
  import shutil
19
  from werkzeug.utils import secure_filename
 
33
  # Configurations
34
  UPLOAD_FOLDER = "uploads/"
35
  VECTOR_DB_FOLDER = "VectorDB/"
36
+ TABLE_DB_FOLDER = "TableDB/"
37
 
38
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
39
 
40
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
41
  os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
42
+ os.makedirs(TABLE_DB_FOLDER, exist_ok=True)
43
 
44
  # Global variables
45
  CHROMA_PATH = None
46
+ TABLE_PATH = None
 
47
 
48
+ PROMPT_TEMPLATE_DOC = """
49
+ <s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
50
 
51
+ - Use only the provided context; do not add external information.
52
+ - The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
53
+ - If available, use the provided source information to support the response.
54
+ - Answer concisely and factually.
55
 
56
  Context:
57
  {context}
 
62
  {question}
63
 
64
  Response:
65
+ [/INST]
 
66
 
67
+ """
68
+ # Prompt used when the selected document also has a table database
69
+ PROMPT_TEMPLATE_TAB = """
70
+ <s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
71
 
72
+ - Use only the provided context; do not add external information.
73
+ - The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
74
+ - If available, use the provided source information to support the response.
75
+ - If a table is provided as html, incorporate its relevant details into the response while maintaining a structured format.
76
+ - Answer concisely and factually.
77
 
78
  Context:
79
  {context}
80
 
81
  ---
82
 
83
+ Table:
84
+ {table}
85
+
86
+ ---
87
+
88
  Question:
89
  {question}
90
 
91
  Response:
92
+ [/INST]
93
+
94
  """
95
 
96
+
97
  #HFT = os.getenv('HF_TOKEN')
98
  #client = InferenceClient(api_key=HFT)
99
 
 
109
  print("sessionhist1",session['history'])
110
 
111
  global CHROMA_PATH
112
+ global TABLE_PATH
113
 
114
+ old_db = session.get('old_db', None)
115
+ print(f"Selected DB: {CHROMA_PATH}")
116
 
117
+ # if old_db != None:
118
+ # if CHROMA_PATH != old_db:
119
+ # session['history'] = []
120
+
121
+ #print("sessionhist1",session['history'])
122
 
123
  if request.method == 'POST':
124
  query_text = request.form['query_text']
125
  if CHROMA_PATH is None:
126
+ flash("Please select a database first!", "error")
127
+ return redirect(url_for('list_dbs'))
128
 
129
+
130
  # Load the selected Document Database
131
  embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
132
  #embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
133
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
134
+ # Convert the query to its embedding vector
135
+ query_embedding = embedding_function.embed_query(query_text)
136
+ if isinstance(query_embedding, float):
137
+ query_embedding = [query_embedding]
138
+ # print(f"Query embedding: {query_embedding}")
139
+ # print(f"Type of query embedding: {type(query_embedding)}")
140
+ # print(f"Length of query embedding: {len(query_embedding) if isinstance(query_embedding, (list, np.ndarray)) else 'Not applicable'}")
141
+ results_document = db.similarity_search_by_vector_with_relevance_scores(
142
+ embedding=query_embedding, # Pass the query embedding
143
+ k=3,
144
+ #filter=filter_condition # Pass the filter condition
145
+ )
 
 
 
 
 
 
 
 
146
 
147
+ print("results------------------->",results_document)
148
+ print("============================================")
149
+ print("============================================")
150
+
151
+ context_text_document = " \n\n###\n\n ".join(
152
+ [f"Source: {doc.metadata.get('source', '')} Page_content:{doc.page_content}\n" for doc, _score in results_document]
153
+ )
154
 
155
+ # Loading Table Database only if available
156
+ if TABLE_PATH is not None:
157
+ #embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
158
+ embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
159
+ tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
160
+ results_table = tdb.similarity_search_by_vector_with_relevance_scores(
161
+ embedding=query_embedding, # Pass the query embedding
162
+ k=2
163
+ #filter=filter_condition # Pass the filter condition
164
+ )
165
+ print("results------------------->",results_table)
166
+ context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
167
+
168
+ # Prepare the prompt and query the model
169
+ prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_TAB)
170
+ prompt = prompt_template.format(context=context_text_document,table=context_text_table,question=query_text)
171
+ #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
172
+ print("results------------------->",prompt)
173
+ else:
174
+ # Prepare the prompt and query the model
175
+ prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_DOC)
176
+ prompt = prompt_template.format(context=context_text_document,question=query_text)
177
+ #prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
178
+ print("results------------------->",prompt)
179
+
180
  #Model Defining and its use
181
  repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
182
  HFT = os.environ["HF_TOKEN"]
183
  llm = HuggingFaceEndpoint(
184
  repo_id=repo_id,
185
+ #max_tokens=3000,
186
+ max_new_tokens=2000,
187
  temperature=0.8,
188
  huggingfacehub_api_token=HFT,
189
  )
190
 
191
  data= llm(prompt)
192
  #data = response.choices[0].message.content
193
+
194
  # Filtering the unnecessary context.
195
  if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
196
  data = "We do not have information related to your query on our end."
 
202
  session.modified = True
203
  print("sessionhist2",session['history'])
204
 
205
+ return render_template('chat.html', query_text=query_text, answer=data, history=session['history'],old_db=CHROMA_PATH)
206
 
207
+ return render_template('chat.html', history=session['history'], old_db=CHROMA_PATH)
208
 
 
209
  @app.route('/create-db', methods=['GET', 'POST'])
210
  def create_db():
211
  if request.method == 'POST':
212
+ db_name = request.form.get('db_name', '').strip()
213
+ if not db_name:
214
+ return "Database name is required", 400
215
 
216
+ # Get uploaded files
217
+ files = request.files.getlist('folder') # Folder uploads (multiple files)
218
+ single_files = request.files.getlist('file') # Single file uploads
219
+
220
+ # Check if any file is uploaded
221
+ if not files and not single_files:
222
  return "No files uploaded", 400
223
 
224
+ # Create upload directory
 
 
225
  upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
 
226
  print(f"Base Upload Path: {upload_base_path}")
227
  os.makedirs(upload_base_path, exist_ok=True)
228
 
229
+ # Process folder files (if any)
230
+ if files:
231
+ for file in files:
232
+ file_name = secure_filename(file.filename) # Ensure the file name is safe
233
+ file_path = os.path.join(upload_base_path, file_name)
 
234
 
235
+ # Ensure the directory exists before saving the file
236
+ print(f"Saving to: {file_path}")
237
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
+ # Save the file
240
+ file.save(file_path)
241
 
242
+ # Process single files (if any)
243
+ if single_files:
244
+ for file in single_files:
245
+ if file.filename == '':
246
+ print("Skipping empty single file")
247
+ continue # Skip empty uploads
248
 
249
+ # Create full file path for single file upload
250
+ file_name = secure_filename(file.filename)
251
+ file_path = os.path.join(upload_base_path, file_name)
252
 
253
+ # Ensure the directory exists before saving the file
254
+ print(f"Saving single file to: {file_path}")
 
 
255
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
256
 
257
+ # Save the file
 
 
 
258
  file.save(file_path)
259
+ print("file------------->",file)
260
+ print("file_path------------->",file_path)
261
 
262
+ # Generate datastore (example task, depending on your logic)
263
+ asyncio.run(generate_data_store(upload_base_path, db_name))
 
 
 
264
 
265
  return redirect(url_for('list_dbs'))
266
 
 
273
 
274
  @app.route('/select-db/<db_name>', methods=['POST'])
275
  def select_db(db_name):
276
+ flash(f"{db_name} Database has been selected", "table_selected")
277
  #Selecting the Documnet Vector DB
278
  global CHROMA_PATH
279
+ global TABLE_PATH
280
  print(f"Selected DB: {CHROMA_PATH}")
281
+ print("-----------------------------------------------------1----")
282
  CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
283
  CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
284
  print(f"Selected DB: {CHROMA_PATH}")
285
+ print("-----------------------------------------------------2----")
286
 
287
+ # Selecting the Table Vector DB
288
+ table_db_path = os.path.join(TABLE_DB_FOLDER, db_name)
289
+ table_db_path = table_db_path.replace("\\", "/")
290
+ TABLE_PATH = table_db_path if os.path.exists(table_db_path) else None
291
+ print(f"Selected Table DB: {TABLE_PATH}")
 
 
292
 
293
  return redirect(url_for('chat'))
294