Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from flask import Flask, render_template, request, redirect, url_for, session
|
2 |
import os
|
3 |
from werkzeug.utils import secure_filename
|
4 |
#from retrival import generate_data_store
|
@@ -13,6 +13,7 @@ from langchain.schema import Document
|
|
13 |
from langchain_core.documents import Document
|
14 |
from dotenv import load_dotenv
|
15 |
import re
|
|
|
16 |
import glob
|
17 |
import shutil
|
18 |
from werkzeug.utils import secure_filename
|
@@ -32,23 +33,25 @@ app.secret_key = os.urandom(24)
|
|
32 |
# Configurations
|
33 |
UPLOAD_FOLDER = "uploads/"
|
34 |
VECTOR_DB_FOLDER = "VectorDB/"
|
35 |
-
|
36 |
|
37 |
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
38 |
|
39 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
40 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
41 |
-
|
42 |
|
43 |
# Global variables
|
44 |
CHROMA_PATH = None
|
45 |
-
|
46 |
-
#TABLE_PATH = None
|
47 |
|
48 |
-
|
|
|
49 |
|
50 |
-
|
51 |
-
|
|
|
|
|
52 |
|
53 |
Context:
|
54 |
{context}
|
@@ -59,28 +62,38 @@ Question:
|
|
59 |
{question}
|
60 |
|
61 |
Response:
|
62 |
-
|
63 |
-
'''
|
64 |
|
65 |
-
|
66 |
-
|
|
|
|
|
67 |
|
68 |
-
-
|
69 |
-
-
|
70 |
-
-
|
71 |
-
-
|
|
|
72 |
|
73 |
Context:
|
74 |
{context}
|
75 |
|
76 |
---
|
77 |
|
|
|
|
|
|
|
|
|
|
|
78 |
Question:
|
79 |
{question}
|
80 |
|
81 |
Response:
|
|
|
|
|
82 |
"""
|
83 |
|
|
|
84 |
#HFT = os.getenv('HF_TOKEN')
|
85 |
#client = InferenceClient(api_key=HFT)
|
86 |
|
@@ -96,59 +109,88 @@ def chat():
|
|
96 |
print("sessionhist1",session['history'])
|
97 |
|
98 |
global CHROMA_PATH
|
99 |
-
|
100 |
|
101 |
-
|
102 |
-
|
103 |
|
104 |
-
#if
|
105 |
-
#
|
106 |
-
#
|
|
|
|
|
107 |
|
108 |
if request.method == 'POST':
|
109 |
query_text = request.form['query_text']
|
110 |
if CHROMA_PATH is None:
|
111 |
-
|
|
|
112 |
|
|
|
113 |
# Load the selected Document Database
|
114 |
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
115 |
#embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
|
116 |
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
#
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
# print("results------------------->",results_table)
|
130 |
-
# context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
|
131 |
-
|
132 |
-
# Prepare the prompt and query the model
|
133 |
-
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
|
134 |
-
prompt = prompt_template.format(context=context_text_document,question=query_text)
|
135 |
-
#prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
|
136 |
-
print("results------------------->",prompt)
|
137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# Define the model and use it
|
140 |
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
|
141 |
HFT = os.environ["HF_TOKEN"]
|
142 |
llm = HuggingFaceEndpoint(
|
143 |
repo_id=repo_id,
|
144 |
-
max_tokens=3000,
|
|
|
145 |
temperature=0.8,
|
146 |
huggingfacehub_api_token=HFT,
|
147 |
)
|
148 |
|
149 |
data= llm(prompt)
|
150 |
#data = response.choices[0].message.content
|
151 |
-
|
152 |
# Filter out the unnecessary context.
|
153 |
if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
|
154 |
data = "We do not have information related to your query on our end."
|
@@ -160,90 +202,65 @@ def chat():
|
|
160 |
session.modified = True
|
161 |
print("sessionhist2",session['history'])
|
162 |
|
163 |
-
return render_template('chat.html', query_text=query_text, answer=data, history=session['history'])
|
164 |
|
165 |
-
return render_template('chat.html', history=session['history'])
|
166 |
|
167 |
-
'''
|
168 |
@app.route('/create-db', methods=['GET', 'POST'])
|
169 |
def create_db():
|
170 |
if request.method == 'POST':
|
171 |
-
db_name = request.form
|
|
|
|
|
172 |
|
173 |
-
# Get
|
174 |
-
files = request.files.getlist('folder')
|
175 |
-
|
|
|
|
|
|
|
176 |
return "No files uploaded", 400
|
177 |
|
178 |
-
#
|
179 |
-
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
180 |
-
# Define the base upload path
|
181 |
upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
|
182 |
-
#upload_base_path = upload_base_path.replace("\\", "/")
|
183 |
print(f"Base Upload Path: {upload_base_path}")
|
184 |
os.makedirs(upload_base_path, exist_ok=True)
|
185 |
|
186 |
-
#
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
#file_path = file_path.replace("\\", "/")
|
192 |
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
# Get the file path and save it
|
199 |
-
file_path = os.path.join(upload_base_path, secure_filename(file.filename))
|
200 |
-
file.save(file_path)
|
201 |
-
|
202 |
-
# Generate datastore
|
203 |
-
generate_data_store(upload_base_path, db_name)
|
204 |
-
|
205 |
-
# # Clean up uploaded files (if needed)
|
206 |
-
#if os.path.exists(app.config['UPLOAD_FOLDER']):
|
207 |
-
# shutil.rmtree(app.config['UPLOAD_FOLDER'])
|
208 |
-
|
209 |
-
return redirect(url_for('list_dbs'))
|
210 |
-
|
211 |
-
return render_template('create_db.html')
|
212 |
-
'''
|
213 |
-
@app.route('/create-db', methods=['GET', 'POST'])
|
214 |
-
def create_db():
|
215 |
-
if request.method == 'POST':
|
216 |
-
db_name = request.form['db_name']
|
217 |
|
218 |
-
|
219 |
-
|
220 |
|
221 |
-
#
|
222 |
-
|
223 |
-
|
|
|
|
|
|
|
224 |
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
|
229 |
-
|
230 |
-
|
231 |
-
for file in folder_files:
|
232 |
-
file_path = os.path.join(upload_base_path, secure_filename(file.filename))
|
233 |
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
234 |
-
file.save(file_path)
|
235 |
|
236 |
-
|
237 |
-
# Process single files
|
238 |
-
for file in single_files:
|
239 |
-
file_path = os.path.join(upload_base_path, secure_filename(file.filename))
|
240 |
file.save(file_path)
|
|
|
|
|
241 |
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
# Generate datastore
|
246 |
-
generate_data_store(upload_base_path, db_name)
|
247 |
|
248 |
return redirect(url_for('list_dbs'))
|
249 |
|
@@ -256,21 +273,22 @@ def list_dbs():
|
|
256 |
|
257 |
@app.route('/select-db/<db_name>', methods=['POST'])
|
258 |
def select_db(db_name):
|
259 |
-
|
260 |
# Selecting the Document Vector DB
|
261 |
global CHROMA_PATH
|
|
|
262 |
print(f"Selected DB: {CHROMA_PATH}")
|
|
|
263 |
CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
|
264 |
CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
|
265 |
print(f"Selected DB: {CHROMA_PATH}")
|
|
|
266 |
|
267 |
-
#Selecting the Table Vector DB
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
# print(f"Selected DB: {TABLE_PATH}")
|
273 |
-
|
274 |
|
275 |
return redirect(url_for('chat'))
|
276 |
|
|
|
1 |
+
from flask import Flask, render_template, request, redirect, url_for, session, flash
|
2 |
import os
|
3 |
from werkzeug.utils import secure_filename
|
4 |
#from retrival import generate_data_store
|
|
|
13 |
from langchain_core.documents import Document
|
14 |
from dotenv import load_dotenv
|
15 |
import re
|
16 |
+
import numpy as np
|
17 |
import glob
|
18 |
import shutil
|
19 |
from werkzeug.utils import secure_filename
|
|
|
33 |
# Configurations
|
34 |
UPLOAD_FOLDER = "uploads/"
|
35 |
VECTOR_DB_FOLDER = "VectorDB/"
|
36 |
+
TABLE_DB_FOLDER = "TableDB/"
|
37 |
|
38 |
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
39 |
|
40 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
41 |
os.makedirs(VECTOR_DB_FOLDER, exist_ok=True)
|
42 |
+
os.makedirs(TABLE_DB_FOLDER, exist_ok=True)
|
43 |
|
44 |
# Global variables
|
45 |
CHROMA_PATH = None
|
46 |
+
TABLE_PATH = None
|
|
|
47 |
|
48 |
+
PROMPT_TEMPLATE_DOC = """
|
49 |
+
<s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
|
50 |
|
51 |
+
- Use only the provided context; do not add external information.
|
52 |
+
- The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
|
53 |
+
- If available, use the provided source information to support the response.
|
54 |
+
- Answer concisely and factually.
|
55 |
|
56 |
Context:
|
57 |
{context}
|
|
|
62 |
{question}
|
63 |
|
64 |
Response:
|
65 |
+
[/INST]
|
|
|
66 |
|
67 |
+
"""
|
68 |
+
# Prompt used when the document contains tables
|
69 |
+
PROMPT_TEMPLATE_TAB = """
|
70 |
+
<s>[INST] You are a retrieval-augmented generation (RAG) assistant. Your task is to generate a response strictly based on the given context. Follow these instructions:
|
71 |
|
72 |
+
- Use only the provided context; do not add external information.
|
73 |
+
- The context contains multiple retrieved chunks separated by "###". Choose only the most relevant chunks to answer the question and ignore unrelated ones.
|
74 |
+
- If available, use the provided source information to support the response.
|
75 |
+
- If a table is provided as html, incorporate its relevant details into the response while maintaining a structured format.
|
76 |
+
- Answer concisely and factually.
|
77 |
|
78 |
Context:
|
79 |
{context}
|
80 |
|
81 |
---
|
82 |
|
83 |
+
Table:
|
84 |
+
{table}
|
85 |
+
|
86 |
+
---
|
87 |
+
|
88 |
Question:
|
89 |
{question}
|
90 |
|
91 |
Response:
|
92 |
+
[/INST]
|
93 |
+
|
94 |
"""
|
95 |
|
96 |
+
|
97 |
#HFT = os.getenv('HF_TOKEN')
|
98 |
#client = InferenceClient(api_key=HFT)
|
99 |
|
|
|
109 |
print("sessionhist1",session['history'])
|
110 |
|
111 |
global CHROMA_PATH
|
112 |
+
global TABLE_PATH
|
113 |
|
114 |
+
old_db = session.get('old_db', None)
|
115 |
+
print(f"Selected DB: {CHROMA_PATH}")
|
116 |
|
117 |
+
# if old_db != None:
|
118 |
+
# if CHROMA_PATH != old_db:
|
119 |
+
# session['history'] = []
|
120 |
+
|
121 |
+
#print("sessionhist1",session['history'])
|
122 |
|
123 |
if request.method == 'POST':
|
124 |
query_text = request.form['query_text']
|
125 |
if CHROMA_PATH is None:
|
126 |
+
flash("Please select a database first!", "error")
|
127 |
+
return redirect(url_for('list_dbs'))
|
128 |
|
129 |
+
|
130 |
# Load the selected Document Database
|
131 |
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
132 |
#embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
|
133 |
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
|
134 |
+
# Convert the query to its embedding vector
|
135 |
+
query_embedding = embedding_function.embed_query(query_text)
|
136 |
+
if isinstance(query_embedding, float):
|
137 |
+
query_embedding = [query_embedding]
|
138 |
+
# print(f"Query embedding: {query_embedding}")
|
139 |
+
# print(f"Type of query embedding: {type(query_embedding)}")
|
140 |
+
# print(f"Length of query embedding: {len(query_embedding) if isinstance(query_embedding, (list, np.ndarray)) else 'Not applicable'}")
|
141 |
+
results_document = db.similarity_search_by_vector_with_relevance_scores(
|
142 |
+
embedding=query_embedding, # Pass the query embedding
|
143 |
+
k=3,
|
144 |
+
#filter=filter_condition # Pass the filter condition
|
145 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
+
print("results------------------->",results_document)
|
148 |
+
print("============================================")
|
149 |
+
print("============================================")
|
150 |
+
|
151 |
+
context_text_document = " \n\n###\n\n ".join(
|
152 |
+
[f"Source: {doc.metadata.get('source', '')} Page_content:{doc.page_content}\n" for doc, _score in results_document]
|
153 |
+
)
|
154 |
|
155 |
+
# Loading Table Database only if available
|
156 |
+
if TABLE_PATH is not None:
|
157 |
+
#embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
158 |
+
embedding_function = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
|
159 |
+
tdb = Chroma(persist_directory=TABLE_PATH, embedding_function=embedding_function)
|
160 |
+
results_table = tdb.similarity_search_by_vector_with_relevance_scores(
|
161 |
+
embedding=query_embedding, # Pass the query embedding
|
162 |
+
k=2
|
163 |
+
#filter=filter_condition # Pass the filter condition
|
164 |
+
)
|
165 |
+
print("results------------------->",results_table)
|
166 |
+
context_text_table = "\n\n---\n\n".join([doc.page_content for doc, _score in results_table])
|
167 |
+
|
168 |
+
# Prepare the prompt and query the model
|
169 |
+
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_TAB)
|
170 |
+
prompt = prompt_template.format(context=context_text_document,table=context_text_table,question=query_text)
|
171 |
+
#prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
|
172 |
+
print("results------------------->",prompt)
|
173 |
+
else:
|
174 |
+
# Prepare the prompt and query the model
|
175 |
+
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE_DOC)
|
176 |
+
prompt = prompt_template.format(context=context_text_document,question=query_text)
|
177 |
+
#prompt = prompt_template.format(context=context_text_document,table=context_text_table, question=query_text)
|
178 |
+
print("results------------------->",prompt)
|
179 |
+
|
180 |
# Define the model and use it
|
181 |
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
|
182 |
HFT = os.environ["HF_TOKEN"]
|
183 |
llm = HuggingFaceEndpoint(
|
184 |
repo_id=repo_id,
|
185 |
+
#max_tokens=3000,
|
186 |
+
max_new_tokens=2000,
|
187 |
temperature=0.8,
|
188 |
huggingfacehub_api_token=HFT,
|
189 |
)
|
190 |
|
191 |
data= llm(prompt)
|
192 |
#data = response.choices[0].message.content
|
193 |
+
|
194 |
# Filter out the unnecessary context.
|
195 |
if re.search(r'\bmention\b|\bnot mention\b|\bnot mentioned\b|\bnot contain\b|\bnot include\b|\bnot provide\b|\bdoes not\b|\bnot explicitly\b|\bnot explicitly mentioned\b', data, re.IGNORECASE):
|
196 |
data = "We do not have information related to your query on our end."
|
|
|
202 |
session.modified = True
|
203 |
print("sessionhist2",session['history'])
|
204 |
|
205 |
+
return render_template('chat.html', query_text=query_text, answer=data, history=session['history'],old_db=CHROMA_PATH)
|
206 |
|
207 |
+
return render_template('chat.html', history=session['history'], old_db=CHROMA_PATH)
|
208 |
|
|
|
209 |
@app.route('/create-db', methods=['GET', 'POST'])
|
210 |
def create_db():
|
211 |
if request.method == 'POST':
|
212 |
+
db_name = request.form.get('db_name', '').strip()
|
213 |
+
if not db_name:
|
214 |
+
return "Database name is required", 400
|
215 |
|
216 |
+
# Get uploaded files
|
217 |
+
files = request.files.getlist('folder') # Folder uploads (multiple files)
|
218 |
+
single_files = request.files.getlist('file') # Single file uploads
|
219 |
+
|
220 |
+
# Check if any file is uploaded
|
221 |
+
if not files and not single_files:
|
222 |
return "No files uploaded", 400
|
223 |
|
224 |
+
# Create upload directory
|
|
|
|
|
225 |
upload_base_path = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(db_name))
|
|
|
226 |
print(f"Base Upload Path: {upload_base_path}")
|
227 |
os.makedirs(upload_base_path, exist_ok=True)
|
228 |
|
229 |
+
# Process folder files (if any)
|
230 |
+
if files:
|
231 |
+
for file in files:
|
232 |
+
file_name = secure_filename(file.filename) # Ensure the file name is safe
|
233 |
+
file_path = os.path.join(upload_base_path, file_name)
|
|
|
234 |
|
235 |
+
# Ensure the directory exists before saving the file
|
236 |
+
print(f"Saving to: {file_path}")
|
237 |
+
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
+
# Save the file
|
240 |
+
file.save(file_path)
|
241 |
|
242 |
+
# Process single files (if any)
|
243 |
+
if single_files:
|
244 |
+
for file in single_files:
|
245 |
+
if file.filename == '':
|
246 |
+
print("Skipping empty single file")
|
247 |
+
continue # Skip empty uploads
|
248 |
|
249 |
+
# Create full file path for single file upload
|
250 |
+
file_name = secure_filename(file.filename)
|
251 |
+
file_path = os.path.join(upload_base_path, file_name)
|
252 |
|
253 |
+
# Ensure the directory exists before saving the file
|
254 |
+
print(f"Saving single file to: {file_path}")
|
|
|
|
|
255 |
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
|
|
256 |
|
257 |
+
# Save the file
|
|
|
|
|
|
|
258 |
file.save(file_path)
|
259 |
+
print("file------------->",file)
|
260 |
+
print("file_path------------->",file_path)
|
261 |
|
262 |
+
# Generate datastore (example task, depending on your logic)
|
263 |
+
asyncio.run(generate_data_store(upload_base_path, db_name))
|
|
|
|
|
|
|
264 |
|
265 |
return redirect(url_for('list_dbs'))
|
266 |
|
|
|
273 |
|
274 |
@app.route('/select-db/<db_name>', methods=['POST'])
|
275 |
def select_db(db_name):
|
276 |
+
flash(f"{db_name} Database has been selected", "table_selected")
|
277 |
# Selecting the Document Vector DB
|
278 |
global CHROMA_PATH
|
279 |
+
global TABLE_PATH
|
280 |
print(f"Selected DB: {CHROMA_PATH}")
|
281 |
+
print("-----------------------------------------------------1----")
|
282 |
CHROMA_PATH = os.path.join(VECTOR_DB_FOLDER, db_name)
|
283 |
CHROMA_PATH = CHROMA_PATH.replace("\\", "/")
|
284 |
print(f"Selected DB: {CHROMA_PATH}")
|
285 |
+
print("-----------------------------------------------------2----")
|
286 |
|
287 |
+
# Selecting the Table Vector DB
|
288 |
+
table_db_path = os.path.join(TABLE_DB_FOLDER, db_name)
|
289 |
+
table_db_path = table_db_path.replace("\\", "/")
|
290 |
+
TABLE_PATH = table_db_path if os.path.exists(table_db_path) else None
|
291 |
+
print(f"Selected Table DB: {TABLE_PATH}")
|
|
|
|
|
292 |
|
293 |
return redirect(url_for('chat'))
|
294 |
|