Baskar2005 committed
Commit 0d6f2a4 · verified · 1 Parent(s): 9ef1174

Create app.py

Files changed (1)
  1. app.py +331 -0
app.py ADDED
@@ -0,0 +1,331 @@
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain_unstructured import UnstructuredLoader
from typing import List, Dict, Tuple
import gradio as gr
import validators
import requests
import mimetypes
import tempfile
import os
from langchain.prompts import PromptTemplate
import pandas as pd
from langchain_experimental.agents import create_csv_agent
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
from langchain.agents.agent_types import AgentType

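# Gradio app that answers questions over uploaded documents/URLs (FAISS retrieval
# with an Azure OpenAI chat model) or over CSV files (a CSV agent).
# Note: AzureChatOpenAI / AzureOpenAIEmbeddings read credentials from the
# environment; typically AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT and
# OPENAI_API_VERSION must be set, and the deployment names used below
# ("GPT-4o", "text-embedding-3-large") are assumed to exist in that Azure resource.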
class ChatDocumentQA:
    def __init__(self) -> None:
        pass

    def _get_empty_state(self) -> Dict[str, None]:
        """Create an empty knowledge base."""
        return {"knowledge_base": None}

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """Extract text content from document files via UnstructuredLoader.

        Args:
            file_paths (List[str]): List of file paths.

        Returns:
            List[str]: Documents loaded from the files.
        """
        loader = UnstructuredLoader(file_paths)
        docs = loader.load()
        print("Docs:", docs)
        return docs

    def _get_content_from_url(self, urls: str) -> List[str]:
        """Fetch content from given URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            List[str]: Documents loaded from the downloaded files.
        """
        file_paths = []
        for url in urls.split(','):
            url = url.strip()
            if validators.url(url):
                r = requests.get(url)
                if r.status_code != 200:
                    raise ValueError("Check the URL of your file; it returned status code %s" % r.status_code)
                # Guess a file extension from the Content-Type header and write the
                # response body to a temporary file for UnstructuredLoader.
                content_type = r.headers.get("content-type", "").split(";")[0].strip()
                file_extension = mimetypes.guess_extension(content_type)
                temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
                temp_file.write(r.content)
                temp_file.close()
                file_paths.append(temp_file.name)

        print("File_Paths:", file_paths)
        docs = self._extract_text_from_pdfs(file_paths)
        return docs

    def _split_text_into_chunks(self, text: str) -> List[str]:
        """Split loaded documents into smaller chunks.

        Args:
            text (str): Documents to be split.

        Returns:
            List[str]: List of smaller document chunks.
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=6000, chunk_overlap=0)

        chunks = text_splitter.split_documents(text)

        return chunks

    def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
        """Create a vector store from text chunks.

        Args:
            text_chunks (List[str]): List of text chunks.

        Returns:
            FAISS: Vector store created from the text chunks.
        """
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment="text-embedding-3-large",
        )

        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def _create_conversation_chain(self, vectorstore):
        """Build a ConversationalRetrievalChain that condenses follow-up questions
        into standalone questions before retrieving from the vector store."""
        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History: {chat_history}
        Follow Up Input: {question}
        Standalone question:"""
        CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        llm = AzureChatOpenAI(azure_deployment="GPT-4o")

        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            condense_question_prompt=CONDENSE_QUESTION_PROMPT,
            memory=memory,
        )

    def _get_documents_knowledge_base(self, file_paths: List[str]) -> Tuple[str, Dict[str, FAISS]]:
        """Build knowledge base from uploaded files.

        Args:
            file_paths (List[str]): List of file paths.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        file_path = file_paths[0].name
        file_extension = os.path.splitext(file_path)[1]

        if file_extension == '.csv':
            # CSV files are handled by a CSV agent instead of the vector store.
            agent_chain = create_csv_agent(
                AzureChatOpenAI(azure_deployment="GPT-4o"),
                file_path,
                verbose=True,
                allow_dangerous_code=True
            )
            return "file uploaded", {"knowledge_base": agent_chain}

        else:
            # All other document types go through the load -> split -> embed pipeline.
            pdf_docs = [fp.name for fp in file_paths]
            raw_text = self._extract_text_from_pdfs(pdf_docs)
            text_chunks = self._split_text_into_chunks(raw_text)
            vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
            return "file uploaded", {"knowledge_base": vectorstore}

    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, FAISS]]:
        """Build knowledge base from URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        webpage_text = self._get_content_from_url(urls)
        text_chunks = self._split_text_into_chunks(webpage_text)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    # ************************
    # CSV QA helpers
    # ************************
    # These helpers wrap the CSV agent in a ZeroShotAgent with conversation memory.
    # They are kept for reference; _get_documents_knowledge_base currently calls
    # create_csv_agent directly.
    def create_agent(self, file_path):
        agent_chain = create_csv_agent(
            AzureChatOpenAI(azure_deployment="GPT-4o"),
            file_path,
            verbose=True,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        )
        return agent_chain

    def get_agent_tools(self, agent):
        tools = [
            Tool(
                name="dataframe qa",
                func=agent.run,
                description="useful for when you need to answer questions about table data and dataframe data",
            )
        ]
        return tools

    def create_memory_for_csv_qa(self, tools):
        prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
        suffix = """Begin!

        {chat_history}
        Question: {input}
        {agent_scratchpad}"""

        prompt = ZeroShotAgent.create_prompt(
            tools,
            prefix=prefix,
            suffix=suffix,
            input_variables=["input", "chat_history", "agent_scratchpad"],
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        return memory, tools, prompt

    def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
        llm_chain = LLMChain(llm=AzureChatOpenAI(azure_deployment="GPT-4o"), prompt=prompt)
        agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
        agent_chain = AgentExecutor.from_agent_and_tools(
            agent=agent, tools=tools, verbose=True, memory=memory
        )

        return agent_chain

    def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, FAISS], file_paths) -> Tuple[str, List[Tuple[str, str]]]:
        """Get a response from the chatbot.

        Args:
            message (str): User's message/question.
            chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
            state (dict): State containing the knowledge base.
            file_paths: Files currently attached via the upload button, if any.

        Returns:
            Tuple[str, List[Tuple[str, str]]]: Tuple containing a status message and updated chat history.
        """
        try:
            if file_paths:
                file_path = file_paths[0].name
                file_extension = os.path.splitext(file_path)[1]

                if file_extension == '.csv':
                    # CSV questions go to the CSV agent stored in the state.
                    agent_chain = state["knowledge_base"]
                    response = agent_chain.run(input=message)
                    chat_history.append((message, response))
                    return "", chat_history

                else:
                    vectorstore = state["knowledge_base"]
                    chat = self._create_conversation_chain(vectorstore)
                    response = chat({"question": message, "chat_history": chat_history})
                    chat_history.append((message, response["answer"]))
                    return "", chat_history
            else:
                # No uploaded files: the knowledge base was built from URLs.
                vectorstore = state["knowledge_base"]
                chat = self._create_conversation_chain(vectorstore)
                response = chat({"question": message, "chat_history": chat_history})
                chat_history.append((message, response["answer"]))
                return "", chat_history
        except Exception:
            chat_history.append((message, "Please upload a document or URL."))
            return "", chat_history

    def gradio_interface(self) -> None:
        """Create a Gradio interface for the chatbot."""
        with gr.Blocks(css="#textbox_id textarea {color: white}", theme='SherlockRamos/Feliz') as demo:
            # Hide the default Gradio footer elements.
            gr.HTML("""
                <style>
                .footer {
                    display: none !important;
                }
                footer {
                    display: none !important;
                }
                #foot {
                    display: none !important;
                }
                .svelte-1fzp3xt {
                    display: none !important;
                }
                #root > div > div > div {
                    padding-bottom: 0 !important;
                }
                .custom-footer {
                    text-align: center;
                    padding: 10px;
                    font-size: 14px;
                    color: #333;
                }
                </style>
            """)
            gr.HTML("""<h1 style="color:#000;margin-left:4in;padding-top:10px">Multi Document QA</h1>""")
            state = gr.State(self._get_empty_state())
            chatbot = gr.Chatbot()

            with gr.Row():
                with gr.Column(scale=0.85):
                    msg = gr.Textbox(label="Question", elem_id="textbox_id")
                with gr.Column(scale=0.15):
                    file_output = gr.Textbox(label="File Status")
            with gr.Row():
                with gr.Column(scale=0.85):
                    clear = gr.ClearButton([msg, chatbot])
                with gr.Column(scale=0.15):
                    upload_button = gr.UploadButton(
                        "Browse File",
                        file_types=[".txt", ".pdf", ".docx", ".csv"],
                        file_count="multiple", variant="primary"
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    input_url = gr.Textbox(label="URLs", elem_id="textbox_id")

            # Wire events: URLs and uploads build the knowledge base; the question
            # box routes through _get_response with the current state and files.
            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
            msg.submit(self._get_response, [msg, chatbot, state, upload_button], [msg, chatbot])

        demo.launch(debug=True, allowed_paths=["/content/"])


if __name__ == "__main__":
    chatdocumentqa = ChatDocumentQA()
    chatdocumentqa.gradio_interface()
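
# To try this locally (a rough sketch; package names follow the imports above and
# versions are not pinned by this commit):
#   pip install gradio langchain langchain-openai langchain-community \
#       langchain-experimental langchain-unstructured langchain-text-splitters \
#       faiss-cpu validators requests
# then run `python app.py` with the Azure OpenAI environment variables set.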