RoAr777 committed on
Commit
55b729c
·
verified ·
1 Parent(s): 0fdf3e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -259
app.py CHANGED
@@ -1,260 +1,193 @@
1
- import PyPDF2
2
- import re
3
- from sentence_transformers import SentenceTransformer
4
- import faiss
5
- from langchain.agents import initialize_agent, AgentType,Tool
6
- from langchain.schema import HumanMessage
7
- from langchain_google_genai import ChatGoogleGenerativeAI
8
- import gradio as gr
9
- import os
10
- import pytesseract
11
- from PIL import Image
12
- pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"
13
-
14
-
15
-
16
- def load_pdf_text(file_path):
17
- with open(file_path, "rb") as file:
18
- reader = PyPDF2.PdfReader(file)
19
- text = ""
20
- for page in reader.pages:
21
- text += page.extract_text()
22
- return text
23
-
24
- def chunk_text(text, chunk_size=700):
25
- # Splits the text into chunks of chunk_size while preserving sentences
26
- chunks = []
27
- sentences = re.split(r'(?<=[.!?])\s+', text)
28
- current_chunk = ""
29
- for sentence in sentences:
30
- if len(current_chunk) + len(sentence) > chunk_size:
31
- chunks.append(current_chunk)
32
- current_chunk = sentence
33
- else:
34
- current_chunk += " " + sentence
35
- chunks.append(current_chunk)
36
- return chunks
37
-
38
-
39
- def load_and_process_chapters(directory):
40
- chapter_data = {}
41
- for filename in os.listdir(directory):
42
- if filename.endswith(".pdf"):
43
- file_path = os.path.join(directory, filename)
44
- text = load_pdf_text(file_path)
45
- chunks = chunk_text(text)
46
- chapter_data[filename] = chunks # Use filename as key
47
- return chapter_data
48
-
49
- ipc_data = load_and_process_chapters("IPC")
50
- crpc_data=load_and_process_chapters("CrPC")
51
- # Step 2: Embeddings and Indexing
52
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
53
-
54
- index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
55
- index2 = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
56
-
57
- # Flatten the chapter data and keep track of chapter and chunk indices
58
- flattened_data = []
59
- pdf_filenames = [] # Store PDF filenames for citation
60
- chunk_indices = []
61
- for pdf_filename, chunks in ipc_data.items():
62
- for i, chunk in enumerate(chunks):
63
- flattened_data.append(chunk)
64
- pdf_filenames.append(pdf_filename)
65
- chunk_indices.append(i)
66
-
67
- embeddings = model.encode(flattened_data)
68
- index.add(embeddings)
69
-
70
- flattened_data2 = []
71
- pdf_filenames2 = [] # Store PDF filenames for citation
72
- chunk_indices2 = []
73
- for pdf_filename, chunks in crpc_data.items():
74
- for i, chunk in enumerate(chunks):
75
- flattened_data2.append(chunk)
76
- pdf_filenames2.append(pdf_filename)
77
- chunk_indices2.append(i)
78
-
79
- embeddings = model.encode(flattened_data2)
80
- index2.add(embeddings)
81
-
82
- # Step 3: Retrieval with Citations using PDF filename
83
- def retrieve_info_with_citation(query, top_k=5):
84
- query_embedding = model.encode([query])
85
- D, I = index.search(query_embedding, k=top_k)
86
-
87
- results = []
88
- for i in range(min(top_k, len(I[0]))):
89
- if D[0][i] < 1.0: # Relevance threshold
90
- chunk_index = I[0][i]
91
- pdf_filename = pdf_filenames[chunk_index]
92
- chunk_number = chunk_indices[chunk_index] + 1
93
- match = flattened_data[chunk_index]
94
- citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
95
- results.append((match, citation))
96
- else:
97
- break
98
-
99
- if results:
100
- return results
101
- else:
102
- return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
103
-
104
-
105
- def retrieve_info_with_citation2(query, top_k=5):
106
- query_embedding = model.encode([query])
107
- D, I = index2.search(query_embedding, k=top_k)
108
-
109
- results = []
110
- for i in range(min(top_k, len(I[0]))):
111
- if D[0][i] < 1.0: # Relevance threshold
112
- chunk_index = I[0][i]
113
- pdf_filename = pdf_filenames2[chunk_index]
114
- chunk_number = chunk_indices2[chunk_index] + 1
115
- match = flattened_data2[chunk_index]
116
- citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
117
- results.append((match, citation))
118
- else:
119
- break
120
-
121
- if results:
122
- return results
123
- else:
124
- return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
125
-
126
- def retrieve_info(query):
127
- results = retrieve_info_with_citation(query)
128
- formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
129
- return formatted_results
130
-
131
- def retrieve_info2(query):
132
- results = retrieve_info_with_citation2(query)
133
- formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
134
- return formatted_results
135
-
136
- ipc_tool = Tool(
137
- name="IPC Information Retrieval",
138
- func=retrieve_info,
139
- description="Retrieve information from the Indian Penal Code Related to query keyword(s)."
140
- )
141
-
142
- crpc_tool=Tool(
143
- name="CrPC Information Retrieval",
144
- func=retrieve_info2,
145
- description="Retrieve information from the Code of Criminal Procedure(CrPC) Related to query keyword(s)."
146
- )
147
- llm = ChatGoogleGenerativeAI(
148
- model="gemini-1.5-pro",
149
- temperature=0.25,
150
- max_tokens=None,
151
- timeout=None,
152
- max_retries=2,
153
- prompt_template="""
154
- You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
155
- Your primary task is to retrieve and summarize legal information accurately from the IPC.pdf document provided to you.
156
- Your responses should be highly specific, fact-based, and free from any speculation or hallucinations.
157
- Always cite the exact section from the IPC when providing an answer.
158
- If the information is not available in the document, clearly state that and do not make any assumptions.
159
-
160
- Example task: "What is the punishment for theft according to the IPC?"
161
- Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
162
-
163
- Task: {{query}}
164
-
165
- Response:
166
- """,
167
- )
168
-
169
- agent_tools = [ipc_tool,crpc_tool]
170
-
171
- agent = initialize_agent(
172
- tools=agent_tools,
173
- llm=llm,
174
- agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
175
- verbose=True,
176
- return_intermediate_steps=True,
177
- handle_parsing_errors=True,
178
- )
179
- def encode_image_to_base64(image_path):
180
- return pytesseract.image_to_string(Image.open(image_path))
181
- def chatbot_response(query):
182
- if query.get('files'):
183
- # Encode image to base64
184
- image_data=""
185
- for x in range(len(query["files"])):
186
- image_data += f"{x}. "+encode_image_to_base64(query["files"][x]) +"\n"
187
-
188
- # Create a multimodal message with both text and image data
189
- message = HumanMessage(
190
- content=[
191
- {"type": "text", "text": query['text'] +" System :Image(s) was added to this prompt by this user. Text Extracted from this image (Some words may be misspelled ,Use your understanding ):"+image_data}, # Add text input
192
-
193
- ]
194
- )
195
- else:
196
- # If no image, only pass the text
197
- message = HumanMessage(content=[{"type": "text", "text": query}])
198
-
199
- # Invoke the model with the multimodal message
200
- result = agent.invoke([message])
201
- response = result['output']
202
- intermediate_steps = result.get('intermediate_steps', [])
203
-
204
- thought_process = ""
205
- for action, observation in intermediate_steps:
206
- thought_process += f"**Thought:** {action.log}\n"
207
- thought_process += f"**Action:** {action.tool}\n"
208
- thought_process += f"**Observation:** {observation}\n\n"
209
-
210
- return response, thought_process.strip()
211
- # Step 5: Gradio Interface
212
- from gradio import ChatMessage
213
- def chatbot_interface(messages,prompt):
214
- response, thought_process = chatbot_response(prompt)
215
- #messages.append(ChatMessage(role="user", content=prompt))
216
-
217
- for x in prompt["files"]:
218
- messages.append(ChatMessage(role="user", content={"path": x, "mime_type": "image/png"}))
219
- if prompt["text"] is not None:
220
- messages.append(ChatMessage(role="user", content=prompt['text']))
221
- if thought_process:
222
- messages.append(ChatMessage(role="assistant", content=thought_process,metadata={"title": "🧠 Thought Process"}))
223
- messages.append(ChatMessage(role="assistant", content=response))
224
-
225
- return messages, gr.MultimodalTextbox(value=None, interactive=True)
226
-
227
-
228
- def vote(data: gr.LikeData):
229
- if data.liked:
230
- print("You upvoted this response: " + data.value)
231
- else:
232
- print("You downvoted this response: " + data.value)
233
-
234
- with gr.Blocks(theme=gr.themes.Soft()) as iface:
235
-
236
- gr.Markdown(
237
- """
238
- <div style="font-size: 24px; font-weight: bold; color: #333;">
239
- DoJ Chatbot
240
- </div>
241
- <div style="font-size: 16px; color: #555;">
242
- Ask questions related to the Department of Justice.
243
- </div>
244
- """
245
- )
246
- chatbot = gr.Chatbot(type="messages",avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True) # Chatbot component to display conversation history
247
- query_input = gr.MultimodalTextbox(interactive=True,
248
- placeholder="Enter message or upload file...", show_label=False)
249
- submit_button = gr.Button("Send")
250
-
251
- submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
252
- query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot,query_input])
253
-
254
- chatbot.like(vote, None, None) # Adding like/dislike functionality to the chatbot
255
-
256
-
257
- iface.launch(
258
- show_error=True,
259
- prevent_thread_lock=True
260
  )
 
1
+ import PyPDF2
2
+ import re
3
+ from sentence_transformers import SentenceTransformer
4
+ import faiss
5
+ from langchain.agents import initialize_agent, AgentType,Tool
6
+ from langchain.schema import HumanMessage
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+ import gradio as gr
9
+ import os
10
+ import pytesseract
11
+ from PIL import Image
12
+ pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"
13
+
14
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
15
+
16
+ embeddings = model.encode(flattened_data)
17
+ index = faiss.read_index('IPC_index.faiss')
18
+ index2 = faiss.read_index('CrPC_index.faiss')
19
+
20
+
21
+ # Step 3: Retrieval with Citations using PDF filename
22
+ def retrieve_info_with_citation(query, top_k=5):
23
+ query_embedding = model.encode([query])
24
+ D, I = index.search(query_embedding, k=top_k)
25
+
26
+ results = []
27
+ for i in range(min(top_k, len(I[0]))):
28
+ if D[0][i] < 1.0: # Relevance threshold
29
+ chunk_index = I[0][i]
30
+ citation = f"Source: IPC"
31
+ results.append((match, citation))
32
+ else:
33
+ break
34
+
35
+ if results:
36
+ return results
37
+ else:
38
+ return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
39
+
40
+
41
+ def retrieve_info_with_citation2(query, top_k=5):
42
+ query_embedding = model.encode([query])
43
+ D, I = index2.search(query_embedding, k=top_k)
44
+
45
+ results = []
46
+ for i in range(min(top_k, len(I[0]))):
47
+ if D[0][i] < 1.0: # Relevance threshold
48
+ chunk_index = I[0][i]
49
+ citation = f"Source: CrPC"
50
+ results.append((match, citation))
51
+ else:
52
+ break
53
+
54
+ if results:
55
+ return results
56
+ else:
57
+ return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
58
+
59
+ def retrieve_info(query):
60
+ results = retrieve_info_with_citation(query)
61
+ formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
62
+ return formatted_results
63
+
64
+ def retrieve_info2(query):
65
+ results = retrieve_info_with_citation2(query)
66
+ formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
67
+ return formatted_results
68
+
69
+ ipc_tool = Tool(
70
+ name="IPC Information Retrieval",
71
+ func=retrieve_info,
72
+ description="Retrieve information from the Indian Penal Code Related to query keyword(s)."
73
+ )
74
+
75
+ crpc_tool=Tool(
76
+ name="CrPC Information Retrieval",
77
+ func=retrieve_info2,
78
+ description="Retrieve information from the Code of Criminal Procedure(CrPC) Related to query keyword(s)."
79
+ )
80
+ llm = ChatGoogleGenerativeAI(
81
+ model="gemini-1.5-pro",
82
+ temperature=0.25,
83
+ max_tokens=None,
84
+ timeout=None,
85
+ max_retries=2,
86
+ prompt_template="""
87
+ You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
88
+ Your primary task is to retrieve and summarize legal information accurately from the IPC.pdf document provided to you.
89
+ Your responses should be highly specific, fact-based, and free from any speculation or hallucinations.
90
+ Always cite the exact section from the IPC when providing an answer.
91
+ If the information is not available in the document, clearly state that and do not make any assumptions.
92
+
93
+ Example task: "What is the punishment for theft according to the IPC?"
94
+ Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
95
+
96
+ Task: {{query}}
97
+
98
+ Response:
99
+ """,
100
+ )
101
+
102
+ agent_tools = [ipc_tool,crpc_tool]
103
+
104
+ agent = initialize_agent(
105
+ tools=agent_tools,
106
+ llm=llm,
107
+ agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
108
+ verbose=True,
109
+ return_intermediate_steps=True,
110
+ handle_parsing_errors=True,
111
+ )
112
+ def encode_image_to_base64(image_path):
113
+ return pytesseract.image_to_string(Image.open(image_path))
114
+ def chatbot_response(query):
115
+ if query.get('files'):
116
+ # Encode image to base64
117
+ image_data=""
118
+ for x in range(len(query["files"])):
119
+ image_data += f"{x}. "+encode_image_to_base64(query["files"][x]) +"\n"
120
+
121
+ # Create a multimodal message with both text and image data
122
+ message = HumanMessage(
123
+ content=[
124
+ {"type": "text", "text": query['text'] +" System :Image(s) was added to this prompt by this user. Text Extracted from this image (Some words may be misspelled ,Use your understanding ):"+image_data}, # Add text input
125
+
126
+ ]
127
+ )
128
+ else:
129
+ # If no image, only pass the text
130
+ message = HumanMessage(content=[{"type": "text", "text": query}])
131
+
132
+ # Invoke the model with the multimodal message
133
+ result = agent.invoke([message])
134
+ response = result['output']
135
+ intermediate_steps = result.get('intermediate_steps', [])
136
+
137
+ thought_process = ""
138
+ for action, observation in intermediate_steps:
139
+ thought_process += f"**Thought:** {action.log}\n"
140
+ thought_process += f"**Action:** {action.tool}\n"
141
+ thought_process += f"**Observation:** {observation}\n\n"
142
+
143
+ return response, thought_process.strip()
144
+ # Step 5: Gradio Interface
145
+ from gradio import ChatMessage
146
+ def chatbot_interface(messages,prompt):
147
+ response, thought_process = chatbot_response(prompt)
148
+ #messages.append(ChatMessage(role="user", content=prompt))
149
+
150
+ for x in prompt["files"]:
151
+ messages.append(ChatMessage(role="user", content={"path": x, "mime_type": "image/png"}))
152
+ if prompt["text"] is not None:
153
+ messages.append(ChatMessage(role="user", content=prompt['text']))
154
+ if thought_process:
155
+ messages.append(ChatMessage(role="assistant", content=thought_process,metadata={"title": "🧠 Thought Process"}))
156
+ messages.append(ChatMessage(role="assistant", content=response))
157
+
158
+ return messages, gr.MultimodalTextbox(value=None, interactive=True)
159
+
160
+
161
+ def vote(data: gr.LikeData):
162
+ if data.liked:
163
+ print("You upvoted this response: " + data.value)
164
+ else:
165
+ print("You downvoted this response: " + data.value)
166
+
167
+ with gr.Blocks(theme=gr.themes.Soft()) as iface:
168
+
169
+ gr.Markdown(
170
+ """
171
+ <div style="font-size: 24px; font-weight: bold; color: #333;">
172
+ DoJ Chatbot
173
+ </div>
174
+ <div style="font-size: 16px; color: #555;">
175
+ Ask questions related to the Department of Justice.
176
+ </div>
177
+ """
178
+ )
179
+ chatbot = gr.Chatbot(type="messages",avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True) # Chatbot component to display conversation history
180
+ query_input = gr.MultimodalTextbox(interactive=True,
181
+ placeholder="Enter message or upload file...", show_label=False)
182
+ submit_button = gr.Button("Send")
183
+
184
+ submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
185
+ query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot,query_input])
186
+
187
+ chatbot.like(vote, None, None) # Adding like/dislike functionality to the chatbot
188
+
189
+
190
+ iface.launch(
191
+ show_error=True,
192
+ prevent_thread_lock=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  )