RoAr777 committed
Commit 29fb422 · verified · 1 Parent(s): 1a524c4

Upload app.py

Files changed (1)
app.py +260 -0
app.py ADDED
@@ -0,0 +1,260 @@
import PyPDF2
import re
from sentence_transformers import SentenceTransformer
import faiss
from langchain.agents import initialize_agent, AgentType, Tool
from langchain_google_genai import ChatGoogleGenerativeAI
import gradio as gr
from gradio import ChatMessage
import os
import pytesseract
from PIL import Image

# Point pytesseract at the local Tesseract binary (adjust this path for your environment).
pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"


def load_pdf_text(file_path):
    """Extract the raw text from every page of a PDF file."""
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=700):
    """Split text into chunks of roughly chunk_size characters while preserving sentences."""
    chunks = []
    sentences = re.split(r'(?<=[.!?])\s+', text)
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > chunk_size:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    chunks.append(current_chunk)
    return chunks


def load_and_process_chapters(directory):
    """Load every PDF in a directory and return a {filename: [chunks]} mapping."""
    chapter_data = {}
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)
            text = load_pdf_text(file_path)
            chunks = chunk_text(text)
            chapter_data[filename] = chunks  # Use the filename as the key
    return chapter_data


ipc_data = load_and_process_chapters("IPC")
crpc_data = load_and_process_chapters("CrPC")

# Step 2: Embeddings and indexing
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
index2 = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

# Flatten the chapter data and keep track of the source file and chunk index
flattened_data = []
pdf_filenames = []  # Store PDF filenames for citation
chunk_indices = []
for pdf_filename, chunks in ipc_data.items():
    for i, chunk in enumerate(chunks):
        flattened_data.append(chunk)
        pdf_filenames.append(pdf_filename)
        chunk_indices.append(i)

embeddings = model.encode(flattened_data)
index.add(embeddings)

flattened_data2 = []
pdf_filenames2 = []  # Store PDF filenames for citation
chunk_indices2 = []
for pdf_filename, chunks in crpc_data.items():
    for i, chunk in enumerate(chunks):
        flattened_data2.append(chunk)
        pdf_filenames2.append(pdf_filename)
        chunk_indices2.append(i)

embeddings2 = model.encode(flattened_data2)
index2.add(embeddings2)

# Step 3: Retrieval with citations using the PDF filename
def retrieve_info_with_citation(query, top_k=5):
    """Search the IPC index and return (chunk, citation) pairs for the query."""
    query_embedding = model.encode([query])
    D, I = index.search(query_embedding, k=top_k)

    results = []
    for i in range(min(top_k, len(I[0]))):
        if D[0][i] < 1.0:  # Relevance threshold on L2 distance
            chunk_index = I[0][i]
            pdf_filename = pdf_filenames[chunk_index]
            chunk_number = chunk_indices[chunk_index] + 1
            match = flattened_data[chunk_index]
            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
            results.append((match, citation))
        else:
            break

    if results:
        return results
    else:
        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]


def retrieve_info_with_citation2(query, top_k=5):
    """Search the CrPC index and return (chunk, citation) pairs for the query."""
    query_embedding = model.encode([query])
    D, I = index2.search(query_embedding, k=top_k)

    results = []
    for i in range(min(top_k, len(I[0]))):
        if D[0][i] < 1.0:  # Relevance threshold on L2 distance
            chunk_index = I[0][i]
            pdf_filename = pdf_filenames2[chunk_index]
            chunk_number = chunk_indices2[chunk_index] + 1
            match = flattened_data2[chunk_index]
            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
            results.append((match, citation))
        else:
            break

    if results:
        return results
    else:
        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]


def retrieve_info(query):
    """Format IPC retrieval results as a numbered list with citations."""
    results = retrieve_info_with_citation(query)
    return "\n\n".join(f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results))


def retrieve_info2(query):
    """Format CrPC retrieval results as a numbered list with citations."""
    results = retrieve_info_with_citation2(query)
    return "\n\n".join(f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results))


ipc_tool = Tool(
    name="IPC Information Retrieval",
    func=retrieve_info,
    description="Retrieve information from the Indian Penal Code (IPC) related to the query keyword(s)."
)

crpc_tool = Tool(
    name="CrPC Information Retrieval",
    func=retrieve_info2,
    description="Retrieve information from the Code of Criminal Procedure (CrPC) related to the query keyword(s)."
)

SYSTEM_PROMPT = """
You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
Your primary task is to retrieve and summarize legal information accurately from the IPC documents provided to you.
Your responses should be highly specific, fact-based, and free from any speculation or hallucination.
Always cite the exact section of the IPC when providing an answer.
If the information is not available in the documents, clearly state that and do not make any assumptions.

Example task: "What is the punishment for theft according to the IPC?"
Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
"""

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0.25,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

agent_tools = [ipc_tool, crpc_tool]

agent = initialize_agent(
    tools=agent_tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    return_intermediate_steps=True,
    handle_parsing_errors=True,
    # ChatGoogleGenerativeAI does not accept a prompt_template argument, so the system
    # instructions above are passed to the agent as its prompt prefix instead.
    agent_kwargs={"prefix": SYSTEM_PROMPT},
)

def extract_text_from_image(image_path):
    """Run OCR on an image and return the extracted text."""
    return pytesseract.image_to_string(Image.open(image_path))


def chatbot_response(query):
    """Run the agent on a MultimodalTextbox value of the form {"text": str, "files": [paths]}."""
    user_text = query["text"] or ""
    if query.get("files"):
        # OCR every attached image and append the extracted text to the prompt
        image_data = ""
        for x, file_path in enumerate(query["files"]):
            image_data += f"{x}. " + extract_text_from_image(file_path) + "\n"
        user_text += (
            " System: Image(s) were added to this prompt by the user. "
            "Text extracted from the image(s) (some words may be misspelled; use your understanding): "
            + image_data
        )

    # Invoke the agent with the combined text
    result = agent.invoke({"input": user_text})
    response = result['output']
    intermediate_steps = result.get('intermediate_steps', [])

    thought_process = ""
    for action, observation in intermediate_steps:
        thought_process += f"**Thought:** {action.log}\n"
        thought_process += f"**Action:** {action.tool}\n"
        thought_process += f"**Observation:** {observation}\n\n"

    return response, thought_process.strip()

# Step 5: Gradio interface
def chatbot_interface(messages, prompt):
    """Append the user's turn and the assistant's thought process and answer to the chat history."""
    response, thought_process = chatbot_response(prompt)

    for file_path in prompt["files"]:
        messages.append(ChatMessage(role="user", content={"path": file_path, "mime_type": "image/png"}))
    if prompt["text"] is not None:
        messages.append(ChatMessage(role="user", content=prompt["text"]))
    if thought_process:
        messages.append(ChatMessage(role="assistant", content=thought_process, metadata={"title": "🧠 Thought Process"}))
    messages.append(ChatMessage(role="assistant", content=response))

    # Clear the input box and keep it interactive for the next message
    return messages, gr.MultimodalTextbox(value=None, interactive=True)


def vote(data: gr.LikeData):
    """Log like/dislike feedback on a chatbot response."""
    if data.liked:
        print(f"You upvoted this response: {data.value}")
    else:
        print(f"You downvoted this response: {data.value}")


with gr.Blocks(theme=gr.themes.Soft()) as iface:
    gr.Markdown(
        """
        <div style="font-size: 24px; font-weight: bold; color: #333;">
        DoJ Chatbot
        </div>
        <div style="font-size: 16px; color: #555;">
        Ask questions related to the Department of Justice.
        </div>
        """
    )
    chatbot = gr.Chatbot(type="messages", avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True)  # Conversation history
    query_input = gr.MultimodalTextbox(interactive=True,
                                       placeholder="Enter message or upload file...", show_label=False)
    submit_button = gr.Button("Send")

    submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
    query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot, query_input])

    chatbot.like(vote, None, None)  # Like/dislike feedback on responses


iface.launch(
    show_error=True,
    prevent_thread_lock=True
)