Spaces:

RoAr777
/

LS

Runtime error

App Files Files Community

RoAr777 committed on Sep 13, 2024

Commit

29fb422

verified ·

1 Parent(s): 1a524c4

Upload app.py

Browse files

Files changed (1) hide show

app.py +260 -0

app.py ADDED Viewed

	@@ -0,0 +1,260 @@

+import PyPDF2
+import re
+from sentence_transformers import SentenceTransformer
+import faiss
+from langchain.agents import initialize_agent, AgentType,Tool
+from langchain.schema import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+import gradio as gr
+import os
+import pytesseract
+from PIL import Image
+pytesseract.pytesseract.tesseract_cmd = r"tesseract.exe"
+def load_pdf_text(file_path):
+    with open(file_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+        return text
+def chunk_text(text, chunk_size=700):
+    # Splits the text into chunks of chunk_size while preserving sentences
+    chunks = []
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) > chunk_size:
+            chunks.append(current_chunk)
+            current_chunk = sentence
+        else:
+            current_chunk += " " + sentence
+    chunks.append(current_chunk)
+    return chunks
+def load_and_process_chapters(directory):
+    chapter_data = {}
+    for filename in os.listdir(directory):
+        if filename.endswith(".pdf"):
+            file_path = os.path.join(directory, filename)
+            text = load_pdf_text(file_path)
+            chunks = chunk_text(text)
+            chapter_data[filename] = chunks # Use filename as key
+    return chapter_data
+ipc_data = load_and_process_chapters("IPC")
+crpc_data=load_and_process_chapters("CrPC")
+# Step 2: Embeddings and Indexing
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
+index2 = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
+# Flatten the chapter data and keep track of chapter and chunk indices
+flattened_data = []
+pdf_filenames = [] # Store PDF filenames for citation
+chunk_indices = []
+for pdf_filename, chunks in ipc_data.items():
+    for i, chunk in enumerate(chunks):
+        flattened_data.append(chunk)
+        pdf_filenames.append(pdf_filename)
+        chunk_indices.append(i)
+embeddings = model.encode(flattened_data)
+index.add(embeddings)
+flattened_data2 = []
+pdf_filenames2 = [] # Store PDF filenames for citation
+chunk_indices2 = []
+for pdf_filename, chunks in crpc_data.items():
+    for i, chunk in enumerate(chunks):
+        flattened_data2.append(chunk)
+        pdf_filenames2.append(pdf_filename)
+        chunk_indices2.append(i)
+embeddings = model.encode(flattened_data2)
+index2.add(embeddings)
+# Step 3: Retrieval with Citations using PDF filename
+def retrieve_info_with_citation(query, top_k=5):
+    query_embedding = model.encode([query])
+    D, I = index.search(query_embedding, k=top_k)
+    results = []
+    for i in range(min(top_k, len(I[0]))):
+        if D[0][i] < 1.0:  # Relevance threshold
+            chunk_index = I[0][i]
+            pdf_filename = pdf_filenames[chunk_index]
+            chunk_number = chunk_indices[chunk_index] + 1
+            match = flattened_data[chunk_index]
+            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
+            results.append((match, citation))
+        else:
+            break
+    if results:
+        return results
+    else:
+        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
+def retrieve_info_with_citation2(query, top_k=5):
+    query_embedding = model.encode([query])
+    D, I = index2.search(query_embedding, k=top_k)
+    results = []
+    for i in range(min(top_k, len(I[0]))):
+        if D[0][i] < 1.0:  # Relevance threshold
+            chunk_index = I[0][i]
+            pdf_filename = pdf_filenames2[chunk_index]
+            chunk_number = chunk_indices2[chunk_index] + 1
+            match = flattened_data2[chunk_index]
+            citation = f"Source: {pdf_filename}, Chunk: {chunk_number}"
+            results.append((match, citation))
+        else:
+            break
+    if results:
+        return results
+    else:
+        return [("I'm sorry, I couldn't find relevant information.", "Source: N/A")]
+def retrieve_info(query):
+    results = retrieve_info_with_citation(query)
+    formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
+    return formatted_results
+def retrieve_info2(query):
+    results = retrieve_info_with_citation2(query)
+    formatted_results = "\n\n".join([f"{i+1}. {match}\n{citation}" for i, (match, citation) in enumerate(results)])
+    return formatted_results
+ipc_tool = Tool(
+    name="IPC Information Retrieval",
+    func=retrieve_info,
+    description="Retrieve information from the Indian Penal Code Related to query keyword(s)."
+)
+crpc_tool=Tool(
+    name="CrPC Information Retrieval",
+    func=retrieve_info2,
+    description="Retrieve information from the Code of Criminal Procedure(CrPC) Related to query keyword(s)."
+)
+llm = ChatGoogleGenerativeAI(
+    model="gemini-1.5-pro",
+    temperature=0.25,
+    max_tokens=None,
+    timeout=None,
+    max_retries=2,
+    prompt_template="""
+    You are a highly specialized legal assistant with deep knowledge of the Indian Penal Code (IPC).
+    Your primary task is to retrieve and summarize legal information accurately from the IPC.pdf document provided to you.
+    Your responses should be highly specific, fact-based, and free from any speculation or hallucinations.
+    Always cite the exact section from the IPC when providing an answer.
+    If the information is not available in the document, clearly state that and do not make any assumptions.
+    Example task: "What is the punishment for theft according to the IPC?"
+    Example response: "According to Section 379 of the IPC, the punishment for theft is imprisonment of either description for a term which may extend to three years, or with fine, or with both."
+    Task: {{query}}
+    Response:
+    """,
+)
+agent_tools = [ipc_tool,crpc_tool]
+agent = initialize_agent(
+    tools=agent_tools,
+    llm=llm,
+    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
+    verbose=True,
+    return_intermediate_steps=True,
+    handle_parsing_errors=True,
+)
+def encode_image_to_base64(image_path):
+    return pytesseract.image_to_string(Image.open(image_path))
+def chatbot_response(query):
+    if query.get('files'):
+        # Encode image to base64
+        image_data=""
+        for x in range(len(query["files"])):
+            image_data += f"{x}. "+encode_image_to_base64(query["files"][x]) +"\n"
+        # Create a multimodal message with both text and image data
+        message = HumanMessage(
+            content=[
+                {"type": "text", "text": query['text'] +" System :Image(s) was added to this prompt by this user. Text Extracted from this image (Some words may be misspelled ,Use your understanding ):"+image_data},  # Add text input
+            ]
+        )
+    else:
+        # If no image, only pass the text
+        message = HumanMessage(content=[{"type": "text", "text": query}])
+    # Invoke the model with the multimodal message
+    result = agent.invoke([message])
+    response = result['output']
+    intermediate_steps = result.get('intermediate_steps', [])
+    thought_process = ""
+    for action, observation in intermediate_steps:
+        thought_process += f"**Thought:** {action.log}\n"
+        thought_process += f"**Action:** {action.tool}\n"
+        thought_process += f"**Observation:** {observation}\n\n"
+    return response, thought_process.strip()
+# Step 5: Gradio Interface
+from gradio import ChatMessage
+def chatbot_interface(messages,prompt):
+    response, thought_process = chatbot_response(prompt)
+    #messages.append(ChatMessage(role="user", content=prompt))
+    for x in prompt["files"]:
+            messages.append(ChatMessage(role="user", content={"path": x, "mime_type": "image/png"}))
+    if prompt["text"] is not None:
+            messages.append(ChatMessage(role="user", content=prompt['text']))
+    if thought_process:
+        messages.append(ChatMessage(role="assistant", content=thought_process,metadata={"title": "🧠 Thought Process"}))
+    messages.append(ChatMessage(role="assistant", content=response))
+    return messages,  gr.MultimodalTextbox(value=None, interactive=True)
+def vote(data: gr.LikeData):
+    if data.liked:
+        print("You upvoted this response: " + data.value)
+    else:
+        print("You downvoted this response: " + data.value)
+# Build the page: a header, the chat history, a multimodal input box and a
+# Send button. Components are created in the order they are declared below.
+with gr.Blocks(theme=gr.themes.Soft()) as iface:
+            # Static header rendered from inline HTML.
+            gr.Markdown(
+                """
+                <div style="font-size: 24px; font-weight: bold; color: #333;">
+                    DoJ Chatbot
+                </div>
+                <div style="font-size: 16px; color: #555;">
+                    Ask questions related to the Department of Justice.
+                </div>
+                """
+            )
+            chatbot = gr.Chatbot(type="messages",avatar_images=("user.jpeg", "logo.jpeg"), bubble_full_width=True)  # Chatbot component to display conversation history
+            query_input = gr.MultimodalTextbox(interactive=True,
+                                      placeholder="Enter message or upload file...", show_label=False)
+            submit_button = gr.Button("Send")
+            # Both the button and the textbox's own submit event call the same
+            # handler; it receives (history, input) and returns the new
+            # history plus a replacement for the input box.
+            submit_button.click(chatbot_interface, [chatbot, query_input], [chatbot, query_input])
+            query_input.submit(chatbot_interface, [chatbot, query_input], [chatbot,query_input])
+            chatbot.like(vote, None, None)  # Adding like/dislike functionality to the chatbot
+# NOTE(review): prevent_thread_lock=True presumably makes launch() return
+# instead of blocking; if this file is run as a plain script, confirm the
+# process stays alive after this call.
+iface.launch(
+    show_error=True,
+    prevent_thread_lock=True
+)