# PDF-Insight-PRO / preprocessing.py
import os
from langchain_community.document_loaders import PyMuPDFLoader
import faiss
from langchain_groq import ChatGroq
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory
from sentence_transformers import SentenceTransformer
import dotenv
from langchain.tools import tool
import traceback
dotenv.load_dotenv()
# Initialize LLM and tools globally
def model_selection(model_name):
    """Return a ChatGroq LLM instance for the given model name."""
    llm = ChatGroq(model=model_name, api_key=os.getenv("GROQ_API_KEY"))
    return llm


tools = [TavilySearchResults(max_results=5)]

# Initialize memory for conversation history
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


def estimate_tokens(text):
    """Estimate the number of tokens in a text (rough approximation)."""
    return len(text) // 4
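# The heuristic above assumes roughly 4 characters per token for English text, so a
# 1000-character string is estimated at ~250 tokens, e.g. estimate_tokens("a" * 1000) == 250.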


def process_pdf_file(file_path):
    """Load a PDF file and extract its text with metadata."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    return documents  # Return list of Document objects with metadata


def chunk_text(documents, max_length=1000):
    """Split documents into chunks of roughly `max_length` characters, preserving metadata."""
    chunks = []
    for doc in documents:
        text = doc.page_content
        metadata = doc.metadata
        paragraphs = text.split("\n\n")
        current_chunk = ""
        current_metadata = metadata.copy()
        for paragraph in paragraphs:
            # estimate_tokens works in tokens (~4 chars each), so compare against max_length // 4
            if estimate_tokens(current_chunk + paragraph) <= max_length // 4:
                current_chunk += paragraph + "\n\n"
            else:
                if current_chunk.strip():  # Avoid emitting an empty chunk when a single paragraph exceeds the budget
                    chunks.append({"text": current_chunk.strip(), "metadata": current_metadata})
                current_chunk = paragraph + "\n\n"
        if current_chunk.strip():
            chunks.append({"text": current_chunk.strip(), "metadata": current_metadata})
    return chunks
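# Example of the chunk structure produced above (a minimal sketch; the file name and
# metadata keys are illustrative, not taken from a real run):
#   docs = process_pdf_file("example.pdf")
#   chunks = chunk_text(docs, max_length=1000)
#   # -> [{"text": "First ~1000 characters of page 1 ...",
#   #      "metadata": {"source": "example.pdf", "page": 0, ...}}, ...]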


def create_embeddings(chunks, model):
    """Create embeddings for a list of chunk texts."""
    texts = [chunk["text"] for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
    return embeddings.cpu().numpy(), chunks


def build_faiss_index(embeddings):
    """Build a FAISS HNSW index from embeddings for similarity search."""
    dim = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dim, 32)  # 32 = number of neighbors in the HNSW graph
    index.hnsw.efConstruction = 200  # Higher = better quality, slower build
    index.hnsw.efSearch = 50  # Higher = better accuracy, slower search
    index.add(embeddings)
    return index
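# Example wiring of the embedding and indexing steps (a sketch; the PDF path is a
# placeholder and the model name mirrors the commented-out demo at the bottom of this file):
#   model = SentenceTransformer("all-MiniLM-L6-v2")
#   chunks = chunk_text(process_pdf_file("document.pdf"))
#   embeddings, chunks = create_embeddings(chunks, model)  # note: returns an (embeddings, chunks) tuple
#   index = build_faiss_index(embeddings)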


def retrieve_similar_chunks(query, index, chunks_with_metadata, embedding_model, k=10, max_chunk_length=1000):
    """Retrieve the top k chunks most similar to the query from the FAISS index."""
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
    distances, indices = index.search(query_embedding, k)
    # Keep only indices within bounds of chunks_with_metadata (FAISS returns -1 when fewer
    # than k neighbours exist), pairing each chunk with the distance at the same position.
    return [
        (chunks_with_metadata[i]["text"][:max_chunk_length], distances[0][j], chunks_with_metadata[i]["metadata"])
        for j, i in enumerate(indices[0])
        if 0 <= i < len(chunks_with_metadata)
    ]
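# Each item returned above is a (chunk_text, distance, metadata) tuple; FAISS reports
# neighbours in ascending distance, so lower values mean closer matches. A hypothetical call:
#   results = retrieve_similar_chunks("What is the refund policy?", index, chunks, model, k=3)
#   for text, dist, meta in results:
#       print(round(float(dist), 3), meta.get("page"), text[:80])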


def create_vector_search_tool(faiss_index, document_chunks_with_metadata, embedding_model, k=3, max_chunk_length=1000):
    @tool
    def vector_database_search(query: str) -> str:
        """
        Searches the currently uploaded PDF document for information semantically similar to the query.
        Use this tool when the user's question is likely answerable from the content of the specific document they provided.
        Input should be the search query.
        """
        # Retrieve similar chunks using the provided session-specific components
        similar_chunks_data = retrieve_similar_chunks(
            query,
            faiss_index,
            document_chunks_with_metadata,  # List of dicts: {"text": ..., "metadata": ...}
            embedding_model,
            k=k,
            max_chunk_length=max_chunk_length,
        )
        # Format the response
        if not similar_chunks_data:
            return "No relevant information found in the document for that query."
        context = "\n\n---\n\n".join([chunk_text for chunk_text, _, _ in similar_chunks_data])
        return f"The following information was found in the document regarding '{query}':\n{context}"

    return vector_database_search
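# How the tool factory is intended to be combined with the global web-search tool
# (a sketch; the variable names here are illustrative):
#   doc_tool = create_vector_search_tool(index, chunks, model, k=3)
#   agent_tools = [doc_tool] + tools  # 'tools' holds TavilySearchResults for web search
#   response = agentic_rag(llm, agent_tools, query, similar_chunks, memory, Use_Tavily=True)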


def agentic_rag(llm, agent_specific_tools, query, context_chunks, memory, Use_Tavily=False):  # Renamed 'tools' to 'agent_specific_tools'
    # Sort chunks by relevance (lower distance = more relevant)
    context_chunks = sorted(context_chunks, key=lambda x: x[1]) if context_chunks else []
    context = ""
    total_tokens = 0
    max_tokens = 7000  # Leave room for prompt and response
    for chunk, _, _ in context_chunks:
        chunk_tokens = estimate_tokens(chunk)
        if total_tokens + chunk_tokens <= max_tokens:
            context += chunk + "\n\n"
            total_tokens += chunk_tokens
        else:
            break
    context = context.strip() if context else "No initial context provided from preliminary search."
    # Dynamically build the tool guidance for the prompt
    # Tool names: 'vector_database_search', 'tavily_search_results_json'
    has_document_search = any(t.name == "vector_database_search" for t in agent_specific_tools)
    has_web_search = any(t.name == "tavily_search_results_json" for t in agent_specific_tools)
    guidance_parts = []
    if has_document_search:
        guidance_parts.append(
            "If the direct context (if any from the preliminary search) is insufficient and the question seems answerable from the uploaded document, "
            "use the 'vector_database_search' tool to find relevant information within the document."
        )
    if has_web_search:  # The Tavily tool is only in agent_specific_tools if Use_Tavily was true
        guidance_parts.append(
            "If the information is not found in the document (after using 'vector_database_search' if appropriate) "
            "or the question is of a general nature not specific to the document, "
            "use the 'tavily_search_results_json' tool for web searches."
        )
    if not guidance_parts:
        search_behavior_instructions = "If the context is insufficient, you *must* state that you don't know."
    else:
        search_behavior_instructions = " ".join(guidance_parts)
        search_behavior_instructions += ("\n * If, after all steps and tool use (if any), you cannot find an answer, "
                                         "respond with: \"Based on the available information, I don't know the answer.\"")
    prompt = ChatPromptTemplate.from_messages([
        ("system", f"""
        You are an expert Q&A system. Your primary function is to answer questions using a given set of documents (Context) and available tools.

        **Your Process:**
        1. **Analyze the Question:** Understand exactly what the user is asking.
        2. **Scan the Context:** Thoroughly review the 'Context' provided (if any) to find relevant information. This context is derived from a preliminary similarity search in the document.
        3. **Formulate the Answer:**
            * If the initially provided context contains a clear answer, synthesize it into a concise response. Start your answer with "Based on the Document, ...".
            * {search_behavior_instructions}
            * When using the 'vector_database_search' tool, the information comes from the document. Prepend your answer with "Based on the Document, ...".
            * When using the 'tavily_search_results_json' tool, the information comes from the web. Prepend your answer with "According to a web search, ...". If no useful information is found, state that.
        4. **Clarity:** Ensure your final answer is clear, direct, and avoids jargon if possible.

        **Important Rules:**
        * **Stick to Sources:** Do *not* use any information outside of the provided 'Context', document search results ('vector_database_search'), or web search results ('tavily_search_results_json').
        * **No Speculation:** Do not make assumptions or infer information not explicitly present.
        * **Cite Sources (If Web Searching):** If you use the 'tavily_search_results_json' tool and it provides source links, you MUST include them in your response.
        """),
        # Single braces so ChatPromptTemplate treats 'context' and 'input' as template variables
        # (this message is a plain string, not part of the f-string system message above)
        ("human", "Context: {context}\n\nQuestion: {input}"),
        MessagesPlaceholder(variable_name="chat_history"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    try:
        agent = create_tool_calling_agent(llm, agent_specific_tools, prompt)
        agent_executor = AgentExecutor(agent=agent, tools=agent_specific_tools, memory=memory, verbose=True)
        response_payload = agent_executor.invoke({
            "input": query,
            "context": context,
        })
        return response_payload  # Expecting a dict like {'output': '...'}
    except Exception as e:
        print(f"Error during agent execution: {str(e)} \nTraceback: {traceback.format_exc()}")
        # Fall back to a plain LLM call over the retrieved context if the agent fails
        fallback_prompt_template = ChatPromptTemplate.from_messages([
            ("system", "You are a helpful assistant. Use the provided context to answer the user's question. If the context is insufficient, say you don't know."),
            ("human", "Context: {context}\n\nQuestion: {input}"),
        ])
        # Format the prompt with the actual context and query
        formatted_fallback_prompt = fallback_prompt_template.format_prompt(context=context, input=query).to_messages()
        response = llm.invoke(formatted_fallback_prompt)
        return {"output": response.content if hasattr(response, "content") else str(response)}
"""if __name__ == "__main__":
# Process PDF and prepare index
dotenv.load_dotenv()
pdf_file = "JatinCV.pdf"
llm = model_selection("meta-llama/llama-4-scout-17b-16e-instruct")
texts = process_pdf_file(pdf_file)
chunks = chunk_text(texts, max_length=1500)
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = create_embeddings(chunks, model)
index = build_faiss_index(embeddings)
# Chat loop
print("Chat with the assistant (type 'exit' or 'quit' to stop):")
while True:
query = input("User: ")
if query.lower() in ["exit", "quit"]:
break
# Retrieve similar chunks
similar_chunks = retrieve_similar_chunks(query, index, chunks, model, k=3)
# context = "\n".join([chunk for chunk, _ in similar_chunks])
# Generate response
response = agentic_rag(llm, tools, query=query, context=similar_chunks, Use_Tavily=True, memory=memory)
print("Assistant:", response["output"])"""