import gradio as gr
import easyocr
import numpy as np
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline as hf_pipeline


# 1. OCR Processor (English)
class OCRProcessor:
    def __init__(self):
        self.reader = easyocr.Reader(['en'])

    def extract_text(self, image: np.ndarray) -> str:
        try:
            results = self.reader.readtext(image, detail=0, paragraph=True)
            return "\n".join(results) if results else ""
        except Exception as e:
            return f"OCR error: {str(e)}"


# 2. LangChain-based DocQA Agent
class LangChainDocQAAgent:
    def __init__(self):
        # Embedding model for chunk retrieval
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Text splitter (chunk size and overlap chosen for better retrieval)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # LLM used to answer from the retrieved context. Note: LangChain's
        # HuggingFacePipeline wrapper only accepts generation-style tasks
        # ("text-generation", "text2text-generation", "summarization"), not
        # extractive "question-answering" pipelines, so a seq2seq model is
        # used here in place of deepset/roberta-base-squad2.
        self.qa_llm = HuggingFacePipeline(
            pipeline=hf_pipeline(
                "text2text-generation",
                model="google/flan-t5-base",
                tokenizer="google/flan-t5-base",
                max_new_tokens=128,
            )
        )

    def prepare_retriever(self, text):
        # Split text into LangChain Document objects
        docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
        # Create a FAISS vectorstore for retrieval; k=2 keeps the stuffed
        # context within the model's input limit
        vectorstore = FAISS.from_documents(docs, self.embeddings)
        return vectorstore.as_retriever(search_kwargs={"k": 2}), docs

    def qa(self, text, question):
        if not text.strip() or not question.strip():
            return "No text or question provided.", ""
        # Build retriever from the extracted text
        retriever, docs = self.prepare_retriever(text)
        # RetrievalQA chain: retrieve the relevant chunks and answer
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.qa_llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
        )
        result = qa_chain.invoke({"query": question})
        answer = result["result"]
        # Show the most relevant chunk as context
        relevant_context = result["source_documents"][0].page_content if result["source_documents"] else ""
        return relevant_context, answer


ocr_processor = OCRProcessor()
docqa_agent = LangChainDocQAAgent()


def docqa_pipeline(image, question):
    # 1. OCR
    context = ocr_processor.extract_text(image)
    if context.startswith("OCR error"):
        return context, "No answer."
    # 2. LangChain RetrievalQA
    relevant_chunk, answer = docqa_agent.qa(context, question)
    return context, f"Relevant chunk:\n{relevant_chunk}\n\nModel answer:\n{answer}"


with gr.Blocks(title="DocQA Agent (LangChain): Intelligent Q&A from an Extracted English Document") as app:
    gr.Markdown("""
# DocQA Agent (LangChain)
A document question-answering pipeline for English documents: OCR text extraction + retrieval + answer generation with LangChain.
""")
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(label="Input Image", type="numpy")
            question_input = gr.Textbox(
                label="Your question (in English)",
                placeholder="e.g. Who is the author of this text?",
                lines=1,
            )
            process_btn = gr.Button("Get Answer")
        with gr.Column():
            context_output = gr.Textbox(
                label="Extracted Text",
                lines=10,
                max_lines=None,
                interactive=False,
            )
            answer_output = gr.Textbox(
                label="Model Output (Relevant Chunk & Answer)",
                lines=10,
                max_lines=None,
                interactive=False,
            )

    process_btn.click(
        fn=docqa_pipeline,
        inputs=[img_input, question_input],
        outputs=[context_output, answer_output],
    )

if __name__ == "__main__":
    app.launch()
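
# ---------------------------------------------------------------------------
# Environment note (an assumption inferred from the imports above; the
# original script does not pin its dependencies): running the app needs
# roughly
#
#   pip install gradio easyocr langchain langchain-community faiss-cpu \
#       sentence-transformers transformers torch
#
# The first launch also downloads the EasyOCR English weights and the
# HuggingFace embedding/LLM checkpoints.
#
# Minimal programmatic usage sketch (the file name "sample_doc.png" is
# hypothetical, not part of the original script); it exercises the same
# pipeline the Gradio button calls, without launching the UI:
#
#   from PIL import Image
#   page = np.array(Image.open("sample_doc.png").convert("RGB"))
#   extracted_text, answer = docqa_pipeline(page, "Who is the author of this text?")
#   print(extracted_text)
#   print(answer)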