File size: 4,130 Bytes
9453eac
5c3f634
4abc449
87a1c7f
 
 
 
 
 
 
 
 
9453eac
fd5f89e
c5a772e
 
fd5f89e
54a29b3
c5a772e
 
6ecc4f4
 
c5a772e
fd5f89e
768d260
87a1c7f
 
c5a772e
87a1c7f
 
 
 
 
 
 
 
 
 
 
 
3209503
6ecc4f4
87a1c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57fa964
12a2f23
87a1c7f
c5a772e
2711484
 
12a2f23
fd5f89e
 
87a1c7f
 
fd5f89e
12a2f23
87a1c7f
c5a772e
87a1c7f
2711484
87a1c7f
c5a772e
9453eac
 
fd5f89e
 
 
279ab91
fd5f89e
 
28a9f71
57fa964
2711484
12a2f23
 
57fa964
9453eac
 
4abc449
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import easyocr
import numpy as np

from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from transformers import pipeline as hf_pipeline

# 1. OCR Processor (English)
class OCRProcessor:
    """Thin wrapper around an EasyOCR reader configured for English."""

    def __init__(self):
        # Construct the reader once; model loading is expensive, so the
        # instance is meant to be shared across requests.
        self.reader = easyocr.Reader(['en'])

    def extract_text(self, image: np.ndarray) -> str:
        """Return the text recognized in *image*, paragraphs joined by newlines.

        On any OCR failure the error is reported in-band as a string
        beginning with "OCR error:" rather than raised, so UI callers can
        surface it directly.
        """
        try:
            paragraphs = self.reader.readtext(image, detail=0, paragraph=True)
        except Exception as exc:
            return f"OCR error: {str(exc)}"
        if not paragraphs:
            return ""
        return "\n".join(paragraphs)

# 2. LangChain-based DocQA Agent
class LangChainDocQAAgent:
    """Retrieval-augmented extractive QA over plain text.

    Splits the input text into overlapping chunks, indexes them in an
    in-memory FAISS store, retrieves the chunk most relevant to the
    question, and runs an extractive QA model over that chunk.
    """

    def __init__(self):
        # Sentence-embedding model used to vectorize text chunks.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Chunk with overlap so answers spanning a chunk boundary survive.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # BUG FIX: the original wrapped a "question-answering" pipeline in
        # langchain's HuggingFacePipeline, but that wrapper only accepts the
        # text-generation / text2text-generation / summarization tasks and
        # raises ValueError for anything else, so the agent could never be
        # constructed. Use the transformers extractive-QA pipeline directly.
        self.qa_pipeline = hf_pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            tokenizer="deepset/roberta-base-squad2",
        )

    def prepare_retriever(self, text):
        """Build a FAISS retriever over *text*.

        Returns (retriever, docs) where docs are the chunked
        LangChain Document objects that were indexed.
        """
        docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
        # In-memory FAISS vectorstore for similarity retrieval.
        vectorstore = FAISS.from_documents(docs, self.embeddings)
        return vectorstore.as_retriever(), docs

    def qa(self, text, question):
        """Answer *question* from *text*.

        Returns (relevant_context, answer): the retrieved chunk the model
        read, and the extracted answer span (both "" when nothing usable
        was found).
        """
        if not text.strip() or not question.strip():
            return "No text or question provided.", ""
        retriever, docs = self.prepare_retriever(text)
        # Retrieve the chunk most similar to the question.
        relevant_docs = retriever.invoke(question)
        relevant_context = relevant_docs[0].page_content if relevant_docs else ""
        if not relevant_context:
            return "", ""
        # Extractive QA: the model selects an answer span inside the chunk.
        result = self.qa_pipeline(question=question, context=relevant_context)
        answer = result.get("answer", "") if isinstance(result, dict) else ""
        return relevant_context, answer

# Module-level singletons: the EasyOCR model and the QA agent are loaded
# once at import time so every Gradio callback reuses the same instances.
ocr_processor = OCRProcessor()
docqa_agent = LangChainDocQAAgent()

def docqa_pipeline(image, question):
    """Run OCR on *image*, then retrieval-QA with *question*.

    Returns (extracted_text, formatted_result). When OCR fails, the
    in-band "OCR error..." string is passed through with "No answer.".
    """
    # Step 1: OCR the image into plain text.
    extracted = ocr_processor.extract_text(image)
    if extracted.startswith("OCR error"):
        return extracted, "No answer."
    # Step 2: retrieve the relevant chunk and answer the question.
    chunk, model_answer = docqa_agent.qa(extracted, question)
    formatted = "Relevant chunk:\n{}\n\nModel answer:\n{}".format(chunk, model_answer)
    return extracted, formatted

# Gradio UI: image + question inputs on the left, extracted text and the
# model's answer on the right.
with gr.Blocks(title="DocQA Agent (LangChain): Intelligent Q&A from Extracted English Document") as app:
    gr.Markdown("""
    # DocQA Agent (LangChain)
    <br>
    A multi-agent system for question answering from English documents (OCR + retrieval + intelligent answer with LangChain)
    """)
    with gr.Row():
        with gr.Column():
            # Left column: inputs and the trigger button.
            img_input = gr.Image(label="Input Image", type="numpy")
            question_input = gr.Textbox(label="Your question (in English)", placeholder="e.g. Who is the author of this text?", lines=1)
            process_btn = gr.Button("Get Answer")
        with gr.Column():
            # Right column: read-only outputs.
            context_output = gr.Textbox(label="Extracted Text", lines=10, max_lines=None, interactive=False)
            answer_output = gr.Textbox(label="Model Output (Relevant Chunk & Answer)", lines=10, max_lines=None, interactive=False)

    # Wire the button to the OCR + QA pipeline.
    process_btn.click(
        fn=docqa_pipeline,
        inputs=[img_input, question_input],
        outputs=[context_output, answer_output]
    )

# Launch the web server only when executed as a script, not on import.
if __name__ == "__main__":
    app.launch()