Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -26,43 +26,33 @@ class OCRProcessor:
|
|
26 |
# 2. LangChain-based DocQA Agent
|
27 |
class LangChainDocQAAgent:
|
28 |
def __init__(self):
|
29 |
-
# Embedding model
|
30 |
self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
31 |
-
# Text splitter (chunk size and overlap for better retrieval)
|
32 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
33 |
-
|
34 |
-
self.
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
tokenizer="deepset/roberta-base-squad2"
|
39 |
-
),
|
40 |
-
model_kwargs={"return_full_text": False}
|
41 |
)
|
42 |
|
43 |
def prepare_retriever(self, text):
|
44 |
-
# Split text into LangChain Document objects
|
45 |
docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
|
46 |
-
# Create FAISS vectorstore for retrieval
|
47 |
vectorstore = FAISS.from_documents(docs, self.embeddings)
|
48 |
return vectorstore.as_retriever(), docs
|
49 |
|
50 |
def qa(self, text, question):
|
51 |
if not text.strip() or not question.strip():
|
52 |
return "No text or question provided.", ""
|
53 |
-
# Build retriever from text
|
54 |
retriever, docs = self.prepare_retriever(text)
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
# Show the most relevant chunk as context
|
65 |
-
relevant_context = result["source_documents"][0].page_content if result["source_documents"] else ""
|
66 |
return relevant_context, answer
|
67 |
|
68 |
ocr_processor = OCRProcessor()
|
|
|
# 2. LangChain-based DocQA Agent
class LangChainDocQAAgent:
    """Retrieval-augmented document QA.

    Splits input text into overlapping chunks, indexes them in a FAISS
    vectorstore, retrieves the chunk most relevant to a question, and runs
    an extractive question-answering model over that chunk.
    """

    def __init__(self):
        # Sentence-embedding model used to vectorize text chunks for retrieval.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Overlapping chunks (50 chars) reduce answers being cut at chunk boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # Extractive QA: the answer is a span copied out of the supplied context.
        self.qa_pipeline = hf_pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            tokenizer="deepset/roberta-base-squad2",
        )

    def prepare_retriever(self, text):
        """Chunk *text* and index it for similarity search.

        Returns a ``(retriever, docs)`` pair: a FAISS-backed retriever and
        the list of LangChain ``Document`` chunks that were indexed.
        """
        docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
        vectorstore = FAISS.from_documents(docs, self.embeddings)
        return vectorstore.as_retriever(), docs

    def qa(self, text, question):
        """Answer *question* using *text* as the knowledge source.

        Returns ``(relevant_context, answer)`` — the best-matching chunk and
        the extracted answer string. Both inputs must be non-blank, otherwise
        a diagnostic message is returned instead.
        """
        if not text.strip() or not question.strip():
            return "No text or question provided.", ""
        # NOTE(review): the embedding index is rebuilt from scratch on every
        # call; if the same text is queried repeatedly, cache the retriever.
        retriever, _docs = self.prepare_retriever(text)  # docs unused here
        relevant_docs = retriever.get_relevant_documents(question)
        relevant_context = relevant_docs[0].page_content if relevant_docs else ""
        if relevant_context:
            result = self.qa_pipeline({"context": relevant_context, "question": question})
            answer = result["answer"]
        else:
            answer = "No answer found."
        return relevant_context, answer
57 |
|
58 |
ocr_processor = OCRProcessor()
|