OmidSakaki committed
Commit 872e2d7 · verified · 1 Parent(s): afb70ec

Update app.py

Files changed (1)
  1. app.py +14 -24
app.py CHANGED
@@ -26,43 +26,33 @@ class OCRProcessor:
 # 2. LangChain-based DocQA Agent
 class LangChainDocQAAgent:
     def __init__(self):
-        # Embedding model
         self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-        # Text splitter (chunk size and overlap for better retrieval)
         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-        # HuggingFace QA pipeline as an LLM
-        self.qa_llm = HuggingFacePipeline(
-            pipeline=hf_pipeline(
-                "question-answering",
-                model="deepset/roberta-base-squad2",
-                tokenizer="deepset/roberta-base-squad2"
-            ),
-            model_kwargs={"return_full_text": False}
+
+        self.qa_pipeline = hf_pipeline(
+            "question-answering",
+            model="deepset/roberta-base-squad2",
+            tokenizer="deepset/roberta-base-squad2"
         )
 
     def prepare_retriever(self, text):
-        # Split text into LangChain Document objects
         docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
-        # Create FAISS vectorstore for retrieval
         vectorstore = FAISS.from_documents(docs, self.embeddings)
         return vectorstore.as_retriever(), docs
 
     def qa(self, text, question):
         if not text.strip() or not question.strip():
             return "No text or question provided.", ""
-        # Build retriever from text
         retriever, docs = self.prepare_retriever(text)
-        # RetrievalQA chain: retrieve relevant chunk and answer
-        qa_chain = RetrievalQA.from_chain_type(
-            llm=self.qa_llm,
-            chain_type="stuff",
-            retriever=retriever,
-            return_source_documents=True
-        )
-        result = qa_chain({"query": question})
-        answer = result["result"]
-        # Show the most relevant chunk as context
-        relevant_context = result["source_documents"][0].page_content if result["source_documents"] else ""
+
+        relevant_docs = retriever.get_relevant_documents(question)
+        relevant_context = relevant_docs[0].page_content if relevant_docs else ""
+
+        if relevant_context:
+            result = self.qa_pipeline({"context": relevant_context, "question": question})
+            answer = result["answer"]
+        else:
+            answer = "No answer found."
         return relevant_context, answer
 
 ocr_processor = OCRProcessor()
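
The change drops the RetrievalQA/HuggingFacePipeline wrapper around the extractive question-answering model: the updated qa() retrieves the most relevant chunk itself and calls the pipeline with an explicit context/question pair. Below is a minimal standalone sketch of that new flow; the import paths, sample text, and expected output are assumptions (app.py's import block is not part of this hunk, and exact LangChain import locations vary by version).

# Standalone sketch of the new retrieval + extractive-QA flow (assumed imports;
# exact LangChain module paths depend on the installed version).
from transformers import pipeline as hf_pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
qa_pipeline = hf_pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)

# Hypothetical OCR output and question, for illustration only.
text = (
    "Invoice 2024-017 was issued by Acme Ltd on 3 March 2024. "
    "The total amount due is 1,250 USD, payable within 30 days."
)
question = "What is the total amount due?"

# Same steps as the updated qa(): split, index in FAISS, retrieve the top chunk,
# then run the extractive QA pipeline on that chunk alone.
docs = [Document(page_content=chunk) for chunk in splitter.split_text(text)]
retriever = FAISS.from_documents(docs, embeddings).as_retriever()
relevant_docs = retriever.get_relevant_documents(question)
relevant_context = relevant_docs[0].page_content if relevant_docs else ""

if relevant_context:
    result = qa_pipeline({"context": relevant_context, "question": question})
    answer = result["answer"]   # an extracted span, e.g. something like "1,250 USD"
else:
    answer = "No answer found."

print(relevant_context)
print(answer)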