Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -26,43 +26,33 @@ class OCRProcessor:
|
|
26 |
# 2. LangChain-based DocQA Agent
|
27 |
class LangChainDocQAAgent:
|
28 |
def __init__(self):
|
29 |
-
# Embedding model
|
30 |
self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
31 |
-
# Text splitter (chunk size and overlap for better retrieval)
|
32 |
self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
33 |
-
|
34 |
-
self.
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
tokenizer="deepset/roberta-base-squad2"
|
39 |
-
),
|
40 |
-
model_kwargs={"return_full_text": False}
|
41 |
)
|
42 |
|
43 |
def prepare_retriever(self, text):
|
44 |
-
# Split text into LangChain Document objects
|
45 |
docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
|
46 |
-
# Create FAISS vectorstore for retrieval
|
47 |
vectorstore = FAISS.from_documents(docs, self.embeddings)
|
48 |
return vectorstore.as_retriever(), docs
|
49 |
|
50 |
def qa(self, text, question):
|
51 |
if not text.strip() or not question.strip():
|
52 |
return "No text or question provided.", ""
|
53 |
-
# Build retriever from text
|
54 |
retriever, docs = self.prepare_retriever(text)
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
# Show the most relevant chunk as context
|
65 |
-
relevant_context = result["source_documents"][0].page_content if result["source_documents"] else ""
|
66 |
return relevant_context, answer
|
67 |
|
68 |
ocr_processor = OCRProcessor()
|
|
|
# 2. LangChain-based DocQA Agent
class LangChainDocQAAgent:
    """Retrieval-augmented document QA.

    Splits input text into overlapping chunks, indexes them in a FAISS
    vectorstore, retrieves the chunk most relevant to a question, and runs
    an extractive question-answering model over that chunk.
    """

    def __init__(self):
        # Sentence-embedding model used to vectorize text chunks for retrieval.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Overlapping chunks (50 chars) reduce answers being cut at chunk boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # Extractive QA: the answer is a span copied out of the supplied context.
        self.qa_pipeline = hf_pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            tokenizer="deepset/roberta-base-squad2",
        )

    def prepare_retriever(self, text):
        """Chunk *text* and index it for similarity search.

        Returns a ``(retriever, docs)`` pair: a FAISS-backed retriever and
        the list of LangChain ``Document`` chunks that were indexed.
        """
        docs = [Document(page_content=chunk) for chunk in self.text_splitter.split_text(text)]
        vectorstore = FAISS.from_documents(docs, self.embeddings)
        return vectorstore.as_retriever(), docs

    def qa(self, text, question):
        """Answer *question* using *text* as the knowledge source.

        Returns ``(relevant_context, answer)`` — the best-matching chunk and
        the extracted answer string. Both inputs must be non-blank, otherwise
        a diagnostic message is returned instead.
        """
        if not text.strip() or not question.strip():
            return "No text or question provided.", ""
        # NOTE(review): the embedding index is rebuilt from scratch on every
        # call; if the same text is queried repeatedly, cache the retriever.
        retriever, _docs = self.prepare_retriever(text)  # docs unused here
        relevant_docs = retriever.get_relevant_documents(question)
        relevant_context = relevant_docs[0].page_content if relevant_docs else ""
        if relevant_context:
            result = self.qa_pipeline({"context": relevant_context, "question": question})
            answer = result["answer"]
        else:
            answer = "No answer found."
        return relevant_context, answer
57 |
|
58 |
ocr_processor = OCRProcessor()
|