Spaces:

Shriharsh
/

Customer_Support_Bot_with_Document_Training

Sleeping

App Files Files Community

Shriharsh committed on Mar 16

Commit

c803a6f

verified ·

1 Parent(s): 3987840

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -22

app.py CHANGED Viewed

@@ -19,44 +19,35 @@ logger = logging.getLogger()
 qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 embedder = SentenceTransformer('all-MiniLM-L6-v2')
-import re
 def preprocess_qa_format(text):
     """
-    Detects Q/A pairs in different formats and merges them so that the answer appears first.
-    Supported formats:
-    1. "question 69: what is xyz?\nanswer:: dark chocolate"
-    2. "Q 21: what is xyz?\nA: It is a new form of tyre."
-    3. "Q 21: what is xyz?\nit is a new form of tyre."
-    Transforms them into:
-    "Answer: [answer]\nQuestion: [question]"
     """
-    # Matches "Question X: ..." followed by "Answer: ..."
     pattern1 = re.compile(r"(?i)question\s*\d*\s*:\s*(.+?)\n\s*answer[:]*\s*(.+?)(?:\n|$)")
-    # Matches "Q X: ..." followed by "A: ..."
     pattern2 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*A[:]*\s*(.+?)(?:\n|$)")
-    # Matches "Q X: ..." followed by a sentence as an inferred answer (not explicitly labeled)
     pattern3 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*([A-Z][^.]*\..+?)(?:\n|$)")
     def replacer(match):
-        question_text = match.group(1).strip()
         answer_text = match.group(2).strip()
-        # Put the answer first, then the question.
-        return f"Answer: {answer_text}\nQuestion: {question_text}\n"
-    # Apply transformations
     text = pattern1.sub(replacer, text)
     text = pattern2.sub(replacer, text)
     text = pattern3.sub(replacer, text)
     return text
 # Helper function to extract text from PDF
 def extract_text_from_pdf(file_path):
     text = ""

 # Question-answering pipeline loaded with the
 # "distilbert-base-uncased-distilled-squad" model.
 qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 # SentenceTransformer instance using the 'all-MiniLM-L6-v2' model
 # (how the embeddings are consumed is outside this view).
 embedder = SentenceTransformer('all-MiniLM-L6-v2')
 def preprocess_qa_format(text):
     """
     Collapse question/answer pairs in *text* down to the answer alone.

     Supported formats (labels are case-insensitive, numbering is optional):
     1. "Question X: ..." followed by "Answer: ..." (colons after "Answer" optional).
     2. "Q X: ..." followed by "A: ..." (at least one colon required after "A").
     3. "Q X: ..." followed by an unlabeled line, taken as the inferred answer.

     Args:
         text: Raw document text that may contain Q/A pairs.

     Returns:
         The text with every matched pair replaced by its stripped answer
         followed by a newline. Text matching none of the formats is
         returned unchanged.
     """
     # Function-scope import so this helper does not rely on a module-level
     # `import re` being present.
     import re
     # Explicit "Question ..." / "Answer ..." pairs. The word boundaries keep
     # the labels from matching inside longer words (e.g. "Answering ...").
     explicit_long = re.compile(
         r"(?i)\bquestion\s*\d*\s*:\s*(.+?)\n\s*answer\b:*\s*(.+?)(?:\n|$)"
     )
     # Shorthand "Q ..." / "A: ..." pairs. The colon after "A" is mandatory:
     # with an optional colon, an unlabeled answer such as "All orders ship"
     # would be mistaken for a label and lose its first letter.
     explicit_short = re.compile(
         r"(?i)\bQ\s*\d*\s*:\s*(.+?)\n\s*A:+\s*(.+?)(?:\n|$)"
     )
     # "Q ..." followed by an unlabeled answer: take the next non-blank line,
     # unless that line is itself a question label. `\bQ` prevents words that
     # merely end in "q" (e.g. "FAQ:") from being read as a question.
     inferred = re.compile(
         r"(?i)\bQ\s*\d*\s*:\s*(.+?)\n\s*"
         r"(?!(?:question|Q)\s*\d*\s*:)(\S[^\n]*)(?:\n|$)"
     )
     def replacer(match):
         # The question (group 1) is dropped on purpose; only the answer stays.
         answer_text = match.group(2).strip()
         return f"{answer_text}\n"
     # Order matters: explicitly labeled pairs are consumed first so that the
     # inferred-answer pattern only sees questions with no labeled answer.
     for pattern in (explicit_long, explicit_short, inferred):
         text = pattern.sub(replacer, text)
     return text
 # Helper function to extract text from PDF
 def extract_text_from_pdf(file_path):
     text = ""