Update app.py
Browse files
app.py
CHANGED
@@ -19,44 +19,35 @@ logger = logging.getLogger()
|
|
19 |
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
20 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
21 |
|
22 |
-
import re
|
23 |
-
|
24 |
def preprocess_qa_format(text: str) -> str:
    """Rewrite recognised question/answer pairs so the answer comes first.

    Each matched pair (question line plus answer) is replaced by
    "Answer: <answer>" on one line followed by "Question: <question>" on
    the next. Text that matches none of the formats is returned unchanged.

    Supported formats (labels are case-insensitive, the number is optional):
        1. "question 69: what is xyz?" then "answer:: dark chocolate"
           (the colon after "answer" is optional and may be repeated).
        2. "Q 21: what is xyz?" then "A: It is a new form of tyre."
           (at least one colon is required after "A").
        3. "Q 21: what is xyz?" then "it is a new form of tyre."
           (an unlabelled answer that starts with a letter and contains
           a period).

    Args:
        text: Raw text that may contain Q/A pairs.

    Returns:
        The text with every matched pair rewritten as described above.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    # "Question X: ..." followed by "Answer: ...". The word boundaries stop
    # the labels from matching inside longer words (e.g. "answers").
    pattern1 = re.compile(
        r"\bquestion\s*\d*\s*:\s*(.+?)\n\s*answer\b\s*:*\s*(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    # "Q X: ..." followed by "A: ...". A colon is mandatory after "A";
    # otherwise any answer that merely starts with the letter "A" would lose
    # its first character.
    pattern2 = re.compile(
        r"\bQ\s*\d*\s*:\s*(.+?)\n\s*A\s*:+\s*(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    # "Q X: ..." followed by an unlabelled answer: a letter, then text up to
    # the first period (``[^.]*`` may span line breaks), then the rest of
    # that line. ``.*?`` (not ``.+?``) lets the answer end right at the period.
    pattern3 = re.compile(
        r"\bQ\s*\d*\s*:\s*(.+?)\n\s*([A-Z][^.]*\..*?)(?:\n|$)",
        re.IGNORECASE,
    )

    def replacer(match):
        question_text = match.group(1).strip()
        answer_text = match.group(2).strip()
        return f"Answer: {answer_text}\nQuestion: {question_text}\n"

    # Order matters: explicit labels are consumed before the looser
    # inferred-answer pattern gets a chance to match.
    for pattern in (pattern1, pattern2, pattern3):
        text = pattern.sub(replacer, text)
    return text
|
58 |
|
59 |
|
|
|
60 |
# Helper function to extract text from PDF
|
61 |
def extract_text_from_pdf(file_path):
|
62 |
text = ""
|
|
|
19 |
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
20 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
21 |
|
|
|
|
|
22 |
def preprocess_qa_format(text: str) -> str:
    """Replace recognised question/answer pairs in *text* with the answer only.

    Each matched pair (question line plus answer) is replaced by the
    stripped answer text followed by a newline; the question is discarded.
    Text that matches none of the formats is returned unchanged.

    Supported formats (labels are case-insensitive, the number is optional):
        1. "Question X: ..." followed by "Answer: ..." (the colon after
           "Answer" is optional and may be repeated).
        2. "Q X: ..." followed by "A: ..." (at least one colon is required
           after "A").
        3. "Q X: ..." followed by an inferred answer: unlabelled text that
           starts with a letter and contains a period.

    Args:
        text: Raw text that may contain Q/A pairs.

    Returns:
        The text with every matched pair collapsed to its answer line.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    # Explicit "Question ..." / "Answer ..." pairs. The word boundaries stop
    # the labels from matching inside longer words (e.g. "answers").
    pattern1 = re.compile(
        r"\bquestion\s*\d*\s*:\s*(.+?)\n\s*answer\b\s*:*\s*(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    # Shorthand "Q ..." / "A: ..." pairs. A colon is mandatory after "A";
    # otherwise any answer that merely starts with the letter "A" would lose
    # its first character.
    pattern2 = re.compile(
        r"\bQ\s*\d*\s*:\s*(.+?)\n\s*A\s*:+\s*(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    # "Q ..." followed by an inferred answer: a letter, then text up to the
    # first period (``[^.]*`` may span line breaks), then the rest of that
    # line. ``.*?`` (not ``.+?``) lets the answer end right at the period.
    pattern3 = re.compile(
        r"\bQ\s*\d*\s*:\s*(.+?)\n\s*([A-Z][^.]*\..*?)(?:\n|$)",
        re.IGNORECASE,
    )

    def replacer(match):
        # The question text (group 1) is dropped; only the answer is kept.
        answer_text = match.group(2).strip()
        return f"{answer_text}\n"

    # Order matters: explicit labels are consumed before the looser
    # inferred-answer pattern gets a chance to match.
    for pattern in (pattern1, pattern2, pattern3):
        text = pattern.sub(replacer, text)
    return text
|
48 |
|
49 |
|
50 |
+
|
51 |
# Helper function to extract text from PDF
|
52 |
def extract_text_from_pdf(file_path):
|
53 |
text = ""
|