Shriharsh committed on
Commit
c803a6f
·
verified ·
1 Parent(s): 3987840

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -22
app.py CHANGED
@@ -19,44 +19,35 @@ logger = logging.getLogger()
19
  qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
20
  embedder = SentenceTransformer('all-MiniLM-L6-v2')
21
 
22
- import re
23
-
24
  def preprocess_qa_format(text):
25
  """
26
- Detects Q/A pairs in different formats and merges them so that the answer appears first.
27
-
28
- Supported formats:
29
- 1. "question 69: what is xyz?\nanswer:: dark chocolate"
30
- 2. "Q 21: what is xyz?\nA: It is a new form of tyre."
31
- 3. "Q 21: what is xyz?\nit is a new form of tyre."
32
 
33
- Transforms them into:
34
- "Answer: [answer]\nQuestion: [question]"
 
 
35
  """
36
-
37
- # Matches "Question X: ..." followed by "Answer: ..."
38
  pattern1 = re.compile(r"(?i)question\s*\d*\s*:\s*(.+?)\n\s*answer[:]*\s*(.+?)(?:\n|$)")
39
-
40
- # Matches "Q X: ..." followed by "A: ..."
41
  pattern2 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*A[:]*\s*(.+?)(?:\n|$)")
42
-
43
- # Matches "Q X: ..." followed by a sentence as an inferred answer (not explicitly labeled)
44
  pattern3 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*([A-Z][^.]*\..+?)(?:\n|$)")
45
-
46
  def replacer(match):
47
- question_text = match.group(1).strip()
48
  answer_text = match.group(2).strip()
49
- # Put the answer first, then the question.
50
- return f"Answer: {answer_text}\nQuestion: {question_text}\n"
51
 
52
- # Apply transformations
53
  text = pattern1.sub(replacer, text)
54
  text = pattern2.sub(replacer, text)
55
  text = pattern3.sub(replacer, text)
56
-
57
  return text
58
 
59
 
 
60
  # Helper function to extract text from PDF
61
  def extract_text_from_pdf(file_path):
62
  text = ""
 
19
  qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
20
  embedder = SentenceTransformer('all-MiniLM-L6-v2')
21
 
 
 
22
  def preprocess_qa_format(text):
23
  """
24
+ Detects Q/A pairs in different formats and returns only the answer text.
 
 
 
 
 
25
 
26
+ Supported formats:
27
+ 1. "Question X: ..." followed by "Answer: ..."
28
+ 2. "Q X: ..." followed by "A: ..."
29
+ 3. "Q X: ..." followed by an inferred answer line.
30
  """
31
+ # Pattern for explicit "Question ..." and "Answer ..." pairs
 
32
  pattern1 = re.compile(r"(?i)question\s*\d*\s*:\s*(.+?)\n\s*answer[:]*\s*(.+?)(?:\n|$)")
33
+ # Pattern for shorthand "Q ..." and "A: ..." pairs
 
34
  pattern2 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*A[:]*\s*(.+?)(?:\n|$)")
35
+ # Pattern for "Q ..." followed by an inferred answer (starting with a capital letter and ending with a period)
 
36
  pattern3 = re.compile(r"(?i)Q\s*\d*\s*:\s*(.+?)\n\s*([A-Z][^.]*\..+?)(?:\n|$)")
37
+
38
  def replacer(match):
39
+ # We ignore the question text entirely and keep only the answer
40
  answer_text = match.group(2).strip()
41
+ return f"{answer_text}\n"
 
42
 
43
+ # Apply the transformations
44
  text = pattern1.sub(replacer, text)
45
  text = pattern2.sub(replacer, text)
46
  text = pattern3.sub(replacer, text)
 
47
  return text
48
 
49
 
50
+
51
  # Helper function to extract text from PDF
52
  def extract_text_from_pdf(file_path):
53
  text = ""