Shriharsh committed on
Commit
049f57d
·
verified ·
1 Parent(s): 0a81317

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  from transformers import pipeline
5
  from sentence_transformers import SentenceTransformer, util
6
  import PyPDF2
 
7
 
8
  # Set up logging with immediate writing
9
  logging.basicConfig(
@@ -18,6 +19,43 @@ logger = logging.getLogger()
18
  qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
19
  embedder = SentenceTransformer('all-MiniLM-L6-v2')
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Helper function to extract text from PDF
22
  def extract_text_from_pdf(file_path):
23
  text = ""
 
4
  from transformers import pipeline
5
  from sentence_transformers import SentenceTransformer, util
6
  import PyPDF2
7
+ import re
8
 
9
  # Set up logging with immediate writing
10
  logging.basicConfig(
 
19
  qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
20
  embedder = SentenceTransformer('all-MiniLM-L6-v2')
21
 
22
+ import re
23
+
24
# Label that opens a question line: "Question 69:", "Q 21:", "Q:".
# The leading \b keeps words that merely end in "q" (e.g. "FAQ:") from matching.
_QUESTION_LABEL = r"\b(?:question|q)\s*\d*\s*:\s*"

# Label that opens an answer line. The full word "answer" may be followed by
# any number of colons ("answer::"), but the one-letter form "A" needs at
# least one colon; otherwise every line starting with "a" would be treated as
# labelled and lose its first letter.
_ANSWER_LABEL = r"(?:answer\b\s*:*|a\s*:+)\s*"

# A question line followed by an explicitly labelled answer line.
_LABELED_QA = re.compile(
    _QUESTION_LABEL + r"(.+?)\n\s*" + _ANSWER_LABEL + r"(.+?)(?:\n|$)",
    re.IGNORECASE,
)

# "Q X: ..." followed by an unlabelled line that contains a full stop, taken
# as the inferred answer. The answer is restricted to a single line so an
# unanswered question cannot swallow the lines (or pairs) that follow it, and
# the lookahead skips lines that are themselves question/answer labels.
_INFERRED_QA = re.compile(
    r"\bq\s*\d*\s*:\s*(.+?)\n\s*"
    r"(?!(?:question|answer|q|a)\s*\d*\s*:)"
    r"([a-z][^.\n]*\..*?)(?:\n|$)",
    re.IGNORECASE,
)


def _answer_first(match):
    """Render one matched question/answer pair with the answer on top."""
    question_text = match.group(1).strip()
    answer_text = match.group(2).strip()
    return f"Answer: {answer_text}\nQuestion: {question_text}\n"


def preprocess_qa_format(text):
    """
    Detect Q/A pairs in the text and rewrite each so the answer comes first.

    Supported formats (labels are case-insensitive, the number is optional):
        1. "question 69: what is xyz?\\nanswer:: dark chocolate"
        2. "Q 21: what is xyz?\\nA: It is a new form of tyre."
        3. "Q 21: what is xyz?\\nit is a new form of tyre."
           (unlabelled answer; the line must contain a full stop)

    Question and answer label styles may be mixed ("Q 5: ...\\nAnswer: ...").

    Each detected pair is replaced by:
        "Answer: [answer]\\nQuestion: [question]\\n"

    Args:
        text: Raw text, e.g. extracted from a document.

    Returns:
        The text with every detected pair rewritten. Text that contains no
        recognised pair, and empty/None input, is returned unchanged.
    """
    if not text:
        return text

    # Explicitly labelled pairs go first so the inferred-answer pass only
    # sees questions that still have no labelled answer.
    text = _LABELED_QA.sub(_answer_first, text)
    text = _INFERRED_QA.sub(_answer_first, text)
    return text
57
+
58
+
59
  # Helper function to extract text from PDF
60
  def extract_text_from_pdf(file_path):
61
  text = ""