Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
|
|
4 |
from transformers import pipeline
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
import PyPDF2
|
|
|
7 |
|
8 |
# Set up logging with immediate writing
|
9 |
logging.basicConfig(
|
@@ -18,6 +19,43 @@ logger = logging.getLogger()
|
|
18 |
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
19 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Helper function to extract text from PDF
|
22 |
def extract_text_from_pdf(file_path):
|
23 |
text = ""
|
|
|
4 |
from transformers import pipeline
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
import PyPDF2
|
7 |
+
import re
|
8 |
|
9 |
# Set up logging with immediate writing
|
10 |
logging.basicConfig(
|
|
|
19 |
# Module-level question-answering pipeline, built from the
# "distilbert-base-uncased-distilled-squad" checkpoint.
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
20 |
# Module-level sentence-embedding model ('all-MiniLM-L6-v2').
# NOTE(review): presumably paired with `util` for similarity search — confirm against callers.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
21 |
|
22 |
+
import re
|
23 |
+
|
24 |
+
# Patterns are compiled once at import time; both are case-insensitive.

# Labelled pair: a "Question 12:" / "Q 12:" line followed by an "Answer: ..."
# or "A: ..." line. The leading \b keeps words that merely end in "q"
# (e.g. "FAQ:") from being read as a question label. The short "A" label
# requires a colon so that an unlabelled answer starting with the letter "a"
# ("Apples are red.") is not mistaken for a label and truncated.
_LABELLED_QA = re.compile(
    r"(?i)\b(?:question|q)\s*\d*\s*:\s*(.+?)\n"
    r"\s*(?:answer\b:*|a[ \t]*:+)\s*(.+?)(?:\n|$)"
)

# Inferred pair: a "Q 12:" line followed by an unlabelled line that contains
# a sentence (starts with a letter and has a period). The negative lookahead
# stops this pass from consuming a line that is itself a question/answer
# label, including lines already rewritten by the labelled pass. The
# sentence is confined to one line so it cannot swallow a following question.
_INFERRED_QA = re.compile(
    r"(?i)\bq\s*\d*\s*:\s*(.+?)\n"
    r"\s*(?!(?:question|answer|q|a)\s*\d*\s*:)([a-z][^.\n]*\..*?)(?:\n|$)"
)


def _answer_first(match):
    """Render one matched (question, answer) pair with the answer first."""
    question_text = match.group(1).strip()
    answer_text = match.group(2).strip()
    return f"Answer: {answer_text}\nQuestion: {question_text}\n"


def preprocess_qa_format(text: str) -> str:
    """
    Detect Q/A pairs and rewrite each so that the answer appears first.

    Supported formats (labels are case-insensitive, the number is optional):
    1. "question 69: what is xyz?\\nanswer:: dark chocolate"
    2. "Q 21: what is xyz?\\nA: It is a new form of tyre."
    3. "Q 21: what is xyz?\\nit is a new form of tyre."
       (unlabelled answer; must be a single line containing a period)

    Long and short labels may be mixed ("Q 3: ...\\nAnswer: ...").

    Each detected pair becomes:
        "Answer: [answer]\\nQuestion: [question]\\n"

    Args:
        text: Raw text, e.g. as extracted from a document.

    Returns:
        The text with every detected pair rewritten; all other content is
        left untouched. Empty or ``None`` input is returned unchanged.
    """
    if not text:
        return text

    # Explicitly labelled pairs first, so the inference pass only ever sees
    # questions that have no labelled answer.
    text = _LABELLED_QA.sub(_answer_first, text)
    text = _INFERRED_QA.sub(_answer_first, text)
    return text
|
57 |
+
|
58 |
+
|
59 |
# Helper function to extract text from PDF
|
60 |
def extract_text_from_pdf(file_path):
|
61 |
text = ""
|