Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,6 @@ import gradio as gr
|
|
4 |
from transformers import pipeline
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
import PyPDF2
|
7 |
-
import re
|
8 |
|
9 |
# Set up logging with immediate writing
|
10 |
logging.basicConfig(
|
@@ -19,35 +18,6 @@ logger = logging.getLogger()
|
|
19 |
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
20 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
21 |
|
22 |
-
# Labelled pairs: a "Question N:" / "Q N:" line followed by an "Answer ..." /
# "A:" / "A N:" line.  The word boundary stops labels from matching inside
# longer words (e.g. "FAQ:"), and the short "A" label requires a colon so an
# ordinary sentence starting with "A" is never mistaken for a label.
_QA_LABELLED = re.compile(
    r"\b(?:question|q)\s*\d*\s*:\s*.+?\n"
    r"\s*(?:answer\b[ \t]*:*|a[ \t]*\d*[ \t]*:+)\s*(?P<answer>.+?)(?:\n|$)",
    re.IGNORECASE,
)

# Inferred pairs: a "Q N:" line followed by an unlabelled line that starts
# with an uppercase ASCII letter and contains a period.  Compiled without
# IGNORECASE so that [A-Z] really means "capital letter"; [^\n] keeps the
# answer on a single line.
_QA_INFERRED = re.compile(
    r"\b[Qq]\s*\d*\s*:\s*.+?\n"
    r"\s*(?P<answer>[A-Z][^\n]*\.[^\n]*)(?:\n|$)"
)


def _keep_answer(match):
    """Return the stripped answer of a matched Q/A pair plus a newline."""
    return match.group("answer").strip() + "\n"


def preprocess_qa_format(text):
    """
    Replace each detected question/answer pair in *text* with its answer.

    Supported formats:
    1. "Question X: ..." followed by "Answer: ..." (colon after "Answer"
       is optional)
    2. "Q X: ..." followed by "A: ..." or "A X: ..." (colon required)
    3. "Q X: ..." followed by an inferred answer line, i.e. a line that
       starts with a capital letter and contains a period.

    Question and answer labels may be mixed ("Q1: ..." / "Answer: ...").
    Labels are matched case-insensitively. The question text is discarded;
    every replaced pair becomes the stripped answer followed by a single
    newline. Text that matches none of the formats is left untouched.

    Args:
        text: The raw text to clean. May be empty or None.

    Returns:
        The text with recognised Q/A pairs collapsed to their answers, or
        an empty string when *text* is empty or None.
    """
    if not text:
        return ""

    # Labelled pairs are resolved first so that an explicitly marked answer
    # is never reinterpreted by the looser inferred-answer rule.
    text = _QA_LABELLED.sub(_keep_answer, text)
    return _QA_INFERRED.sub(_keep_answer, text)
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
# Helper function to extract text from PDF
|
52 |
def extract_text_from_pdf(file_path):
|
53 |
text = ""
|
|
|
4 |
from transformers import pipeline
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
import PyPDF2
|
|
|
7 |
|
8 |
# Set up logging with immediate writing
|
9 |
logging.basicConfig(
|
|
|
18 |
qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
|
19 |
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Helper function to extract text from PDF
|
22 |
def extract_text_from_pdf(file_path):
|
23 |
text = ""
|