Akshat1000 committed on
Commit
ad9331d
·
verified ·
1 Parent(s): 5a55dc9

Upload 7 files

Browse files
Files changed (7) hide show
  1. answer.py +32 -0
  2. app.py +22 -0
  3. extract_pdf.py +16 -0
  4. generate_answers.py +15 -0
  5. requirements.txt +5 -0
  6. summarize.py +59 -0
  7. test.py +49 -0
answer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertForQuestionAnswering, BertTokenizer
2
+ import torch
3
+ from summarize import summarize_text,extract_text_from_pdf # Import summarization function
4
+
5
def load_qa_model(model_path="D:/code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"):
    """Return a ``(model, tokenizer)`` pair for extractive question answering.

    Both objects are created with ``from_pretrained`` from the same
    ``model_path``, so the tokenizer vocabulary matches the model weights.

    Args:
        model_path: Location passed unchanged to ``from_pretrained``. The
            default points at a local checkpoint directory.

    Returns:
        A tuple ``(BertForQuestionAnswering, BertTokenizer)``.
    """
    qa_model = BertForQuestionAnswering.from_pretrained(model_path)
    qa_tokenizer = BertTokenizer.from_pretrained(model_path)
    return qa_model, qa_tokenizer
10
+
11
def get_answer(question, context, model, tokenizer):
    """Extract the answer span for ``question`` from ``context``.

    The question/context pair is encoded (truncated to 512 tokens), run
    through the QA model, and the highest-scoring span is decoded back to
    text.

    Args:
        question: The question to answer.
        context: Text that is searched for the answer.
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded answer string, or ``""`` when ``context`` is empty or
        contains only whitespace.
    """
    # Nothing to search in: skip the (expensive) forward pass entirely.
    if not context or not context.strip():
        return ""

    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # Single-example batch: work on the one row of logits.
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    answer_start = int(torch.argmax(start_logits))
    # Pick the best end position at or after the chosen start. Taking two
    # independent argmaxes can put the end before the start, which silently
    # produces an empty answer.
    answer_end = answer_start + int(torch.argmax(end_logits[answer_start:])) + 1

    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
    return tokenizer.convert_tokens_to_string(answer_tokens)
22
+
23
if __name__ == "__main__":
    # Demo run: summarize one local PDF, then ask a single question about it.
    pdf_path = "C://Users/HP/Downloads/study/cis/CIS Fundamentals.pdf"  # Replace with actual PDF file path
    sample_question = "what is cloud computing ?"

    # The summary (not the raw PDF text) is what the QA model searches.
    summary = summarize_text(extract_text_from_pdf(pdf_path))

    model, tokenizer = load_qa_model()
    answer = get_answer(sample_question, summary, model, tokenizer)  # Use summary as context

    print("Summary:", summary)
    print("Answer:", answer)
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from extract_pdf import extract_text_from_pdf
3
+ from summarize import summarize_text
4
+ from generate_answers import get_answer
5
+
6
st.title("📄 PDF Question Answering with Bert Model and T5 Model")

uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_file:
    # Streamlit re-executes this whole script on every widget interaction
    # (including each new question), so the summary is kept in session state
    # and only recomputed when a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("summary_key") != file_key:
        with st.spinner("Reading and summarizing document..."):
            raw_text = extract_text_from_pdf(uploaded_file)
            st.session_state["summary"] = summarize_text(raw_text)
        st.session_state["summary_key"] = file_key
    summary = st.session_state["summary"]

    # An empty summary means no text was extracted (e.g. a PDF with no text
    # layer); asking questions against it would be pointless.
    if not summary:
        st.error("No text could be extracted from this PDF.")
        st.stop()

    st.success("Document summarized!")

    with st.expander("📄 View Summary"):
        st.write(summary)

    question = st.text_input("❓ Ask a question based on the document summary:")
    if question:
        with st.spinner("Generating answer..."):
            answer = get_answer(question, summary)
        st.markdown(f"**Answer:** {answer}")
extract_pdf.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+
3
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF as one string.

    Args:
        uploaded_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (the callers in this project pass an uploaded file object or a
            path string).

    Returns:
        The page texts joined by newlines, with leading and trailing
        whitespace removed. Pages that yield no text are skipped.
    """
    page_texts = (page.extract_text() for page in PyPDF2.PdfReader(uploaded_file).pages)
    return "\n".join(piece for piece in page_texts if piece).strip()
12
+
13
if __name__ == "__main__":
    # Manual smoke test: dump the text extracted from one local PDF.
    pdf_path = "C:/Users/HP/Downloads/DAUR-Project-Presentation.pdf"
    raw_text = extract_text_from_pdf(pdf_path)
    # This module only extracts text; nothing is summarized here, so the
    # output is labelled as extracted text rather than as a summary.
    print("Extracted text:\n", raw_text)
generate_answers.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BertForQuestionAnswering
2
+ import torch
3
+
4
# Local checkpoint used for question answering. The tokenizer and model are
# created once, when this module is imported, and reused by get_answer().
model_path = "D:/code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path)
7
+
8
def get_answer(question, context):
    """Answer ``question`` by extracting a span from ``context`` with BERT.

    Uses the module-level ``tokenizer`` and ``model``. The question/context
    pair is truncated to 512 tokens before it is passed to the model.

    Args:
        question: The question to answer.
        context: Text that is searched for the answer.

    Returns:
        The decoded answer string, or ``""`` when ``context`` is empty or
        contains only whitespace.
    """
    # Nothing to search in: skip the (expensive) forward pass entirely.
    if not context or not context.strip():
        return ""

    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # Single-example batch: work on the one row of logits.
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start = int(torch.argmax(start_logits))
    # Pick the best end position at or after the chosen start. Taking two
    # independent argmaxes can put the end before the start, which silently
    # produces an empty answer.
    end = start + int(torch.argmax(end_logits[start:])) + 1

    answer_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start:end])
    return tokenizer.convert_tokens_to_string(answer_tokens)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers==4.40.1
2
+ torch==2.2.2
3
+ sentencepiece==0.1.99
4
+ streamlit==1.33.0
5
+ PyPDF2==3.0.1
summarize.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # summarize.py
2
+
3
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
4
+ import PyPDF2
5
+ import math
6
+
7
# T5 checkpoint used for summarization. The tokenizer and model are created
# once, when this module is imported, and shared by summarize_chunk().
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
11
+
12
+ # Extract all text from PDF
13
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages that yield no text are skipped. The remaining page texts are joined
    by newlines and the result is stripped of surrounding whitespace.
    """
    page_texts = (page.extract_text() for page in PyPDF2.PdfReader(pdf_path).pages)
    return "\n".join(piece for piece in page_texts if piece).strip()
21
+
22
# Split text into chunks of at most ``max_tokens`` whitespace-separated words
def split_text_into_chunks(text, max_tokens=500):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    Words are whatever ``str.split()`` yields, so runs of whitespace collapse
    and each chunk is re-joined with single spaces.

    NOTE(review): despite its name, ``max_tokens`` counts words, not model
    tokens. A chunk may encode to more tokens than it has words - confirm the
    default against the 512-token limit applied when chunks are summarized.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``max_tokens`` is less than 1 (the previous loop never
            terminated for such values).
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    words = text.split()
    return [
        " ".join(words[offset:offset + max_tokens])
        for offset in range(0, len(words), max_tokens)
    ]
32
+
33
+ # Summarize a chunk
34
def summarize_chunk(text_chunk):
    """Summarize one chunk of text with the module-level T5 model.

    The chunk is prefixed with ``"summarize: "``, encoded (truncated to 512
    tokens), and decoded from a 4-beam search.

    Args:
        text_chunk: The text to summarize.

    Returns:
        The generated summary with special tokens removed.
    """
    input_text = "summarize: " + text_chunk
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # A fixed minimum of 250 generated tokens forces padding-like filler when
    # the chunk itself is short (typically the last chunk of a document).
    # Cap the minimum at half the encoded input length; a full 512-token
    # input still gets the original minimum of 250.
    input_length = inputs["input_ids"].shape[1]
    min_summary_length = min(250, input_length // 2)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,
        min_length=min_summary_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
46
+
47
+ # Summarize the entire document using chunks
48
def summarize_text(full_text):
    """Summarize a whole document chunk by chunk.

    The text is split with ``split_text_into_chunks``, each chunk is passed
    to ``summarize_chunk``, and the per-chunk summaries are joined with
    single spaces in document order.
    """
    return " ".join(
        summarize_chunk(piece) for piece in split_text_into_chunks(full_text)
    )
53
+
54
+ # Testable main flow
55
if __name__ == "__main__":
    # Manual run: extract one local PDF and print its chunked summary.
    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
    summary = summarize_text(extract_text_from_pdf(pdf_path))
    print("Summary:\n", summary)
test.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertForQuestionAnswering, BertTokenizer
2
+ import torch
3
+ import PyPDF2
4
+
5
# Load the model & tokenizer
model_loc = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_loc)
tokenizer = BertTokenizer.from_pretrained(model_loc)

# Extract text from PDF
pdf_path = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"
text = ""

try:
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:  # Skip pages that yield no text
            text += page_text + "\n"
except Exception as e:
    # Script-level boundary: report the failure and fall through to the
    # "no valid text" exit below.
    print(f"Error reading PDF: {e}")
    text = ""

# Ensure valid context
if not text.strip():
    print("No valid text extracted from PDF.")
    # exit() is a convenience installed by the site module and reports
    # success; raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

# Limit context to 512 tokens. The tokenizer call below truncates the
# question/context pair to 512 tokens as well, so this only avoids encoding
# text that would be discarded anyway.
max_context_tokens = 512
tokens = tokenizer.tokenize(text)
context = tokenizer.convert_tokens_to_string(tokens[:max_context_tokens])

# Define question
question = "What is my name?"

# Tokenize input
inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)

# Extract answer: choose the best end position at or after the chosen start,
# so an end-before-start pair cannot silently produce an empty answer.
start_logits = outputs.start_logits[0]
end_logits = outputs.end_logits[0]
answer_start = int(torch.argmax(start_logits))
answer_end = answer_start + int(torch.argmax(end_logits[answer_start:])) + 1
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
)

# This script produces no summary, so only the answer is printed.
print("Answer:", answer)