Spaces:

Akshat1000
/

Soft_Computing_Project

Sleeping

App Files Files Community

Akshat1000 committed on Apr 25

Commit

ad9331d

verified ·

1 Parent(s): 5a55dc9

Upload 7 files

Browse files

Files changed (7) hide show

answer.py +32 -0
app.py +22 -0
extract_pdf.py +16 -0
generate_answers.py +15 -0
requirements.txt +5 -0
summarize.py +59 -0
test.py +49 -0

answer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from transformers import BertForQuestionAnswering, BertTokenizer
+import torch
+from summarize import summarize_text,extract_text_from_pdf  # Import summarization function
+def load_qa_model(model_path="D:/code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"):
+    """Loads the BERT model and tokenizer for question answering."""
+    model = BertForQuestionAnswering.from_pretrained(model_path)
+    tokenizer = BertTokenizer.from_pretrained(model_path)
+    return model, tokenizer
+def get_answer(question, context, model, tokenizer):
+    """Generates an answer for a given question based on the provided context."""
+    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    answer_start = torch.argmax(outputs.start_logits)
+    answer_end = torch.argmax(outputs.end_logits) + 1
+    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
+    return answer
+if __name__ == "__main__":
+    pdf_path = "C://Users/HP/Downloads/study/cis/CIS Fundamentals.pdf"# Replace with actual PDF file path
+    extracted_text = extract_text_from_pdf(pdf_path)
+    summary = summarize_text(extracted_text)
+    sample_question = "what is cloud computing ?"
+    model, tokenizer = load_qa_model()
+    answer = get_answer(sample_question, summary, model, tokenizer)  # Use summary as context
+    print("Summary:", summary)
+    print("Answer:", answer)

app.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import streamlit as st
+from extract_pdf import extract_text_from_pdf
+from summarize import summarize_text
+from generate_answers import get_answer
+st.title("📄 PDF Question Answering with Bert Model and T5 Model")
+uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+if uploaded_file:
+    with st.spinner("Reading and summarizing document..."):
+        raw_text = extract_text_from_pdf(uploaded_file)
+        summary = summarize_text(raw_text)
+        st.success("Document summarized!")
+    with st.expander("📄 View Summary"):
+        st.write(summary)
+    question = st.text_input("❓ Ask a question based on the document summary:")
+    if question:
+        with st.spinner("Generating answer..."):
+            answer = get_answer(question, summary)
+        st.markdown(f"**Answer:** {answer}")

extract_pdf.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import PyPDF2
+def extract_text_from_pdf(uploaded_file):
+    """Extracts text from uploaded PDF file."""
+    pdf_reader = PyPDF2.PdfReader(uploaded_file)
+    text = ""
+    for page in pdf_reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text += page_text + "\n"
+    return text.strip()
+if __name__ == "__main__":
+    pdf_path = "C:/Users/HP/Downloads/DAUR-Project-Presentation.pdf"
+    raw_text = extract_text_from_pdf(pdf_path)
+    print("Summary:\n", raw_text)

generate_answers.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from transformers import BertTokenizer, BertForQuestionAnswering
+import torch
+# Path handed to from_pretrained for both the tokenizer and the model.
+model_path = "D:/code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
+# Both objects are created once, at import time, and are read as module
+# globals by get_answer.
+tokenizer = BertTokenizer.from_pretrained(model_path)
+model = BertForQuestionAnswering.from_pretrained(model_path)
+def get_answer(question, context):
+    """Answers a question using BERT on given context."""
+    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    start = torch.argmax(outputs.start_logits)
+    end = torch.argmax(outputs.end_logits) + 1
+    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start:end]))

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers==4.40.1
+torch==2.2.2
+sentencepiece==0.1.99
+streamlit==1.33.0
+PyPDF2==3.0.1

summarize.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# summarize.py
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import PyPDF2
+import math
+# Load model and tokenizer
+# Both are created once, at import time, and are read as module globals by
+# summarize_chunk.
+model_name = "t5-base"  # identifier handed to from_pretrained for both objects
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+model = T5ForConditionalGeneration.from_pretrained(model_name)
+# Extract all text from PDF
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    reader = PyPDF2.PdfReader(pdf_path)
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text += page_text + "\n"
+    return text.strip()
+# Split text into chunks of approx. 512 tokens (by words)
+def split_text_into_chunks(text, max_tokens=500):
+    words = text.split()
+    chunks = []
+    i = 0
+    while i < len(words):
+        chunk = words[i:i+max_tokens]
+        chunks.append(" ".join(chunk))
+        i += max_tokens
+    return chunks
+# Summarize a chunk
+def summarize_chunk(text_chunk):
+    input_text = "summarize: " + text_chunk
+    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+    summary_ids = model.generate(
+        inputs["input_ids"],
+        max_length=512,
+        min_length=250,
+        length_penalty=2.0,
+        num_beams=4,
+        early_stopping=True
+    )
+    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+# Summarize the entire document using chunks
+def summarize_text(full_text):
+    chunks = split_text_into_chunks(full_text)
+    summaries = [summarize_chunk(chunk) for chunk in chunks]
+    full_summary = " ".join(summaries)
+    return full_summary
+# Testable main flow
+if __name__ == "__main__":
+    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
+    raw_text = extract_text_from_pdf(pdf_path)
+    summary = summarize_text(raw_text)
+    print("Summary:\n", summary)

test.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from transformers import BertForQuestionAnswering, BertTokenizer
+import torch
+import PyPDF2
+# Load the model & tokenizer
+model_loc = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
+model = BertForQuestionAnswering.from_pretrained(model_loc)
+tokenizer = BertTokenizer.from_pretrained(model_loc)
+# Extract text from PDF
+pdf_path = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"
+text = ""
+try:
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    for page in pdf_reader.pages:
+        page_text = page.extract_text()
+        if page_text:  # Ensure text is extracted
+            text += page_text + "\n"
+except Exception as e:
+    print(f"Error reading PDF: {e}")
+    text = ""
+# Ensure valid context
+if not text.strip():
+    print("No valid text extracted from PDF.")
+    exit()
+# Limit context to 512 tokens
+tokens = tokenizer.tokenize(text)
+context = tokenizer.convert_tokens_to_string(tokens[:1512])  # Truncate to 512 tokens
+# Define question
+question = "What is my name?"
+# Tokenize input
+inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
+# Get model predictions
+with torch.no_grad():
+    outputs = model(**inputs)
+# Extract answer
+answer_start = torch.argmax(outputs.start_logits)
+answer_end = torch.argmax(outputs.end_logits) + 1
+answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
+print("summary:", answer)
+print("Answer:", answer)