Akshat1000's picture
Upload 7 files
ad9331d verified
from transformers import BertForQuestionAnswering, BertTokenizer
import torch
import PyPDF2
# Checkpoint location shared by the tokenizer and the question-answering model.
# NOTE(review): hard-coded absolute path on the author's machine.
model_loc = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
# Both objects are restored from the same directory so that the vocabulary
# used for encoding matches the weights used for prediction.
tokenizer = BertTokenizer.from_pretrained(model_loc)
model = BertForQuestionAnswering.from_pretrained(model_loc)
# Extract text from PDF
# Reads every page of the PDF at `pdf_path` and gathers the extracted text
# into `text`, one newline-terminated chunk per page that produced any text.
pdf_path = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"
text = ""
try:
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    # Pages whose extraction result is empty/None are skipped; the rest are
    # joined in a single pass instead of growing the string with `+=`.
    text = "".join(page_text + "\n" for page_text in page_texts if page_text)
except Exception as e:
    # Best-effort boundary: any failure while opening or parsing the PDF is
    # reported and leaves `text` empty, which the validity check right after
    # this block treats as "nothing to work with".
    print(f"Error reading PDF: {e}")
    text = ""
# Ensure valid context
# Without any non-whitespace text there is nothing to ask the model about,
# so the script stops here.
if not text.strip():
    print("No valid text extracted from PDF.")
    # Raise SystemExit directly with a non-zero status: the bare `exit()`
    # helper is injected by the `site` module for interactive sessions (it is
    # absent under `python -S`) and, called without an argument, reports
    # success (status 0) even though extraction failed.
    raise SystemExit(1)
# Limit context to 512 tokens
# The bound is named so it visibly agrees with the max_length=512 passed to
# the tokenizer call that builds the model input further down; that call
# performs the final truncation of question + context, so this step only
# avoids carrying text that would be discarded anyway.
max_context_tokens = 512
tokens = tokenizer.tokenize(text)
# Rebuild a plain string from the kept tokens so it can be paired with the
# question when the model input is encoded.
context = tokenizer.convert_tokens_to_string(tokens[:max_context_tokens])
# Define question
question = "What is my name?"

# Tokenize input
# The question and the context are encoded together as one pair, truncated
# to at most 512 tokens and returned as PyTorch tensors.
inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

# Get model predictions
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Extract answer
# Assumes a batch of one: a single question/context pair is encoded above,
# and only row 0 of input_ids is decoded below.
start_logits = outputs.start_logits[0]
end_logits = outputs.end_logits[0]
answer_start = int(torch.argmax(start_logits))
# Search for the end position only at or after the chosen start. Picking the
# two argmaxes independently can put the end before the start, which yields
# an empty slice and a silently blank answer.
answer_end = answer_start + int(torch.argmax(end_logits[answer_start:])) + 1
answer_ids = inputs["input_ids"][0][answer_start:answer_end].tolist()
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
# NOTE(review): the same answer is printed twice, first under a "summary:"
# label — confirm whether that first line is still wanted.
print("summary:", answer)
print("Answer:", answer)