# NOTE: the original paste carried Hugging Face Space page residue here
# ("Spaces:" / "Sleeping" status banner) — removed as non-code scrape junk.
"""Answer a fixed question about a PDF resume with a SQuAD-fine-tuned BERT.

Loads a local BERT question-answering checkpoint, extracts the text of a
PDF, and prints the model's extractive answer span.
"""
import sys

import PyPDF2
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the model & tokenizer from a local checkpoint directory.
model_loc = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_loc)
tokenizer = BertTokenizer.from_pretrained(model_loc)
model.eval()  # inference only: disable dropout for deterministic logits

# Extract text from the PDF, page by page.
pdf_path = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"
text = ""
try:
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() may return ""/None for image-only pages
            text += page_text + "\n"
except Exception as e:
    print(f"Error reading PDF: {e}")
    text = ""

# Abort early when no usable text came out of the PDF.
if not text.strip():
    print("No valid text extracted from PDF.")
    sys.exit(1)  # BUG FIX: sys.exit(), not the interactive-only exit() helper

# BUG FIX: the original sliced tokens[:1512] while the comment (and the
# tokenizer call below, max_length=512) both intend a 512-token limit —
# slice to 512 to match BERT's positional-embedding capacity.
tokens = tokenizer.tokenize(text)
context = tokenizer.convert_tokens_to_string(tokens[:512])

# The question to ask against the extracted context.
question = "What is my name?"

# Tokenize question + context as a single (truncated) input pair.
inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

# Run the model without building autograd graphs.
with torch.no_grad():
    outputs = model(**inputs)

# Pick the most likely answer span from the start/end logits.
answer_start = torch.argmax(outputs.start_logits)
answer_end = torch.argmax(outputs.end_logits) + 1
if answer_end <= answer_start:
    # BUG FIX: start/end are argmax'ed independently and can cross;
    # fall back to an empty span rather than slicing backwards.
    answer_end = answer_start
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
)
# BUG FIX: dropped the duplicate print that re-emitted the answer
# under the mislabeled prefix "summary:".
print("Answer:", answer)