"""Answer a question about a local PDF using a SQuAD-fine-tuned BERT model.

Loads an extractive question-answering model from a local directory,
pulls the text out of a resume PDF, and prints the model's answer span.
"""

from transformers import BertForQuestionAnswering, BertTokenizer
import torch
import PyPDF2

# Machine-specific paths — adjust for your environment.
MODEL_DIR = "D://code/bert_easy/bert-large-uncased-whole-word-masking-finetuned-squad"
PDF_PATH = "C://Users/HP/Downloads/Resumes/Akshat_Thakkar_2022BTCS008.pdf"

# BERT's hard input limit (question + context + special tokens).
MAX_LENGTH = 512


def extract_pdf_text(pdf_path):
    """Return the concatenated text of every page of *pdf_path*, or '' on error."""
    text = ""
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:  # extract_text() can return None/'' for image-only pages
                text += page_text + "\n"
    except Exception as e:
        # Best-effort: report and fall through with whatever was extracted.
        print(f"Error reading PDF: {e}")
    return text


def answer_question(model, tokenizer, question, context):
    """Return the model's extractive answer for *question* over *context*.

    The tokenizer truncates the combined input to MAX_LENGTH tokens, so no
    manual pre-truncation of the context is needed (the original code sliced
    to 1512 tokens while claiming 512 — both redundant and misleading).
    """
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    with torch.no_grad():  # inference only — gradients not needed
        outputs = model(**inputs)

    # Highest-scoring start/end token positions; end is inclusive, so +1 to slice.
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # decode() merges wordpieces and drops [CLS]/[SEP] artifacts.
    return tokenizer.decode(
        inputs["input_ids"][0][answer_start:answer_end],
        skip_special_tokens=True,
    )


def main():
    """Load the model, extract the PDF text, and print the answer."""
    model = BertForQuestionAnswering.from_pretrained(MODEL_DIR)
    tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

    text = extract_pdf_text(PDF_PATH)
    if not text.strip():
        print("No valid text extracted from PDF.")
        return

    question = "What is my name?"
    answer = answer_question(model, tokenizer, question, text)
    print("Answer:", answer)


if __name__ == "__main__":
    main()