# summarize.py
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2

# Load the pretrained T5 model and its tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Extract all text from the PDF, page by page
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PyPDF2.PdfReader(pdf_path)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() can return None for image-only pages
            text += page_text + "\n"
    return text.strip()

# Split text into chunks of roughly 500 words. This is a word count, not a
# token count; 500 words usually tokenizes to near T5's 512-token limit, and
# the tokenizer's truncation below catches any overflow.
def split_text_into_chunks(text, max_words=500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i:i + max_words]))
    return chunks

# Summarize a single chunk, using the "summarize:" task prefix T5 expects
def summarize_chunk(text_chunk):
    input_text = "summarize: " + text_chunk
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,
        min_length=250,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the entire document by summarizing each chunk and joining the results
def summarize_text(full_text):
    chunks = split_text_into_chunks(full_text)
    summaries = [summarize_chunk(chunk) for chunk in chunks]
    return " ".join(summaries)

# Testable main flow
if __name__ == "__main__":
    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
    raw_text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(raw_text)
    print("Summary:\n", summary)
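
# Optional extension (a sketch, not part of the original flow above): for long
# PDFs, the joined chunk summaries can themselves be very long. A common
# follow-up is a second summarization pass over the combined output. The
# function name summarize_recursive and the max_passes parameter are
# hypothetical, introduced here for illustration; the helper reuses
# summarize_text and the ~500-word window assumed by split_text_into_chunks.
def summarize_recursive(full_text, max_passes=2):
    summary = summarize_text(full_text)
    # Re-summarize only while the result still exceeds one model window
    for _ in range(max_passes - 1):
        if len(summary.split()) <= 500:
            break
        summary = summarize_text(summary)
    return summary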