# summarize.py
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2  # note: PyPDF2 is unmaintained; its successor package is pypdf

# Load model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
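
# Note: T5Tokenizer is the slow, sentencepiece-based tokenizer and requires
# the sentencepiece package to be installed. A drop-in alternative (a sketch,
# not a required change) is the fast Rust-backed tokenizer:
#
#   from transformers import T5TokenizerFast
#   tokenizer = T5TokenizerFast.from_pretrained(model_name)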

# Extract all text from a PDF, page by page
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PyPDF2.PdfReader(pdf_path)
    for page in reader.pages:
        page_text = page.extract_text()  # may be None for scanned/image-only pages
        if page_text:
            text += page_text + "\n"
    return text.strip()

# Split text into chunks of approx. 512 tokens (by words)
def split_text_into_chunks(text, max_tokens=500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_tokens):
        chunks.append(" ".join(words[i:i + max_tokens]))
    return chunks
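
# Word-count chunking only approximates the model's 512-token input limit,
# since T5's subword tokenizer usually emits more than one token per word.
# A token-exact alternative (a sketch, not used by the main flow; it reuses
# the tokenizer loaded above):
def split_text_into_token_chunks(text, max_tokens=500):
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[i:i + max_tokens])
        for i in range(0, len(token_ids), max_tokens)
    ]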

# Summarize a single chunk using T5's "summarize:" task prefix
def summarize_chunk(text_chunk):
    input_text = "summarize: " + text_chunk
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,
        min_length=250,  # forces long chunk summaries; lower for terser output
        length_penalty=2.0,  # values > 1.0 bias beam search toward longer sequences
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the entire document chunk by chunk and join the partial summaries
def summarize_text(full_text):
    chunks = split_text_into_chunks(full_text)
    summaries = [summarize_chunk(chunk) for chunk in chunks]
    return " ".join(summaries)
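
# The joined chunk summaries can themselves run long for big documents. An
# optional second pass (a sketch, not wired into the main flow below) feeds
# the first-pass result back through the same pipeline to compress it further:
def summarize_text_two_pass(full_text):
    first_pass = summarize_text(full_text)
    if len(first_pass.split()) > 500:  # still longer than a single chunk
        return summarize_text(first_pass)
    return first_pass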

# Testable main flow
if __name__ == "__main__":
    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"  # adjust to a local PDF
    raw_text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(raw_text)
    print("Summary:\n", summary)