# summarize.py
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2
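# Assumed environment: the imports above need `pip install transformers sentencepiece torch PyPDF2`
# (T5Tokenizer relies on the sentencepiece package, and model.generate runs on PyTorch).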

# Load model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Extract all text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    reader = PyPDF2.PdfReader(pdf_path)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text.strip()
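# Note (optional swap): PyPDF2 3.x is in maintenance mode; pypdf exposes the same
# PdfReader / page.extract_text() API and could be used here instead.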

# Split text into chunks of roughly max_tokens words (a rough proxy for T5's 512-token input limit)
def split_text_into_chunks(text, max_tokens=500):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+max_tokens]
        chunks.append(" ".join(chunk))
        i += max_tokens
    return chunks
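# Example (hypothetical input): a 1,200-word document split with max_tokens=500
# yields three chunks of 500, 500, and 200 words respectively.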

# Summarize a chunk
def summarize_chunk(text_chunk):
    # "summarize:" is T5's task prefix for summarization
    input_text = "summarize: " + text_chunk
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        min_length=250,
        length_penalty=2.0,   # > 0 favors longer summaries under beam search
        num_beams=4,          # beam search with 4 beams
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the entire document using chunks
def summarize_text(full_text):
    chunks = split_text_into_chunks(full_text)
    summaries = [summarize_chunk(chunk) for chunk in chunks]
    full_summary = " ".join(summaries)
    return full_summary

# Testable main flow
if __name__ == "__main__":
    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
    raw_text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(raw_text)
    print("Summary:\n", summary)
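# Usage sketch (assumes pdf_path above points at a readable, text-based PDF):
#   python summarize.py
# The script prints the concatenation of the per-chunk summaries; scanned PDFs with
# no extractable text will yield an empty summary, since no chunks are produced.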