# summarize.py
from transformers import T5Tokenizer, T5ForConditionalGeneration
import PyPDF2
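# Assumed setup (inferred from the imports above, not stated anywhere in this file):
# the script needs transformers, torch, sentencepiece (the slow T5Tokenizer is
# SentencePiece-based) and PyPDF2 installed, e.g.:
#   pip install transformers torch sentencepiece PyPDF2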
# Load model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Extract all text from PDF
def extract_text_from_pdf(pdf_path):
text = ""
reader = PyPDF2.PdfReader(pdf_path)
for page in reader.pages:
page_text = page.extract_text()
if page_text:
text += page_text + "\n"
return text.strip()
# Split text into chunks of roughly 500 words each, a rough proxy for the model's
# 512-token input limit (anything longer is truncated at encoding time)
def split_text_into_chunks(text, max_tokens=500):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i+max_tokens]
        chunks.append(" ".join(chunk))
        i += max_tokens
    return chunks
# Summarize a chunk
def summarize_chunk(text_chunk):
    # T5 selects its task via a text prefix; "summarize: " triggers summarization
    input_text = "summarize: " + text_chunk
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=512,
        min_length=250,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Summarize the entire document using chunks
def summarize_text(full_text):
    chunks = split_text_into_chunks(full_text)
    summaries = [summarize_chunk(chunk) for chunk in chunks]
    full_summary = " ".join(summaries)
    return full_summary
# Testable main flow
if __name__ == "__main__":
    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
    raw_text = extract_text_from_pdf(pdf_path)
    summary = summarize_text(raw_text)
    print("Summary:\n", summary)