Spaces:

Akshat1000
/

Soft_Computing_Project

Sleeping

App Files Community

Akshat1000 committed on Apr 25

Commit

809119f

verified ·

1 Parent(s): 8b91afc

Update summarize.py

Browse files

Files changed (1) hide show

summarize.py +20 -59

summarize.py CHANGED Viewed

@@ -1,59 +1,20 @@
-# summarize.py
-from transformers import T5Tokenizer, T5ForConditionalGeneration
-import PyPDF2
-import math
-# Load model and tokenizer
-model_name = "t5-base"
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-# Extract all text from PDF
-def extract_text_from_pdf(pdf_path):
-    text = ""
-    reader = PyPDF2.PdfReader(pdf_path)
-    for page in reader.pages:
-        page_text = page.extract_text()
-        if page_text:
-            text += page_text + "\n"
-    return text.strip()
-# Split text into chunks of approx. 512 tokens (by words)
-def split_text_into_chunks(text, max_tokens=500):
-    words = text.split()
-    chunks = []
-    i = 0
-    while i < len(words):
-        chunk = words[i:i+max_tokens]
-        chunks.append(" ".join(chunk))
-        i += max_tokens
-    return chunks
-# Summarize a chunk
-def summarize_chunk(text_chunk):
-    input_text = "summarize: " + text_chunk
-    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-    summary_ids = model.generate(
-        inputs["input_ids"],
-        max_length=512,
-        min_length=250,
-        length_penalty=2.0,
-        num_beams=4,
-        early_stopping=True
-    )
-    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-# Summarize the entire document using chunks
-def summarize_text(full_text):
-    chunks = split_text_into_chunks(full_text)
-    summaries = [summarize_chunk(chunk) for chunk in chunks]
-    full_summary = " ".join(summaries)
-    return full_summary
-# Testable main flow
-if __name__ == "__main__":
-    pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
-    raw_text = extract_text_from_pdf(pdf_path)
-    summary = summarize_text(raw_text)
-    print("Summary:\n", summary)

+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+# Checkpoint name shared by the tokenizer and the model; kept in one place so
+# the two cannot be loaded from different checkpoints by accident.
+MODEL_NAME = "t5-base"
+# Loaded once at import time and reused by every summarize_text() call.
+tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
+model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
+def summarize_text(text, max_chunk_length=512):
+    """Summarize *text* with the module-level T5 model, one chunk at a time.
+
+    The text is split into chunks of at most ``max_chunk_length`` characters
+    (characters, not tokens). Each chunk is summarized independently and the
+    per-chunk summaries are joined with single spaces.
+
+    Args:
+        text: The text to summarize. Newlines are treated as spaces.
+        max_chunk_length: Maximum number of characters per chunk; must be
+            at least 1.
+
+    Returns:
+        The concatenated chunk summaries, or ``""`` when *text* is empty or
+        contains only whitespace.
+
+    Raises:
+        ValueError: If ``max_chunk_length`` is smaller than 1.
+    """
+    import textwrap
+
+    if max_chunk_length < 1:
+        raise ValueError("max_chunk_length must be a positive integer")
+
+    # Break on whitespace so words are not cut in half at chunk edges; only a
+    # single word longer than the limit is ever split. Blank input yields no
+    # chunks, so the model is never asked to summarize an empty prompt.
+    chunks = textwrap.wrap(
+        text.replace("\n", " "),
+        width=max_chunk_length,
+        break_long_words=True,
+        break_on_hyphens=False,
+    )
+
+    summarized_chunks = []
+    for chunk in chunks:
+        # Every chunk is sent with the "summarize: " prompt prefix; encoded
+        # input longer than 512 tokens is truncated by the tokenizer.
+        inputs = tokenizer.encode(
+            "summarize: " + chunk,
+            return_tensors="pt",
+            max_length=512,
+            truncation=True,
+        )
+        summary_ids = model.generate(
+            inputs,
+            max_length=150,
+            min_length=40,
+            num_beams=4,
+            length_penalty=2.0,
+            early_stopping=True,
+        )
+        # generate() returns a batch; the single input gives one sequence.
+        summarized_chunks.append(
+            tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        )
+    return " ".join(summarized_chunks)