Akshat1000 committed on
Commit 809119f · verified · 1 Parent(s): 8b91afc

Update summarize.py

Files changed (1)
  1. summarize.py +20 -59
summarize.py CHANGED
@@ -1,59 +1,20 @@
- # summarize.py
-
- from transformers import T5Tokenizer, T5ForConditionalGeneration
- import PyPDF2
- import math
-
- # Load model and tokenizer
- model_name = "t5-base"
- tokenizer = T5Tokenizer.from_pretrained(model_name)
- model = T5ForConditionalGeneration.from_pretrained(model_name)
-
- # Extract all text from PDF
- def extract_text_from_pdf(pdf_path):
-     text = ""
-     reader = PyPDF2.PdfReader(pdf_path)
-     for page in reader.pages:
-         page_text = page.extract_text()
-         if page_text:
-             text += page_text + "\n"
-     return text.strip()
-
- # Split text into chunks of approx. 512 tokens (by words)
- def split_text_into_chunks(text, max_tokens=500):
-     words = text.split()
-     chunks = []
-     i = 0
-     while i < len(words):
-         chunk = words[i:i+max_tokens]
-         chunks.append(" ".join(chunk))
-         i += max_tokens
-     return chunks
-
- # Summarize a chunk
- def summarize_chunk(text_chunk):
-     input_text = "summarize: " + text_chunk
-     inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
-     summary_ids = model.generate(
-         inputs["input_ids"],
-         max_length=512,
-         min_length=250,
-         length_penalty=2.0,
-         num_beams=4,
-         early_stopping=True
-     )
-     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
- # Summarize the entire document using chunks
- def summarize_text(full_text):
-     chunks = split_text_into_chunks(full_text)
-     summaries = [summarize_chunk(chunk) for chunk in chunks]
-     full_summary = " ".join(summaries)
-     return full_summary
-
- # Testable main flow
- if __name__ == "__main__":
-     pdf_path = "C:/Users/HP/Downloads/study/cns/Unit 1.pdf"
-     raw_text = extract_text_from_pdf(pdf_path)
-     summary = summarize_text(raw_text)
-     print("Summary:\n", summary)
 
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import torch
+
+ tokenizer = T5Tokenizer.from_pretrained("t5-base")
+ model = T5ForConditionalGeneration.from_pretrained("t5-base")
+
+ def summarize_text(text, max_chunk_length=512):
+     text = text.replace("\n", " ")
+     chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
+
+     summarized_chunks = []
+
+     for chunk in chunks:
+         input_text = "summarize: " + chunk
+         inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
+         summary_ids = model.generate(inputs, max_length=150, min_length=40, num_beams=4, length_penalty=2.0, early_stopping=True)
+         output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         summarized_chunks.append(output)
+
+     return " ".join(summarized_chunks)
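
The rewritten summarize.py keeps only the chunk-and-summarize loop: it now splits on characters (512 per chunk) instead of words, tightens generation to max_length=150 / min_length=40, and drops the PDF-extraction helper and the __main__ block, so callers must supply plain text themselves. A minimal usage sketch under that assumption, reusing PyPDF2 as the removed code did (the import path and PDF path here are placeholders, not part of the commit):

    # Usage sketch only — not part of the commit. summarize_text now takes
    # plain text, so PDF extraction (removed above) moves to the caller.
    import PyPDF2

    from summarize import summarize_text  # assumes summarize.py is importable

    def extract_text_from_pdf(pdf_path):
        reader = PyPDF2.PdfReader(pdf_path)
        # extract_text() may return None for image-only pages, hence the guard
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()

    if __name__ == "__main__":
        raw_text = extract_text_from_pdf("Unit 1.pdf")  # placeholder path
        print("Summary:\n", summarize_text(raw_text))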