varl42 committed on
Commit
9447b9b
·
1 Parent(s): dddbd88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -7
app.py CHANGED
@@ -6,28 +6,38 @@ import numpy
6
  import scipy
7
  from gtts import gTTS
8
  from io import BytesIO
9
- from transformers import BartTokenizer
10
 
11
  def extract_text(pdf_file):
12
  pdfReader = PyPDF2.PdfReader(pdf_file)
13
  pageObj = pdfReader.pages[0]
14
  return pageObj.extract_text()
15
 
16
-
17
  def summarize_text(text):
18
  sentences = text.split(". ")
 
 
19
  for i, sentence in enumerate(sentences):
20
  if "Abstract" in sentence:
21
  start = i + 1
22
  end = start + 3
23
  break
24
- abstract = ". ".join(sentences[start:end+1])
 
 
25
 
 
26
  tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
27
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer=tokenizer)
28
- summary = summarizer(abstract, max_length=30, min_length=30,
29
- do_sample=False)
30
- return summary[0]['summary_text']
 
 
 
 
 
 
31
 
32
  def text_to_audio(text):
33
  tts = gTTS(text, lang='en')
 
6
  import scipy
7
  from gtts import gTTS
8
  from io import BytesIO
9
+ from transformers import BartTokenizer, BartForConditionalGeneration
10
 
11
  def extract_text(pdf_file):
12
  pdfReader = PyPDF2.PdfReader(pdf_file)
13
  pageObj = pdfReader.pages[0]
14
  return pageObj.extract_text()
15
 
 
16
  def summarize_text(text):
17
  sentences = text.split(". ")
18
+
19
+ # Find abstract section
20
  for i, sentence in enumerate(sentences):
21
  if "Abstract" in sentence:
22
  start = i + 1
23
  end = start + 3
24
  break
25
+
26
+ # Extract abstract
27
+ abstract = ". ".join(sentences[start:end+1])
28
 
29
+ # Load BART model & tokenizer
30
  tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
31
+ model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
32
+
33
+ # Tokenize abstract
34
+ inputs = tokenizer(abstract, return_tensors="pt", truncation=True)
35
+
36
+ # Generate summary
37
+ summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=50, min_length=50, early_stopping=True)
38
+ summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
39
+
40
+ return summary
41
 
42
  def text_to_audio(text):
43
  tts = gTTS(text, lang='en')