User committed on
Commit
ef4e0b3
·
1 Parent(s): d38a75c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
8
+ """
9
+
10
+ # Import necessary libraries
11
+ import gradio as gr
12
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
13
+ from gtts import gTTS
14
+ from io import BytesIO
15
+ import PyPDF2
16
+
17
+ # Function to extract abstract from PDF
18
def extract_abstract(pdf_path):
    """Return the text of the PDF page(s) that hold the abstract.

    The abstract is located at page granularity: the first page whose text
    contains "Abstract" opens the span, and the first *later* page whose text
    contains one of the section markers closes it (that page is excluded).
    When no later page carries a marker -- for example a single-page PDF or
    an abstract on the last page -- only the abstract's own page is returned.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of the selected pages, or ``None`` when no page
        mentions "Abstract".
    """
    section_markers = ("Introduction", "Background", "1.", "I.")

    page_texts = []  # text of every page visited so far, extracted only once
    abstract_start = None
    abstract_end = None

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            # Guard against extract_text() yielding None for a page without
            # a text layer, so the substring tests below cannot raise.
            page_text = page.extract_text() or ""
            page_texts.append(page_text)

            if abstract_start is None:
                if "Abstract" in page_text:
                    abstract_start = page_num
            elif any(marker in page_text for marker in section_markers):
                # First page after the abstract that looks like a new section.
                abstract_end = page_num
                break

    if abstract_start is None:
        return None

    if abstract_end is None:
        # No closing section page was found; fall back to the abstract's page
        # instead of discarding an abstract that was actually located.
        abstract_end = abstract_start + 1

    return ''.join(page_texts[abstract_start:abstract_end])
41
+
42
+ # Function to summarize abstract using a pre-trained model
43
_SUMMARIZER_CHECKPOINT = "pszemraj/led-base-book-summary"
_summarizer_cache = {}


def _load_summarizer():
    """Return the (tokenizer, model) pair, loading it on first use only."""
    if "pair" not in _summarizer_cache:
        _summarizer_cache["pair"] = (
            AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT),
            AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT),
        )
    return _summarizer_cache["pair"]


def summarize_abstract(text):
    """Summarize *text* and return the first sentence of the summary.

    The input is truncated to 1000 tokens and a summary of 20 to 40 tokens is
    generated. Sampling is enabled (``do_sample=True``), so the result may
    differ between calls for the same input.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary cut at its first period (period included), or
        the full generated text when it contains no period.
    """
    # Reuse the tokenizer/model across calls instead of reloading them for
    # every request.
    tokenizer, model = _load_summarizer()

    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=40,
        min_length=20,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=2.0,
        num_beams=3,
        do_sample=True,
        early_stopping=False,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first sentence so the output does not end mid-sentence
    # when generation stops at max_length.
    first_sentence, period, _ = summary.partition('.')
    if period:
        summary = first_sentence + '.'

    return summary
65
+
66
+ # Function to convert text to speech
67
def convert_to_speech(text):
    """Synthesize English speech for *text* and return the audio as bytes.

    Args:
        text: The text to read aloud.

    Returns:
        The bytes that gTTS wrote for the spoken text.
    """
    audio_stream = BytesIO()
    gTTS(text, lang='en').write_to_fp(audio_stream)
    # getvalue() returns the whole buffer regardless of the cursor position,
    # so no rewind is needed before reading it back.
    return audio_stream.getvalue()
73
+
74
+ # Function to process PDF and generate summary
75
def process_pdf(pdf_path):
    """Summarize the abstract of an uploaded PDF as text and as speech.

    Args:
        pdf_path: The value delivered by the file input. This may be a path
            string or a temp-file wrapper exposing ``.name``, depending on
            the Gradio version -- both are accepted.

    Returns:
        A ``(summary_text, audio)`` pair. When the PDF is missing, no
        abstract is found, or no summary is produced, the first element is
        an explanatory message and the second is ``None``, so the two output
        components always receive a value.
    """
    # Normalise a file-wrapper object to its path; plain strings pass through.
    path = getattr(pdf_path, "name", pdf_path)
    if path is None:
        return "No PDF was uploaded.", None

    abstract_text = extract_abstract(path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Cap the amount of text handed to the summarizer.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)
84
+
85
+ # Define Gradio interface
86
# Components: one PDF upload in, a text box and an audio player out.
inputs = gr.File(label="Upload a PDF with an abstract")
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Wire process_pdf to the components; its two return values map, in order,
# onto the text and audio outputs.
iface = gr.Interface(
    process_pdf,
    inputs,
    [summary_text, audio_summary],
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
)

# Start serving the interface.
iface.launch()