Spaces:
Sleeping
Sleeping
User
committed on
Commit
·
ef4e0b3
1
Parent(s):
d38a75c
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""app.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
|
8 |
+
"""
|
9 |
+
|
10 |
+
# Import necessary libraries
|
11 |
+
import gradio as gr
|
12 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
13 |
+
from gtts import gTTS
|
14 |
+
from io import BytesIO
|
15 |
+
import PyPDF2
|
16 |
+
|
17 |
+
# Function to extract abstract from PDF
|
18 |
+
def extract_abstract(pdf_path):
    """Return the text of the PDF pages that contain the abstract.

    The abstract is taken to start on the first page whose extracted text
    contains the word "Abstract" and to run up to, but not including, the
    first later page that contains one of the section markers below. Whole
    pages are returned; the text is not trimmed to the abstract paragraph.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of the selected pages, or None when no page
        contains "Abstract".
    """
    section_markers = ("Introduction", "Background", "1.", "I.")

    # None until the abstract page is found; afterwards the texts of the
    # pages collected so far, starting with the abstract page itself.
    collected = None

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Single lazy pass: each page is extracted at most once, and pages
        # after the closing marker are never extracted.
        for page in reader.pages:
            page_text = page.extract_text() or ""

            if collected is None:
                # The abstract page is not itself tested for section
                # markers; only the pages that follow it are.
                if "Abstract" in page_text:
                    collected = [page_text]
            elif any(marker in page_text for marker in section_markers):
                break
            else:
                collected.append(page_text)

    if collected is None:
        return None

    # If no later page held a marker (e.g. a single-page PDF), the loop ran
    # to the end and everything from the abstract page onward is returned
    # instead of discarding an abstract that was actually found.
    return ''.join(collected)
|
41 |
+
|
42 |
+
# Function to summarize abstract using a pre-trained model
|
43 |
+
def summarize_abstract(text):
|
44 |
+
tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
|
45 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")
|
46 |
+
inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
|
47 |
+
summary_ids = model.generate(
|
48 |
+
inputs['input_ids'],
|
49 |
+
max_length=40,
|
50 |
+
min_length=20,
|
51 |
+
no_repeat_ngram_size=3,
|
52 |
+
encoder_no_repeat_ngram_size=3,
|
53 |
+
repetition_penalty=2.0,
|
54 |
+
num_beams=3,
|
55 |
+
do_sample=True,
|
56 |
+
early_stopping=False
|
57 |
+
)
|
58 |
+
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
59 |
+
|
60 |
+
sentences = summary.split('.')
|
61 |
+
if len(sentences) > 1:
|
62 |
+
summary = sentences[0] + '.'
|
63 |
+
|
64 |
+
return summary
|
65 |
+
|
66 |
+
# Function to convert text to speech
|
67 |
+
def convert_to_speech(text):
    """Speak *text* in English with gTTS and return the audio as bytes."""
    audio_stream = BytesIO()
    gTTS(text, lang='en').write_to_fp(audio_stream)
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so no rewind is needed before reading it back.
    return audio_stream.getvalue()
|
73 |
+
|
74 |
+
# Function to process PDF and generate summary
|
75 |
+
def process_pdf(pdf_path):
    """Summarize the abstract of an uploaded PDF as text and as speech.

    Args:
        pdf_path: The uploaded PDF. Either a filesystem path or an upload
            object whose ``name`` attribute is the path; None when nothing
            was uploaded.

    Returns:
        A ``(summary_text, audio)`` pair. When no summary can be produced,
        ``summary_text`` explains why and ``audio`` is None, so both
        interface outputs always receive a value.
    """
    import os

    if pdf_path is None:
        return "Please upload a PDF file.", None

    # NOTE(review): depending on the Gradio version and the File component's
    # `type` setting, the upload arrives as a path or as a temp-file wrapper
    # with the path in `.name` -- confirm against the deployed version.
    # Real paths are left alone (pathlib paths also have a `.name`, which is
    # only the final path component).
    if not isinstance(pdf_path, (str, bytes, os.PathLike)):
        pdf_path = getattr(pdf_path, "name", pdf_path)

    abstract_text = extract_abstract(pdf_path)
    if not abstract_text:
        return "No abstract could be found in the uploaded PDF.", None

    # Only the first 1024 characters are passed to the summarizer.
    summary = summarize_abstract(abstract_text[:1024])
    if not summary:
        return "The abstract could not be summarized.", None

    return summary, convert_to_speech(summary)
|
84 |
+
|
85 |
+
# Define Gradio interface
|
86 |
+
# Components: one PDF upload as input; a text box and an audio player as
# outputs.
inputs = gr.File(label="Upload a PDF with an abstract")
summary_text = gr.Text(label="Written summary of the abstract")
audio_summary = gr.Audio(label="Audio summary of abstract")

# Bind process_pdf to the components declared above.
iface = gr.Interface(
    process_pdf,
    inputs,
    [summary_text, audio_summary],
    title="Summarized Abstract",
    description="The app will summarize the abstract of a PDF and read it to the user.",
)

# Start serving the interface.
iface.launch()
|