# Page-scrape residue (not part of the program): "Spaces: Sleeping / Sleeping"
import gradio as gr | |
from PyPDF2 import PdfReader | |
from transformers import pipeline | |
from gtts import gTTS | |
from io import BytesIO | |
import re | |
import os | |
# Build the summarization pipeline once at import time so every request
# reuses the same loaded pipeline object instead of re-creating it per call.
summarizer = pipeline(task="summarization")
def _slice_abstract(page_text):
    """Return the text between "Abstract" and "Introduction" on one page, or None.

    Both markers are matched as whole words, case-insensitively. When no
    "Introduction" follows the "Abstract" marker, everything up to the end of
    the page text is returned. None means the page has no "Abstract" marker.
    """
    abstract_match = re.search(r"\bAbstract\b", page_text, re.IGNORECASE)
    if abstract_match is None:
        return None
    start_index = abstract_match.end()
    introduction_match = re.search(
        r"\bIntroduction\b", page_text[start_index:], re.IGNORECASE
    )
    if introduction_match is None:
        return page_text[start_index:]
    return page_text[start_index:start_index + introduction_match.start()]


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it, and render the summary as speech.

    Pages are scanned in order and only the first page containing the word
    "Abstract" is used; the abstract is whatever follows that word up to the
    next "Introduction" on the same page.

    Args:
        pdf_file: Path of the PDF to read, or an uploaded-file object that
            exposes the path as ``.name`` (assumed to be what ``gr.File``
            passes on some Gradio versions -- confirm against the installed
            version).

    Returns:
        A ``(summary, audio_bytes, abstract)`` tuple: the summary string, the
        gTTS audio of that summary as raw bytes, and the extracted abstract
        with surrounding whitespace stripped.

    Raises:
        Exception: If no file was given, no abstract could be found, or any
            extraction/summarization/speech step fails. The message is the
            underlying error's message and the original error is chained as
            ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        # Real paths are used as-is (Path.name would be just the basename);
        # anything else is treated as a wrapper carrying its path in `.name`.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ""
        with open(pdf_path, "rb") as file:
            for page in PdfReader(file).pages:
                # extract_text() may give None/"" (e.g. image-only pages);
                # normalise so the regex search never sees a non-string.
                found = _slice_abstract(page.extract_text() or "")
                if found is not None:
                    abstract_text = found
                    break

        # Fail early with a readable message instead of letting the model or
        # the speech engine choke on empty input.
        if not abstract_text.strip():
            raise ValueError("No abstract could be found in the PDF.")

        # Summarize the extracted abstract
        result = summarizer(
            abstract_text,
            min_length=16,
            max_length=256,
            no_repeat_ngram_size=3,
            encoder_no_repeat_ngram_size=3,
            repetition_penalty=3.5,
            num_beams=4,
            early_stopping=True,
        )
        # The summarization pipeline returns [{"summary_text": ...}]; the
        # previous 'summary' key does not exist and raised KeyError.
        summary = result[0]["summary_text"]

        # Generate audio
        speech = gTTS(text=summary, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Keep the plain-Exception contract but preserve the original traceback.
        raise Exception(str(e)) from e
# Web UI: one PDF upload in; three outputs out. The output components are
# listed in the same order as the values returned by
# extract_abstract_and_summarize (summary, audio bytes, stripped abstract), so
# the extracted abstract is shown instead of being an unmatched extra value.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'summarizer' model, and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
)
# Launch the app when the script is loaded (kept unguarded, as before).
interface.launch()