# Spaces: Sleeping
# (Page-status text captured together with the source; not part of the program.)
import os
import re
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Summarization checkpoint identifier handed to from_pretrained(). The model
# and its tokenizer are loaded once, at import time, as module-level globals.
model_name = "pszemraj/led-base-book-summary"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Whitespace that follows a sentence terminator (".", "?" or "!"), except when
# the terminator closes an abbreviation such as "e.g." / "U.S." (first
# lookbehind) or a capitalised two-letter title such as "Dr." (second one).
_SENTENCE_BOUNDARY = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+")


def extract_first_sentence(text):
    """
    Extracts the first sentence from a given text.

    The text is cut at the first run of whitespace that follows ".", "?" or
    "!" and is not part of a recognised abbreviation. When no such boundary
    exists (including for an empty string) the text is returned unchanged.

    Args:
        text: The string to take the first sentence from.

    Returns:
        The first sentence, including its terminating punctuation.
    """
    # split() always yields at least one element, so indexing [0] is safe;
    # maxsplit=1 avoids splitting the rest of the text for no reason.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
# Case-insensitive whole-word section headings used to delimit the abstract.
_ABSTRACT_HEADING = re.compile(r"\bAbstract\b", re.IGNORECASE)
_INTRODUCTION_HEADING = re.compile(r"\bIntroduction\b", re.IGNORECASE)


def _locate_abstract(page_texts):
    """Return the raw abstract text from the first page that mentions "Abstract", or ""."""
    for text in page_texts:
        if not text:
            # Text extraction can yield nothing for a page (e.g. image-only).
            continue
        heading = _ABSTRACT_HEADING.search(text)
        if heading is None:
            continue
        start_index = heading.end()
        # Search a slice (not search(text, pos)) so the word-boundary check at
        # the very start of the remainder behaves as it always has.
        closing = _INTRODUCTION_HEADING.search(text[start_index:])
        end_index = start_index + closing.start() if closing else None
        # Only the first page that mentions "Abstract" is considered.
        return text[start_index:end_index]
    return ""


def extract_abstract_and_summarize(pdf_file):
    """
    Extracts the abstract and summarizes it in one sentence with information till "Introduction".

    The abstract is the text between the first "Abstract" match and the next
    "Introduction" match on the same page (or the end of that page when
    "Introduction" does not appear there).

    Args:
        pdf_file: A filesystem path to the PDF, or an uploaded-file object
            exposing the path through its ``name`` attribute.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)``: the first
        sentence of the generated summary, the encoded speech for that
        sentence, and the stripped abstract text.

    Raises:
        Exception: On any failure (no file, unreadable PDF, no abstract,
            model or text-to-speech error). The message is that of the
            underlying error, which is attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        # Accept a plain path as well as a temp-file wrapper carrying .name.
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            # The generator is fully consumed while the file is still open.
            abstract_text = _locate_abstract(
                page.extract_text() for page in pdf_reader.pages
            )

        if not abstract_text.strip():
            # Fail early with a clear message instead of summarizing nothing.
            raise ValueError("No abstract section was found in the PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # skip_special_tokens keeps markers such as <s> / </s> out of the
        # text that is displayed and read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        # NOTE(review): the audio is returned as raw encoded bytes; confirm the
        # installed Gradio version's Audio component accepts bytes.
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Callers keep seeing a plain Exception carrying the original message;
        # the explicit chain preserves the underlying error for debugging.
        raise Exception(str(e)) from e
# Gradio UI: a single PDF upload as input. The outputs list carries one
# component per value returned by extract_abstract_and_summarize, in order:
# summary sentence, spoken summary, and the extracted abstract text.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence with information till "Introduction", and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
)
# Start the app when the module is executed (share=True requests a public link).
interface.launch(share=True)