Spaces:
Sleeping
Sleeping
import gradio as gr | |
from PyPDF2 import PdfReader | |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
from gtts import gTTS | |
from io import BytesIO | |
import re | |
import os | |
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "pszemraj/led-base-book-summary"

# The tokenizer and the seq2seq model are created once at import time; the
# request handler further down reads them as module-level globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Sentence boundary: a whitespace character that directly follows '.', '?'
# or '!', unless the preceding characters look like a dotted abbreviation
# ("e.g.", "i.e.") or a short capitalised title ("Mr.", "Dr.").
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def extract_first_sentence(text):
    """Return the first sentence of *text*.

    The sentence keeps its terminating punctuation; the whitespace that
    separates it from the next sentence is dropped. When *text* contains no
    sentence boundary (including the empty string) it is returned unchanged.

    Args:
        text: The string to take the first sentence from.

    Returns:
        The leading sentence of *text* as a string.
    """
    # Only the first boundary matters, so stop splitting after it instead of
    # breaking the whole text into sentences. split() always yields at least
    # one element, so indexing [0] is safe even for an empty string.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
# "Abstract" is matched case-insensitively; the headings that end the abstract
# are matched case-sensitively, exactly as spelled here.
_ABSTRACT_HEADING = re.compile(r'\bAbstract\b', re.IGNORECASE)
_NEXT_SECTION_HEADING = re.compile(r'\b(?:Introduction|Methodology|Conclusion)\b')


def _find_abstract(pages):
    """Return the text following the first "Abstract" heading, or '' if none."""
    for page in pages:
        # Guard against pages for which no text could be extracted.
        text = page.extract_text() or ''
        heading = _ABSTRACT_HEADING.search(text)
        if heading is None:
            continue
        # Everything after the heading, cut at the next known section heading
        # on the same page (or at the end of the page when there is none).
        body = text[heading.end():]
        next_section = _NEXT_SECTION_HEADING.search(body)
        return body[:next_section.start()] if next_section else body
    return ''


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    Args:
        pdf_file: Path of the PDF to read; it is passed straight to `open`.

    Returns:
        A 3-tuple ``(summary_sentence, audio_bytes, abstract_text)``: the
        first sentence of the generated summary, the audio bytes gTTS wrote
        for that sentence, and the stripped abstract text.

    Raises:
        Exception: For any failure (unreadable file, no abstract found,
            model or text-to-speech error). The message is the message of
            the underlying error, which is attached as the cause.
    """
    try:
        # Keep the file open only while pages are being read; the model and
        # text-to-speech steps below do not need it.
        with open(pdf_file, 'rb') as file:
            abstract_text = _find_abstract(PdfReader(file).pages)

        # Fail early with a clear message instead of feeding empty text to
        # the model and the speech synthesizer.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the uploaded PDF.")

        # Summarize the extracted abstract.
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop special marker tokens so they are neither shown nor spoken.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Keep only the first sentence of the summary.
        summary_sentence = extract_first_sentence(summary.strip())

        # Generate the audio into an in-memory buffer.
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Same exception type and message as before, with the original error
        # chained explicitly so its traceback is preserved.
        raise Exception(str(e)) from e
# Web UI: one PDF upload in; summary text, spoken summary and the extracted
# abstract out. The handler returns three values, so three output components
# are declared, in the same order as the returned tuple.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
Please read the README.MD for information about the app and sample PDFs.""",
)
interface.launch(share=True)