Spaces:
Sleeping
Sleeping
import gradio as gr | |
from PyPDF2 import PdfReader | |
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer | |
from gtts import gTTS | |
from io import BytesIO | |
import re | |
# Build the summarization pipeline once at import time so every request
# reuses it. The same Hugging Face identifier selects both the model and
# its tokenizer.
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)
def _slice_abstract(page_text):
    """Return the text between the "Abstract" and "Introduction" headings.

    Both headings are matched case-insensitively as whole words. When no
    "Introduction" follows, everything after "Abstract" is returned.
    Returns ``None`` if the page contains no "Abstract" heading.
    """
    heading = re.search(r"\bAbstract\b", page_text, re.IGNORECASE)
    if heading is None:
        return None
    remainder = page_text[heading.end():]
    intro = re.search(r"\bIntroduction\b", remainder, re.IGNORECASE)
    return remainder[:intro.start()] if intro else remainder


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it, and synthesize speech for it.

    The first page that yields non-empty text after an "Abstract" heading
    supplies the abstract. That text is passed to the module-level
    ``summarizer`` pipeline, and the summary is spoken with gTTS.

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path as ``.name``.

    Returns:
        A 3-tuple ``(summary, audio_bytes, abstract_text)`` where
        ``audio_bytes`` is the raw audio written by gTTS and
        ``abstract_text`` is the stripped abstract.

    Raises:
        Exception: For any failure (missing file, no abstract, model or
            text-to-speech error). The message is the underlying error's
            message and the original exception is chained as the cause.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # Path-like values go straight to open(). Anything else is assumed to
        # be an upload wrapper that carries its path in `.name` (older Gradio
        # File components behave this way -- confirm against the installed
        # version).
        if isinstance(pdf_file, (str, bytes)) or hasattr(pdf_file, "__fspath__"):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ""
        # Pages are read while the file is still open.
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            for page in pdf_reader.pages:
                # Guard against pages for which no text could be extracted.
                page_text = page.extract_text() or ""
                candidate = _slice_abstract(page_text)
                if candidate and candidate.strip():
                    abstract_text = candidate.strip()
                    break

        # Fail early with a clear message instead of feeding an empty string
        # to the summarizer and the speech synthesizer.
        if not abstract_text:
            raise ValueError("No abstract could be found in the PDF.")

        # Summarize the extracted abstract using the LED-large model
        result = summarizer(abstract_text)
        # Print the entire result for debugging
        print("Result:", result)

        # The summarization pipeline returns a list of dicts whose text is
        # stored under the "summary_text" key.
        summary = "Summary not available."
        if result and isinstance(result, list):
            summary = result[0].get("summary_text") or summary

        # Generate audio into an in-memory buffer
        speech = gTTS(text=summary, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary, speech_bytes.getvalue(), abstract_text
    except Exception as e:
        # Keep the generic Exception type callers already see, but chain the
        # original error so its traceback is not lost.
        raise Exception(str(e)) from e
# Gradio UI: a single PDF upload as input. The outputs list needs one
# component per value returned by extract_abstract_and_summarize, which
# returns (summary, audio bytes, abstract text) -- hence three components.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model, and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
)
# Start the web server for the app.
interface.launch()