Spaces:
Sleeping
Sleeping
File size: 2,837 Bytes
c693e62 67d721c c693e62 6fbb403 c693e62 d9986c3 c693e62 d9986c3 c693e62 d9986c3 67d721c d9986c3 c693e62 d9986c3 67d721c d9986c3 67d721c d9986c3 67d721c d9986c3 67d721c d9986c3 67d721c c693e62 d9986c3 c693e62 67d721c c693e62 67d721c d9986c3 c693e62 d9986c3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re
import os
# Checkpoint used for summarization. The tokenizer and model are created once
# at import time so that every request handled below reuses the same objects.
model_name = "pszemraj/led-base-book-summary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Sentence boundary: whitespace that follows ".", "?" or "!", unless the text
# just before it looks like a dotted abbreviation ("e.g.") or a short title
# ("Dr.", "Mr."). Compiled once because the pattern never changes.
_SENTENCE_BOUNDARY = re.compile(
    r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s+"
)


def extract_first_sentence(text):
    """
    Extracts the first sentence from a given text.

    A sentence ends at the first run of whitespace that follows ".", "?" or
    "!", except after abbreviation-like tokens such as "e.g." or "Dr.".

    Args:
        text: The string to take the first sentence from.

    Returns:
        The text up to and including the first sentence terminator, or the
        whole input unchanged when no sentence boundary is found (this
        includes the empty string).
    """
    # Only the first piece is needed, so stop after one split. re.split always
    # returns at least one element, so indexing [0] is safe without a fallback.
    return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]
def extract_abstract_and_summarize(pdf_file):
    """
    Extracts the abstract and summarizes it in one sentence with information till "Introduction".

    The abstract is the text that follows the first "Abstract" (matched as a
    whole word, case-insensitively) on the first page containing one, up to
    the next "Introduction" on that same page, or to the end of that page
    when "Introduction" does not appear after it.

    Args:
        pdf_file: Path of the PDF to read; it is passed directly to ``open``.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)``: the first
        sentence of the generated summary, the bytes gTTS produced for that
        sentence, and the extracted abstract with surrounding whitespace
        stripped.

    Raises:
        Exception: If the file cannot be read, no abstract is found, or
            summarization / speech generation fails. The message is that of
            the underlying error, which is attached as ``__cause__``.
    """
    try:
        abstract_text = ""
        with open(pdf_file, "rb") as file:
            for page in PdfReader(file).pages:
                # Guard against a page that yields no text at all.
                text = page.extract_text() or ""
                abstract_match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
                if not abstract_match:
                    continue
                start_index = abstract_match.end()
                introduction_match = re.search(
                    r"\bIntroduction\b", text[start_index:], re.IGNORECASE
                )
                # The second search ran on a slice, so its offset is relative
                # to start_index; None means "slice to the end of the page".
                end_index = (
                    start_index + introduction_match.start()
                    if introduction_match
                    else None
                )
                abstract_text = text[start_index:end_index]
                break  # Exit loop once abstract is found

        # Fail with a clear message instead of summarizing an empty string.
        if not abstract_text.strip():
            raise ValueError("No abstract found in the PDF.")

        # Summarize the extracted abstract
        inputs = tokenizer(abstract_text, return_tensors="pt")
        outputs = model.generate(**inputs)
        # Drop the tokenizer's special marker tokens so they are neither shown
        # in the summary nor read aloud.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the first sentence
        summary_sentence = extract_first_sentence(summary)

        # Generate audio
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        # Return individual output values
        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()
    except Exception as e:
        # Keep the original contract (a plain Exception carrying the message)
        # while preserving the underlying traceback for debugging.
        raise Exception(str(e)) from e
# Gradio UI. extract_abstract_and_summarize returns three values
# (summary sentence, audio bytes, abstract text), so one output component is
# declared per returned value, in the same order.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence with information till "Introduction", and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
)
interface.launch(share=True)
|