File size: 2,303 Bytes
c693e62
 
cdb128e
c693e62
 
 
 
cdb128e
 
 
c693e62
67d721c
c693e62
d9986c3
67d721c
d9986c3
67d721c
 
 
d9986c3
67d721c
 
d9986c3
 
 
67d721c
d9986c3
 
2a10acb
 
cdb128e
 
797680d
2a10acb
67d721c
2a10acb
67d721c
 
2a10acb
67d721c
2a10acb
 
c693e62
 
 
 
67d721c
c693e62
 
67d721c
cdb128e
c693e62
2a10acb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re

# Summarization backend: the same LED-large checkpoint name is used for both
# the model weights and the tokenizer.
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)

def _find_abstract(pdf_path):
    """Return the stripped abstract from the first page mentioning "Abstract", or ""."""
    with open(pdf_path, "rb") as file:
        for page in PdfReader(file).pages:
            # Guard against a falsy result (e.g. a page with no extractable text).
            text = page.extract_text() or ""
            heading = re.search(r"\bAbstract\b", text, re.IGNORECASE)
            if heading is None:
                continue
            # The abstract runs from just after the heading up to the next
            # "Introduction" on the same page, or to the end of the page.
            body = text[heading.end():]
            closing = re.search(r"\bIntroduction\b", body, re.IGNORECASE)
            if closing is not None:
                body = body[:closing.start()]
            return body.strip()
    return ""


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it, and render the summary as speech.

    Only the first page containing the word "Abstract" is considered.

    Args:
        pdf_file: Path to the PDF, or an object exposing that path as ``.name``.
            NOTE(review): which of the two ``gr.File`` delivers depends on the
            installed Gradio version -- confirm; both are accepted here.

    Returns:
        A 3-tuple ``(summary, audio_bytes, abstract_text)``: the generated
        summary, the encoded audio gTTS wrote for it, and the stripped
        abstract that was summarized.

    Raises:
        gr.Error: If no file was given, no abstract could be found, or any
            step (PDF parsing, summarization, speech synthesis) fails. The
            original exception is chained as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("Please upload a PDF file.")
        pdf_path = getattr(pdf_file, "name", pdf_file)

        # The file is only held open while extracting text, not during the
        # (slow) model inference and speech synthesis below.
        abstract_text = _find_abstract(pdf_path)
        if not abstract_text:
            # Summarizing an empty string would only produce noise or an
            # opaque model error, so fail early with a clear message.
            raise ValueError("No abstract found in the uploaded PDF.")

        # Summarize the extracted abstract using the LED-large model
        result = summarizer(abstract_text, max_length=256, min_length=16, length_penalty=2.0)
        # The summarization pipeline stores its output under "summary_text".
        summary = result[0]["summary_text"]

        # Generate audio
        speech_bytes = BytesIO()
        gTTS(text=summary, lang="en").write_to_fp(speech_bytes)

        # Return individual output values
        return summary, speech_bytes.getvalue(), abstract_text

    except Exception as e:
        # gr.Error is an Exception subclass whose message Gradio shows to the
        # user; chaining keeps the original traceback for the server logs.
        raise gr.Error(str(e)) from e

# Web UI: one PDF upload in; summary text, spoken summary, and the extracted
# abstract out. The handler returns three values, so three output components
# are declared, in the same order as the returned tuple.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(label="Summary Audio"),
        gr.Textbox(label="Extracted Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model, and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs."""
)

interface.launch()