File size: 3,050 Bytes
c693e62
 
cdb128e
c693e62
 
 
 
cdb128e
 
 
c693e62
67d721c
6416b3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9d4bd7
 
6416b3a
6900efd
6416b3a
e9d4bd7
6900efd
 
6416b3a
6900efd
6416b3a
 
6900efd
6416b3a
 
 
 
6900efd
6416b3a
 
 
c693e62
 
6416b3a
 
 
 
1481746
151b692
c693e62
2a10acb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import re
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Build the summarization pipeline once at import time so every request
# reuses the same loaded checkpoint (model and tokenizer share one name).
model_name = "pszemraj/led-large-book-summary"
summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=model_name,
)

# Section markers used to delimit the abstract; compiled once at import time.
_ABSTRACT_RE = re.compile(r"\bAbstract\b", re.IGNORECASE)
_INTRODUCTION_RE = re.compile(r"\bIntroduction\b", re.IGNORECASE)
# A sentence ends at '.', '!' or '?' followed by whitespace, so decimals such
# as "3.5" do not cut the sentence short.
_SENTENCE_END_RE = re.compile(r"(?<=[.!?])\s+")


def _find_abstract(pages):
    """Return the abstract text from the first page mentioning "Abstract", or ""."""
    for page in pages:
        # Treat a page with no extractable text as empty instead of crashing.
        text = page.extract_text() or ""
        abstract_match = _ABSTRACT_RE.search(text)
        if abstract_match is None:
            continue
        start_index = abstract_match.end()
        introduction_match = _INTRODUCTION_RE.search(text, start_index)
        # Without an "Introduction" on this page the abstract runs to page end.
        end_index = introduction_match.start() if introduction_match else None
        return text[start_index:end_index].strip()
    return ""


def _first_sentence(summary):
    """Return the first sentence of *summary*, or a placeholder if it is empty."""
    text = summary.strip()
    if not text:
        return "Summary not available."
    sentence = _SENTENCE_END_RE.split(text, maxsplit=1)[0]
    if sentence.endswith((".", "!", "?")):
        return sentence
    return sentence + "."


def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it, and synthesize speech for it.

    The abstract is taken from the first page whose text contains the word
    "Abstract" (case-insensitive): everything after that word up to the next
    "Introduction" on the same page, or to the end of the page otherwise.

    Args:
        pdf_file: Path to the PDF file, or an object exposing that path as
            ``.name`` (presumably what some Gradio versions pass for a
            ``File`` input; verify against the installed version).

    Returns:
        A 3-tuple ``(first_sentence, audio_bytes, abstract_text)``: the first
        sentence of the model summary, the bytes gTTS wrote for that
        sentence, and the stripped abstract text.

    Raises:
        Exception: If no file is given, no abstract is found, or any step
            (PDF parsing, summarization, speech synthesis) fails. The
            original error is attached as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("PDF file is not provided.")

        # Accept both a plain path and a file-like wrapper carrying the path.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        # Page text must be extracted while the file is open; the slow model
        # and TTS calls below do not need the handle.
        with open(pdf_path, "rb") as file:
            abstract_text = _find_abstract(PdfReader(file).pages)

        if not abstract_text:
            raise ValueError("No abstract found in the PDF.")

        # Summarize the extracted abstract with a fixed maximum summary length.
        result = summarizer(abstract_text, max_length=81)
        summary = ""
        if result and isinstance(result, list):
            summary = result[0].get("summary_text", "")
        first_sentence = _first_sentence(summary)

        # Generate audio for the first sentence into an in-memory buffer.
        speech = gTTS(text=first_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return first_sentence, speech_bytes.getvalue(), abstract_text

    except Exception as e:
        # Keep raising a plain Exception for existing callers, but chain the
        # original error so its type and traceback are not lost.
        raise Exception(str(e)) from e

# Gradio UI: one PDF upload in; summary text, spoken summary and the extracted
# abstract out. The callback returns three values, so three output components
# are declared (one per returned value, in order).
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model, and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs.""",
    # Sample PDFs are resolved relative to this file's directory.
    examples=[
        [os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
        [os.path.join(os.path.dirname(__file__), "Article 4 Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence.pdf")],
    ],
    cache_examples=True,
)

interface.launch()