File size: 2,326 Bytes
c693e62
 
cdb128e
c693e62
 
 
 
cdb128e
 
 
c693e62
67d721c
ac303b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c693e62
 
ac303b5
 
 
 
 
c693e62
2a10acb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
from io import BytesIO
import re

# Summarization checkpoint; the same name supplies both the model and the tokenizer.
model_name = "pszemraj/led-large-book-summary"
# Built once at import time so the request handler can reuse the loaded pipeline.
summarizer = pipeline(
  task="summarization",
  model=model_name,
  tokenizer=model_name,
)

def _find_abstract(page_text):
  """Return the text between "Abstract" and "Introduction" on one page.

  Both headings are matched case-insensitively as whole words. When no
  "Introduction" follows, everything after "Abstract" is returned. Returns
  None when the page contains no "Abstract" heading at all.
  """
  abstract_match = re.search(r"\bAbstract\b", page_text, re.IGNORECASE)
  if abstract_match is None:
    return None
  after_heading = page_text[abstract_match.end():]
  introduction_match = re.search(r"\bIntroduction\b", after_heading, re.IGNORECASE)
  if introduction_match:
    return after_heading[:introduction_match.start()]
  return after_heading


def extract_abstract_and_summarize(pdf_file):
  """Extract a PDF's abstract, summarize it, and synthesize the summary as speech.

  Args:
    pdf_file: The uploaded PDF, either a filesystem path or an object whose
      ``name`` attribute holds the path (the upload component may hand over
      either form depending on the Gradio version).

  Returns:
    A 3-tuple ``(summary, audio_bytes, abstract_text)``: the summary string,
    the raw bytes gTTS wrote for the spoken summary, and the stripped
    abstract text taken from the first page that contains an "Abstract"
    heading (empty string when no page has one).

  Raises:
    Exception: Any failure is re-raised as a plain ``Exception`` carrying the
      original message, chained to the original error.
  """
  fallback_summary = "Summary not available."
  try:
    if pdf_file is None:
      raise ValueError("No PDF file was provided.")

    # Accept both a plain path and a temp-file wrapper exposing `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    abstract_text = ""
    with open(pdf_path, "rb") as file:
      for page in PdfReader(file).pages:
        # Guard against extractors that yield None for pages without text.
        found = _find_abstract(page.extract_text() or "")
        if found is not None:
          abstract_text = found
          break
    abstract_text = abstract_text.strip()

    summary = fallback_summary
    # Skip the model entirely when there is nothing to summarize.
    if abstract_text:
      result = summarizer(abstract_text)

      # Print the entire result for debugging
      print("Result:", result)

      # The summarization pipeline returns a list of dicts keyed by
      # 'summary_text'; fall back if the entry is missing or blank.
      if result and isinstance(result, list):
        summary = (result[0].get("summary_text") or "").strip() or fallback_summary

    # Generate audio (summary is never empty here, which gTTS requires).
    speech = gTTS(text=summary, lang="en")
    speech_bytes = BytesIO()
    speech.write_to_fp(speech_bytes)

    # Return individual output values
    return summary, speech_bytes.getvalue(), abstract_text

  except Exception as e:
    # Keep the original exception type seen by callers, but preserve the cause.
    raise Exception(str(e)) from e

# Wire the handler into a Gradio UI. The outputs list has one component per
# value the handler returns, in order: summary text, spoken audio, abstract.
interface = gr.Interface(
  fn=extract_abstract_and_summarize,
  inputs=[gr.File(label="Upload PDF")],
  outputs=[
    gr.Textbox(label="Summary"),
    gr.Audio(),
    gr.Textbox(label="Extracted Abstract"),
  ],
  title="PDF Summarization & Audio Tool",
  description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it using the 'pszemraj/led-large-book-summary' model, and generates an audio of it. Only upload PDFs with abstracts. Please read the README.MD for information about the app and sample PDFs."""
)

interface.launch()