Spaces:

user2434
/

SummarizedAbstract

Sleeping

App Files Files

User committed on Dec 8, 2023

Commit

ef4e0b3

1 Parent(s): d38a75c

Upload app.py

Browse files

Files changed (1) hide show

app.py +100 -0

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# -*- coding: utf-8 -*-
+"""app.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/1n1zTe_HIqsQ1JvPcV2S3i8-kjq5V4xJo
+"""
+# Import necessary libraries
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from gtts import gTTS
+from io import BytesIO
+import PyPDF2
+# Function to extract abstract from PDF
+def extract_abstract(pdf_path):
+    with open(pdf_path, 'rb') as file:
+        reader = PyPDF2.PdfReader(file)
+        abstract_start, abstract_end = None, None
+        for page_num, page in enumerate(reader.pages):
+            page_text = page.extract_text()
+            if "Abstract" in page_text:
+                abstract_start = page_num
+                break
+        if abstract_start is not None:
+            for page_num, page in enumerate(reader.pages[abstract_start + 1:]):
+                page_text = page.extract_text()
+                if any(title_word in page_text for title_word in ["Introduction", "Background", "1.", "I."]):
+                    abstract_end = abstract_start + page_num + 1
+                    break
+        if abstract_start is not None and abstract_end is not None:
+            abstract_text = ''.join(page.extract_text() for page in reader.pages[abstract_start:abstract_end])
+            return abstract_text
+        else:
+            return None
+# Function to summarize abstract using a pre-trained model
+def summarize_abstract(text):
+    tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
+    model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")
+    inputs = tokenizer(text, max_length=1000, return_tensors="pt", truncation=True)
+    summary_ids = model.generate(
+        inputs['input_ids'],
+        max_length=40,
+        min_length=20,
+        no_repeat_ngram_size=3,
+        encoder_no_repeat_ngram_size=3,
+        repetition_penalty=2.0,
+        num_beams=3,
+        do_sample=True,
+        early_stopping=False
+    )
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    sentences = summary.split('.')
+    if len(sentences) > 1:
+        summary = sentences[0] + '.'
+    return summary
+# Function to convert text to speech
+def convert_to_speech(text):
+    tts = gTTS(text, lang='en')
+    buffer = BytesIO()
+    tts.write_to_fp(buffer)
+    buffer.seek(0)
+    return buffer.read()
+# Function to process PDF and generate summary
+def process_pdf(pdf_path):
+    abstract_text = extract_abstract(pdf_path)
+    if abstract_text:
+        abstract_text = abstract_text[:1024]
+        summary = summarize_abstract(abstract_text)
+        if summary:
+            return summary, convert_to_speech(summary)
+# Define Gradio interface
+inputs = gr.File(label="Upload a PDF with an abstract")  # Add a label to the file input
+summary_text = gr.Text(label="Written summary of the abstract")
+audio_summary = gr.Audio(label="Audio summary of abstract")
+# Launch the Gradio interface with an example PDF
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=inputs,
+    outputs=[summary_text, audio_summary],
+    title="Summarized Abstract",
+    description="The app will summarize the abstract of a PDF and read it to the user.",
+    )
+# Launch the Gradio interface
+iface.launch()