mrsk1883 committed on
Commit
f8b4423
·
1 Parent(s): 7f5b90e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -59
app.py CHANGED
@@ -1,72 +1,50 @@
1
  import gradio as gr
2
- from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
  from io import BytesIO
6
 
7
- # Check if running in IPython environment
8
- try:
9
- from IPython.display import Audio
10
- ipython_available = True
11
- except ImportError:
12
- ipython_available = False
13
 
14
- # Define model and tokenizer
15
- model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
16
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
 
19
  def summarize_pdf_abstract(pdf_bytes):
20
- """
21
- Reads a PDF file, extracts the abstract, and summarizes it in one sentence.
22
 
23
- Args:
24
- pdf_bytes: The raw bytes of the uploaded PDF file.
25
-
26
- Returns:
27
- A dictionary containing the one-sentence summary of the abstract and the generated audio.
28
- """
29
- try:
30
- reader = PdfReader.from_buffer(pdf_bytes)
31
- abstract_text = ""
32
-
33
- for page in reader.pages:
34
- if "Abstract" in page.extract_text() or "Introduction" in page.extract_text():
35
- abstract_text = page.extract_text()
36
- break
37
-
38
- # Generate summary using the model
39
- inputs = tokenizer(abstract_text, return_tensors="pt")
40
- outputs = model.generate(**inputs)
41
- summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
42
-
43
- # Generate audio if IPython is available
44
- if ipython_available:
45
- speech = gTTS(summary, lang="en")
46
- speech_bytes = speech.get_wav_data()
47
- else:
48
- speech_bytes = None
49
-
50
- return {"summary": summary, "audio": speech_bytes}
51
-
52
- except Exception as e:
53
- raise Exception(str(e))
54
-
55
- # Modify the Gradio interface based on IPython availability
56
  if ipython_available:
57
- # If running in IPython, include the Audio component
58
- interface = gr.Interface(
59
- fn=summarize_pdf_abstract,
60
- inputs=[gr.File(label="Upload PDF", type="binary")],
61
- outputs=[gr.Text(label="One-sentence summary"), gr.Audio(label="Summary audio")],
62
- )
63
  else:
64
- # If not running in IPython, exclude the Audio component
65
- interface = gr.Interface(
66
- fn=summarize_pdf_abstract,
67
- inputs=[gr.File(label="Upload PDF", type="binary")],
68
- outputs=[gr.Text(label="One-sentence summary")],
69
- )
70
-
71
- # Launch the Gradio interface
72
- interface.launch()
 
1
  import gradio as gr
2
+ from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
  from io import BytesIO
6
 
7
+ # IPython check
8
+ try:
9
+ from IPython.display import Audio
10
+ ipython_available = True
11
+ except ImportError:
12
+ ipython_available = False
13
 
14
+ # Model
15
+ model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
16
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
17
  tokenizer = AutoTokenizer.from_pretrained(model_name)
18
 
def summarize_pdf_abstract(pdf_bytes):
    """Summarize the abstract (or introduction) page of a PDF.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed to the module-level ``tokenizer`` and ``model``
    and the decoded output is returned as the summary.

    Args:
        pdf_bytes: Raw bytes of the PDF. Anything that is not ``bytes`` or
            ``bytearray`` (e.g. a path or an open binary stream) is handed
            to ``PdfReader`` unchanged.

    Returns:
        A dict with two keys: ``"summary"`` (the decoded model output with
        special tokens stripped) and ``"audio"`` (MP3 bytes produced by
        gTTS, or ``None`` when ``ipython_available`` is false).

    Raises:
        ValueError: If no page mentions "Abstract" or "Introduction".
        Exception: Errors from PyPDF2, transformers or gTTS propagate
            unchanged, keeping their original type and traceback.
    """
    # PdfReader needs a seekable stream (or a path), not a bytes object.
    if isinstance(pdf_bytes, (bytes, bytearray)):
        pdf_source = BytesIO(pdf_bytes)
    else:
        pdf_source = pdf_bytes
    reader = PdfReader(pdf_source)

    abstract_text = ""
    for page in reader.pages:
        # Extract once per page; text extraction is the expensive step.
        page_text = page.extract_text() or ""
        if "Abstract" in page_text or "Introduction" in page_text:
            abstract_text = page_text
            break

    if not abstract_text:
        # Summarizing an empty string yields meaningless output and gTTS
        # refuses empty text, so fail early with an actionable message.
        raise ValueError(
            "No page containing 'Abstract' or 'Introduction' was found in the PDF."
        )

    inputs = tokenizer(abstract_text, return_tensors="pt")
    outputs = model.generate(**inputs)
    # Strip special tokens so they are neither displayed nor spoken aloud.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    speech_bytes = None
    # Audio stays gated on the same flag the interface uses to decide
    # whether an audio output component exists.
    if ipython_available:
        speech = gTTS(text=summary, lang="en")
        # gTTS has no get_wav_data(); it streams MP3 data to a file object.
        audio_buffer = BytesIO()
        speech.write_to_fp(audio_buffer)
        speech_bytes = audio_buffer.getvalue()

    return {"summary": summary, "audio": speech_bytes}
 
 
 
 
 
 
 
 
 
def _summarize_for_ui(pdf_bytes):
    """Unpack the dict from ``summarize_pdf_abstract`` into Gradio outputs.

    Returns the summary alone when only the text output exists, or a
    ``(summary, audio)`` pair when the audio output is also shown.
    """
    result = summarize_pdf_abstract(pdf_bytes)
    if ipython_available:
        return result["summary"], result["audio"]
    return result["summary"]


# The audio component is only offered when IPython is importable, matching
# the flag that decides whether audio is generated at all.
# NOTE(review): passing raw bytes to gr.Audio depends on the installed
# Gradio version - confirm against the deployed version.
output_components = [gr.Text(label="One-sentence summary")]
if ipython_available:
    output_components.append(gr.Audio(label="Summary audio"))

# gr.Interface requires fn, inputs and outputs; a literal ``...`` placeholder
# is not a valid call.
interface = gr.Interface(
    fn=_summarize_for_ui,
    inputs=[gr.File(label="Upload PDF", type="binary")],
    outputs=output_components,
)

interface.launch()