eHemink committed on
Commit b8b1d65 · 1 Parent(s): 3d0b487

Create app.py

Files changed (1)
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
+ # Imports
+ # Note: the original !pip install lines are not valid inside a script;
+ # the packages below are assumed to be installed separately (see the note
+ # after the diff).
+ import re
+
+ import gradio as gr
+ import PyPDF2
+ from transformers import pipeline
+ from bark import SAMPLE_RATE, generate_audio, preload_models
+
+
+ def abstract_to_audio(insert_pdf):
+     # Extracting the abstract text from the article pdf
+     def extract_abstract(pdf_file):
+         # Open the PDF file in read-binary mode
+         with open(pdf_file, 'rb') as file:
+             # Create a PDF reader object
+             pdf_reader = PyPDF2.PdfReader(file)
+
+             # Initialize an empty string to store the abstract content
+             abstract_text = ''
+
+             # Loop through each page in the PDF
+             for page_num in range(len(pdf_reader.pages)):
+                 # Get the text from the current page
+                 page = pdf_reader.pages[page_num]
+                 text = page.extract_text()
+
+                 # Use a regular expression to find the "Abstract" section
+                 abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
+                 if abstract_match:
+                     # Take the text after the "Abstract" heading until the next
+                     # section, indicated by an "Introduction" heading
+                     start_index = abstract_match.end()
+                     next_section_match = re.search(r'\bIntroduction\b', text[start_index:])
+                     if next_section_match:
+                         end_index = start_index + next_section_match.start()
+                         abstract_text = text[start_index:end_index]
+                     else:
+                         # If no next section is found, take the text to the end of the page
+                         abstract_text = text[start_index:]
+                     break  # Exit the loop once the abstract is found
+
+         return abstract_text.strip()
+
+     # Depending on the Gradio version, a 'file' input passes a filepath string
+     # or a file object with a .name attribute
+     pdf_path = insert_pdf if isinstance(insert_pdf, str) else insert_pdf.name
+     abstract = extract_abstract(pdf_path)
+
+     # Creating a summarization pipeline
+     model = "lidiya/bart-large-xsum-samsum"
+     pipeline1 = pipeline(task="summarization", model=model)
+
+     # Summarizing the extracted abstract
+     summarized = pipeline1(abstract)
+     print(summarized[0]['summary_text'])
+     tts_prompt = summarized[0]['summary_text']
+
+     # Generate audio that speaks the summary using Bark:
+     # download and load all Bark models, then synthesize the text
+     preload_models()
+     audio_array = generate_audio(tts_prompt)
+
+     # Return the waveform as (sample_rate, data), the form Gradio's audio output expects
+     return (SAMPLE_RATE, audio_array)
+
+
+ my_app = gr.Interface(fn=abstract_to_audio, inputs='file', outputs='audio')
+ my_app.launch()
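
Because the inline !pip install commands were dropped from app.py, the packages it imports have to be provided some other way; on a Hugging Face Space that is usually a requirements.txt next to app.py. A minimal sketch, assuming only the packages the script imports and no version pins:

    PyPDF2
    transformers
    gradio
    git+https://github.com/suno-ai/bark.git

The summarization pipeline also needs torch at runtime; if it is not already pulled in by the packages above, it would need to be listed as well.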