Daniel Tse
committed on
Commit
·
9bb604c
1
Parent(s):
01bea1f
Add sentence chunking
Browse files
app.py
CHANGED
@@ -29,12 +29,47 @@ def transcribe_audio(audiofile):
|
|
29 |
st.info('Done Transcription')
|
30 |
|
31 |
return transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
def summarize_podcast(audiotranscription):
|
34 |
st.info("Summarizing...")
|
35 |
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)
|
36 |
|
37 |
-
|
|
|
|
|
|
|
38 |
st.session_state['summary'] = summarized_text
|
39 |
return summarized_text
|
40 |
|
|
|
29 |
st.info('Done Transcription')
|
30 |
|
31 |
return transcription
|
32 |
+
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum', *,
                              tokenizer=None, sentence_splitter=None):
    """Split *text* into sentence-aligned chunks that fit the model's token budget.

    Sentences are packed greedily, in order, into chunks whose total token
    count does not exceed ``tokenizer.max_len_single_sentence``. A sentence is
    never split: one that is longer than the budget on its own becomes a
    chunk by itself.

    Args:
        text: The text to split.
        model_name: Checkpoint whose tokenizer is loaded when *tokenizer* is
            not supplied.
        tokenizer: Optional preloaded tokenizer exposing ``tokenize(str)`` and
            ``max_len_single_sentence``. Passing one avoids reloading the
            tokenizer on every call.
        sentence_splitter: Optional callable mapping a string to a list of
            sentences. Defaults to ``sent_tokenize``.

    Returns:
        A list of chunk strings. Each sentence in a chunk is followed by a
        single space (so every chunk ends with a trailing space). The list is
        empty when the splitter yields no sentences.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    max_tokens = tokenizer.max_len_single_sentence
    chunks = []
    current_sentences = []
    current_tokens = 0

    for sentence in sentence_splitter(text):
        sentence_tokens = len(tokenizer.tokenize(sentence))

        # Close the running chunk before it would overflow. The emptiness
        # check keeps an oversized first sentence from emitting an empty chunk.
        if current_sentences and current_tokens + sentence_tokens > max_tokens:
            chunks.append(" ".join(current_sentences) + " ")
            current_sentences = []
            current_tokens = 0

        current_sentences.append(sentence)
        current_tokens += sentence_tokens

    # Always flush the remainder; otherwise a final sentence that started a
    # new chunk would be lost.
    if current_sentences:
        chunks.append(" ".join(current_sentences) + " ")

    return chunks
|
64 |
|
65 |
def summarize_podcast(audiotranscription):
    """Summarize a podcast transcription chunk by chunk.

    The transcription is split with ``chunk_and_preprocess_text`` and the
    chunks are passed to a ``philschmid/flan-t5-base-samsum`` summarization
    pipeline. The result is stored in ``st.session_state['summary']`` and
    returned.

    Args:
        audiotranscription: Transcribed text to summarize.

    Returns:
        Whatever the summarization pipeline returns for the list of chunks
        (presumably one ``{'summary_text': ...}`` dict per chunk -- confirm
        against the transformers docs), or an empty list when there is
        nothing to summarize.
    """
    st.info("Summarizing...")
    # NOTE(review): device=0 pins the pipeline to the first CUDA device;
    # confirm the deployment always has a GPU.
    summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    if text_chunks:
        # The generation limits are spelled max_length/min_length; the
        # previous max_len/min_len are not recognised generation parameters.
        summarized_text = summarizer(text_chunks, max_length=200, min_length=50)
    else:
        # Nothing to summarize: skip the model call.
        summarized_text = []

    st.session_state['summary'] = summarized_text
    return summarized_text
|
75 |
|