import streamlit as st
import whisper
import os
import torch
import nltk
from transformers import pipeline, AutoTokenizer
from pydub import AudioSegment
from nltk import sent_tokenize
nltk.download('punkt')
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
def transcribe_audio(audiofile):
    st.session_state['audio'] = audiofile
    print(f"audio_file_session_state: {st.session_state['audio']}")

    st.info("Getting size of file")
    # Size of the audio file in MB
    audio_size = round(os.path.getsize(st.session_state['audio']) / (1024 * 1024), 1)
    print(f"audio file size: {audio_size} MB")

    # Determine audio duration
    podcast = AudioSegment.from_mp3(st.session_state['audio'])
    st.session_state['audio_segment'] = podcast
    podcast_duration = podcast.duration_seconds
    print(f"Audio duration: {podcast_duration} seconds")

    st.info("Transcribing")
    whisper_model = whisper.load_model("small.en")
    transcription = whisper_model.transcribe(audiofile)
    st.session_state['transcription'] = transcription
    print(f"transcription: {transcription['text']}")
    st.info("Done transcribing")
    return transcription
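
# Loading the Whisper checkpoint on every button press is slow. On recent
# Streamlit versions the model could be cached once per process with
# st.cache_resource. This helper is only a sketch: it is not part of the
# original code and nothing below calls it.
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("small.en")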
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    chunks = []
    chunk = ""
    length = 0

    for sentence in sentences:
        # Token count if this sentence were added to the current chunk
        combined_length = len(tokenizer.tokenize(sentence)) + length
        if combined_length <= tokenizer.max_len_single_sentence:
            # The sentence fits: add it to the current chunk
            chunk += sentence + " "
            length = combined_length
        else:
            # The sentence would overflow: save the current chunk and start
            # a new one with the overflowing sentence
            chunks.append(chunk)
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

    # Save whatever remains in the final chunk
    if chunk:
        chunks.append(chunk)

    return chunks
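
# Quick illustration of the chunker (hypothetical input, not from the app):
# each returned chunk stays within tokenizer.max_len_single_sentence tokens,
# so the summarizer never receives an input it has to truncate mid-sentence.
#
# chunks = chunk_and_preprocess_text("Markets rose today. The Fed held rates steady.")
# -> ["Markets rose today. The Fed held rates steady. "]  (both sentences fit in one chunk)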
def summarize_podcast(audiotranscription):
    st.info("Summarizing...")
    # Use the GPU if one is available, otherwise fall back to CPU
    summarizer = pipeline(
        "summarization",
        model="philschmid/flan-t5-base-samsum",
        device=0 if torch.cuda.is_available() else -1,
    )

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    summarized_text = summarizer(text_chunks)
    st.session_state['summary'] = summarized_text
    print(f"Summary: {summarized_text}")

    # summarized_text is a list of dicts, each with a 'summary_text' key
    full_summary = ' '.join(item['summary_text'] for item in summarized_text)
    return full_summary
def prepare_text_for_qa(audiotranscription):
    # Split the raw transcript string into overlapping chunks and wrap each
    # one as a LangChain Document for downstream retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    documents = text_splitter.create_documents([audiotranscription])
    return documents
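
# The LangChain imports above (OpenAIEmbeddings, Chroma, ChatOpenAI,
# RetrievalQA) are never used, which suggests a retrieval Q&A step that has
# not been wired up yet. The sketch below is one plausible way to connect
# them; it is an assumption, not part of the original application, and it
# requires an OPENAI_API_KEY in the environment.
def build_qa_chain(documents):
    # Embed the transcript chunks and index them in an in-memory Chroma store
    vectorstore = Chroma.from_documents(documents, OpenAIEmbeddings())
    # Answer questions by retrieving relevant chunks and stuffing them into
    # the prompt of a chat model
    return RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=0),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
    )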
st.markdown("# Podcast Q&A")
st.markdown(
    """
    This app helps you digest information-dense podcast episodes by doing the following:

    - Speech-to-text transcription, using the open-source Whisper model (small.en)
    - Summarizing the episode, using philschmid/flan-t5-base-samsum, a model based on Google's Flan-T5
    - As a proof of concept, this codebase uses the June 14 episode of the Marketplace business-news podcast from NPR.
    - That file is the ONLY hardcoded piece of information used in this application.
    """
)
st.text("Marketplace Episode June 14 2023")
st.audio("marketplace-2023-06-14.mp3")
if st.button("Process Audio File"):
podcast_text = transcribe_audio("marketplace-2023-06-14.mp3")
#write text out
with st.expander("See Transcription"):
st.caption(podcast_text['text'])
#Summarize Text
podcast_summary = summarize_podcast(podcast_text['text'])
st.markdown(
"""
##Summary of Text
"""
)
st.text(podcast_summary)
if st.button("Summarize Podcast"):
with open('transcription.txt', 'r') as file:
podcast_text = file.read().rstrip()
podcast_summary = summarize_podcast(podcast_text)
st.markdown(
"""
##Summary of Text
"""
)
st.text(podcast_summary)
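
# Hypothetical wiring for the Q&A pieces sketched above (not in the original
# app): index the saved transcript once, then answer free-form questions.
#
# if st.button("Prepare Q&A"):
#     with open('transcription.txt', 'r') as file:
#         docs = prepare_text_for_qa(file.read().rstrip())
#     st.session_state['qa_chain'] = build_qa_chain(docs)
# question = st.text_input("Ask a question about the episode")
# if question and 'qa_chain' in st.session_state:
#     st.write(st.session_state['qa_chain'].run(question))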