Spaces:
Build error
Build error
| import streamlit as st | |
| from models import BagOfModels, SoundToText, TextToSummary | |
| from settings import MODEL_PARSER | |
# Options object exported by the settings module; it is expanded with
# vars(args) into BagOfModels.load_model when a transcription is requested.
args = MODEL_PARSER

# Page-level Streamlit configuration: browser tab title, wide layout, and the
# text shown for the "About" entry of the app menu.
about_menu = {"About": """This is a simple GUI for OpenAI's Whisper."""}
st.set_page_config(
    page_title="TTS Applications | Incore Solutions",
    layout="wide",
    menu_items=about_menu,
)
def open_instructions() -> None:
    """Render the contents of ``instructions.md`` on the page.

    The file is opened by its relative name, so it is resolved against the
    current working directory, and its text is handed to ``st.write``.

    Raises:
        FileNotFoundError: If ``instructions.md`` cannot be found.
    """
    # Decode explicitly as UTF-8 so the rendered text does not depend on the
    # platform's default locale encoding.
    with open("instructions.md", "r", encoding="utf-8") as f:
        st.write(f.read())
# Render input type selection on the sidebar & the form
input_type = st.sidebar.selectbox("Input Type", ["YouTube", "File"])

# Only one of these is assigned inside the form, depending on input_type.
# Bind both up front so later code can test either name safely.
youtube_url = None
input_file = None

with st.sidebar.form("input_form"):
    if input_type == "YouTube":
        youtube_url = st.text_input("Youtube URL")
    elif input_type == "File":
        input_file = st.file_uploader("File", type=["mp3", "wav"])

    # Offer only the registered models whose name contains "whisper".
    whisper_models = [
        name for name in BagOfModels.get_model_names() if "whisper" in name
    ]
    # Preselect the second entry as before, but fall back to the first one
    # when fewer than two models are available (index=1 would be out of range).
    default_index = min(1, max(len(whisper_models) - 1, 0))
    whisper_model = st.selectbox(
        "Whisper model", options=whisper_models, index=default_index
    )

    summary = st.checkbox("summarize")
    if summary:
        min_sum = st.number_input("Minimum words in the summary", min_value=1, step=1)
        max_input = st.number_input("Maximum words in the summary", min_value=2, step=1)
        # The upper bound must never drop below the lower bound. The previous
        # min(...) did the opposite and capped the maximum at the minimum.
        max_sum = max(min_sum, max_input)

    st.form_submit_button(label="Save settings")

# The "Transcribe!" button sits in its own form, separate from the settings
# form above.
with st.sidebar.form("save settings"):
    transcribe = st.form_submit_button(label="Transcribe!")
# Run a transcription when the "Transcribe!" button was pressed. The input is
# validated first; `source` stays None when validation fails so that the model
# is only loaded for usable input.
if transcribe:
    source = None

    if input_type == "YouTube":
        if youtube_url and youtube_url.startswith("http"):
            source = youtube_url
        else:
            st.error("Please enter a valid YouTube URL")
            open_instructions()
    elif input_type == "File":
        if input_file:
            source = input_file
        else:
            st.error("Please upload a file")

    if source is not None:
        # Load the selected whisper model with the options from `args` and
        # keep the result in session state so it survives Streamlit reruns.
        model = BagOfModels.load_model(whisper_model, **vars(args))
        st.session_state.transcription = model.predict_stt(
            source=source, source_type=input_type, model_task="stt"
        )
# Render the most recent transcription stored in session state, if any.
if "transcription" in st.session_state:
    result = st.session_state.transcription

    # Two side-by-side columns: transcription details and the source video.
    transcription_col, media_col = st.columns(2)

    with transcription_col:
        st.markdown("#### Audio")
        with open(result.audio_path, "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes)

        st.markdown("---")
        # NOTE(review): this shows the model currently selected in the sidebar,
        # which may differ from the one that produced the stored result.
        st.markdown(f"#### Transcription (whisper model - `{whisper_model}`)")
        st.markdown(f"##### Language: `{result.language}`")

        # Plain text taken from the transcription's raw output.
        raw_output = st.expander("Raw output")
        raw_output.markdown(result.raw_output["text"])

        if summary:
            summarized_output = st.expander("summarized output")
            # CURRENTLY ONLY SUPPORTS 1024 WORD TOKENS -> TODO: FIND METHOD TO
            # INCREASE SUMMARY FOR LONGER VIDS -> 1024 * 4 = approx. 800 words
            # within the 1024 range, so only the first 4096 characters are used.
            summary_input = str(result.text[: 1024 * 4])
            text_summary = TextToSummary(summary_input, min_sum, max_sum).get_summary()
            summarized_output.markdown(text_summary[0]["summary_text"])

        # One line per segment, prefixed with its start/end time rounded to
        # one decimal.
        time_annotated_output = st.expander("time_annotated_output")
        for segment in result.segments:
            start = round(segment["start"], 1)
            end = round(segment["end"], 1)
            time_annotated_output.markdown(f"[{start} - {end}] - {segment['text']}")

    # Show input youtube video
    with media_col:
        # NOTE(review): keyed on the sidebar's current input type, not on the
        # type the stored transcription was created from - confirm intended.
        if input_type == "YouTube":
            st.markdown("---")
            st.markdown("#### Original YouTube Video")
            st.video(result.source)