"""Streamlit app that transcribes an uploaded audio file with Gemini.

The user provides a Gemini API key (falling back to the GEMINI_API_KEY
environment variable), an optional comma-separated list of known speaker
names, and an audio file. The audio is uploaded with ``client.files.upload``
and the model is asked for a timestamped, speaker-labelled transcript.
"""

import os
import tempfile

import streamlit as st
from google import genai
from jinja2 import Template

# Single source of truth for the model used for token counting and generation.
MODEL_NAME = "gemini-2.0-flash"

# The "Speakers are:" section is rendered only when at least one name is
# known; otherwise the prompt relies on the "letter of the alphabet" rule.
PROMPT_TEMPLATE = Template("""Generate a transcript of the episode. Include timestamps and identify speakers.
{% if speakers %}
Speakers are:
{% for speaker in speakers %}- {{ speaker }}{% if not loop.last %}\n{% endif %}{% endfor %}
{% endif %}
eg:
[00:00] Brady: Hello there.
[00:02] Tim: Hi Brady.

It is important to include the correct speaker names. Use the names you identified earlier.
If you really don't know the speaker's name, identify them with a letter of the alphabet, eg there may be an unknown speaker 'A' and another unknown speaker 'B'.

If there is music or a short jingle playing, signify like so:
[01:02] [MUSIC] or [01:02] [JINGLE]

If you can identify the name of the music or jingle playing then use that instead, eg:
[01:02] [Firework by Katy Perry] or [01:02] [The Sofa Shop jingle]

If there is some other sound playing try to identify the sound, eg:
[01:02] [Bell ringing]

Each individual caption should be quite short, a few short sentences at most.

Signify the end of the episode with [END].

Don't use any markdown formatting, like bolding or italics.

Only use characters from the English alphabet, unless you genuinely believe foreign characters are correct.

It is important that you use the correct words and spell everything correctly. Use the context of the podcast to help.
If the hosts discuss something like a movie, book or celebrity, make sure the movie, book, or celebrity name is spelled correctly.""")


def _parse_speakers(raw: str) -> list[str]:
    """Return the non-empty, whitespace-stripped names from a comma-separated string."""
    # Dropping blanks keeps inputs such as "Alice,, Bob," from producing
    # empty "- " bullets in the prompt.
    return [name.strip() for name in raw.split(",") if name.strip()]


st.title("Audio Transcription with Speaker Identification")
st.write("Upload an audio file to generate a transcript with speakers identified.")

# API key: the value typed in the UI wins over the environment variable.
api_key_input = st.text_input(
    "Gemini API Key",
    type="password",
    help="You can also set it via GEMINI_API_KEY environment variable.",
)
api_key = api_key_input or os.getenv("GEMINI_API_KEY")

# Known speakers (optional).
speakers_input = st.text_input(
    "Known Speakers (comma-separated)",
    help="List known speaker names. Leave empty if unknown.",
)
speakers = _parse_speakers(speakers_input or "")

# Audio file to transcribe.
audio_file = st.file_uploader(
    "Upload Audio File",
    type=["mp3", "wav", "m4a", "ogg", "mp4"],
)

if st.button("Generate Transcript"):
    if not api_key:
        st.error("Please provide a Gemini API key.")
    elif not audio_file:
        st.error("Please upload an audio file.")
    else:
        # Keep the original extension on the temp file's name.
        original_extension = os.path.splitext(audio_file.name)[1]
        tmp_file = tempfile.NamedTemporaryFile(
            delete=False, suffix=original_extension
        )
        tmp_file_path = tmp_file.name
        client = None
        uploaded_file = None
        try:
            # Writing inside the try guarantees the temp file is removed in
            # the finally clause even if the write itself fails.
            with tmp_file:
                # getvalue() returns the full contents regardless of the
                # current read position of the uploaded file object.
                tmp_file.write(audio_file.getvalue())

            client = genai.Client(api_key=api_key)
            uploaded_file = client.files.upload(file=tmp_file_path)

            # Token counting is informational only.
            try:
                token_info = client.models.count_tokens(
                    model=MODEL_NAME,
                    contents=[uploaded_file],
                )
                st.info(
                    f"File contains approximately {token_info.total_tokens} tokens"
                )
            except AttributeError:
                st.warning("Token counting not available in current API version")

            prompt = PROMPT_TEMPLATE.render(speakers=speakers)

            response = client.models.generate_content(
                model=MODEL_NAME,
                contents=[prompt, uploaded_file],
            )

            st.subheader("Transcript")
            transcript = response.text
            if transcript:
                st.code(transcript, language="text")
            else:
                # Avoid passing an empty/None body to st.code when the
                # response carries no text.
                st.warning("The model returned no transcript text.")
        except Exception as e:
            # Top-level boundary of the UI action: report instead of crashing.
            st.error(f"An error occurred: {str(e)}")
        finally:
            # Best-effort removal of the remote copy so the user's audio is
            # not left behind in the API's file storage.
            if uploaded_file is not None:
                try:
                    client.files.delete(name=uploaded_file.name)
                except Exception as cleanup_error:
                    st.warning(
                        f"Could not delete the uploaded file from Gemini: {cleanup_error}"
                    )
            os.remove(tmp_file_path)

# Credits section in sidebar
st.sidebar.markdown("""
**Info**
- It works without key, but has limits. Use your own to upload bigger files.

**Credits**
- Transcription powered by [Gemini API](https://ai.google.dev/)
- Heavy inspired by https://github.com/philschmid/gemini-samples/blob/main/examples/gemini-transcribe-with-timestamps.ipynb
""")