Spaces:
Running
Running
File size: 4,257 Bytes
835bf99 f514700 835bf99 f514700 bc4cb86 f91380b 835bf99 f91380b e8a155e 835bf99 f91380b bc4cb86 f91380b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import os
import tempfile
import streamlit as st
from google import genai
from jinja2 import Template
# Page header
st.title("Audio Transcription with Speaker Identification")
st.write("Upload an audio file to generate a transcript with speakers identified.")

# API key: a value typed into the password field wins; otherwise fall back to
# the GEMINI_API_KEY environment variable (None when it is not set).
api_key_input = st.text_input(
    label="Gemini API Key",
    type="password",
    help="You can also set it via GEMINI_API_KEY environment variable.",
)
if api_key_input:
    api_key = api_key_input
else:
    api_key = os.environ.get("GEMINI_API_KEY")
# Speakers Input
speakers_input = st.text_input(
    "Known Speakers (comma-separated)",
    help="List known speaker names. Leave empty if unknown."
)
# Split on commas, trim surrounding whitespace, and drop blank entries so that
# stray or trailing commas ("Brady,,Tim,") do not produce empty speaker names
# in the prompt. An empty (or missing) input yields an empty list.
speakers = [
    name
    for name in (part.strip() for part in (speakers_input or "").split(","))
    if name
]
# File Upload
# File extensions the uploader widget is restricted to.
accepted_audio_types = ["mp3", "wav", "m4a", "ogg", "mp4"]
audio_file = st.file_uploader(label="Upload Audio File", type=accepted_audio_types)
# Gemini model used for both the token count and the transcription request.
_GEMINI_MODEL = "gemini-2.0-flash"

# Jinja2 prompt template. The "Speakers are:" section is rendered only when at
# least one speaker name was supplied, so an empty list does not leave a
# dangling heading in the prompt. With a non-empty list the rendered text is
# the same as before.
_PROMPT_TEMPLATE = Template("""Generate a transcript of the episode. Include timestamps and identify speakers.
{% if speakers %}Speakers are:
{% for speaker in speakers %}- {{ speaker }}{% if not loop.last %}\n{% endif %}{% endfor %}
{% endif %}eg:
[00:00] Brady: Hello there.
[00:02] Tim: Hi Brady.
It is important to include the correct speaker names. Use the names you identified earlier. If you really don't know the speaker's name, identify them with a letter of the alphabet, eg there may be an unknown speaker 'A' and another unknown speaker 'B'.
If there is music or a short jingle playing, signify like so:
[01:02] [MUSIC] or [01:02] [JINGLE]
If you can identify the name of the music or jingle playing then use that instead, eg:
[01:02] [Firework by Katy Perry] or [01:02] [The Sofa Shop jingle]
If there is some other sound playing try to identify the sound, eg:
[01:02] [Bell ringing]
Each individual caption should be quite short, a few short sentences at most.
Signify the end of the episode with [END].
Don't use any markdown formatting, like bolding or italics.
Only use characters from the English alphabet, unless you genuinely believe foreign characters are correct.
It is important that you use the correct words and spell everything correctly. Use the context of the podcast to help.
If the hosts discuss something like a movie, book or celebrity, make sure the movie, book, or celebrity name is spelled correctly.""")

# Button handler: validate the inputs, stage the upload in a temporary file,
# send it to Gemini together with the rendered prompt, and show the transcript.
if st.button("Generate Transcript"):
    if not api_key:
        st.error("Please provide a Gemini API key.")
    elif not audio_file:
        st.error("Please upload an audio file.")
    else:
        # Both stay None until the corresponding resource exists, so the
        # cleanup in `finally` only touches what was actually created.
        tmp_file_path = None
        uploaded_file = None
        try:
            # Keep the original extension on the temp file (presumably so the
            # upload can infer the media type from the file name).
            original_extension = os.path.splitext(audio_file.name)[1]
            with tempfile.NamedTemporaryFile(
                delete=False,
                suffix=original_extension,
            ) as tmp_file:
                tmp_file_path = tmp_file.name
                # getvalue() returns the whole buffer regardless of the current
                # read position, unlike read().
                tmp_file.write(audio_file.getvalue())
            # The temp file is closed here, before it is handed to the SDK.

            # Initialize GenAI client
            client = genai.Client(api_key=api_key)
            uploaded_file = client.files.upload(file=tmp_file_path)

            # Token counting is informational only; SDK versions that lack
            # the method produce a warning instead of aborting the run.
            try:
                token_info = client.models.count_tokens(
                    model=_GEMINI_MODEL,
                    contents=[uploaded_file],
                )
                st.info(f"File contains approximately {token_info.total_tokens} tokens")
            except AttributeError:
                st.warning("Token counting not available in current API version")

            # Render the prompt and request the transcript
            prompt = _PROMPT_TEMPLATE.render(speakers=speakers)
            response = client.models.generate_content(
                model=_GEMINI_MODEL,
                contents=[prompt, uploaded_file],
            )

            # Display results
            st.subheader("Transcript")
            transcript = response.text
            if transcript:
                st.code(transcript, language="text")
            else:
                # An empty/None text would otherwise be shown as the transcript.
                st.warning("The model returned no transcript text.")
        except Exception as e:
            # UI boundary: surface any failure to the user instead of crashing.
            st.error(f"An error occurred: {str(e)}")
        finally:
            if tmp_file_path is not None:
                os.remove(tmp_file_path)
            if uploaded_file is not None:
                # Best-effort removal of the remote copy so the audio does not
                # linger in the Gemini Files API after the run.
                try:
                    client.files.delete(name=uploaded_file.name)
                except Exception as cleanup_error:
                    st.warning(
                        f"Could not delete the uploaded file from Gemini: {cleanup_error}"
                    )
# Credits section in sidebar
# NOTE(review): the "works without key" wording presumably relies on
# GEMINI_API_KEY being set in the hosting environment — confirm.
st.sidebar.markdown("""
**Info**
- It works without key, but has limits. Use your own to upload bigger files.
**Credits**
- Transcription powered by [Gemini API](https://ai.google.dev/)
- Heavy inspired by https://github.com/philschmid/gemini-samples/blob/main/examples/gemini-transcribe-with-timestamps.ipynb
""")