Spaces:

Sebbe33
/

Transcription_gemini

Running

App Files Files Community

Transcription_gemini / app.py

Sebbe33

Update app.py

e8a155e verified 6 months ago

raw

history blame contribute delete

4.26 kB

	import os
	import tempfile
	import streamlit as st
	from google import genai
	from jinja2 import Template

	# Page header.
	st.title("Audio Transcription with Speaker Identification")
	st.write("Upload an audio file to generate a transcript with speakers identified.")

	# API key: the text field takes precedence, the environment variable is the
	# fallback. The field is stripped so that a stray space pasted into it does
	# not count as a key and shadow the environment fallback.
	api_key_input = st.text_input(
	    "Gemini API Key",
	    type="password",
	    help="You can also set it via GEMINI_API_KEY environment variable.",
	)
	api_key = (api_key_input or "").strip() or os.getenv("GEMINI_API_KEY")

	# Known speakers: comma-separated names, trimmed. Blank entries (from
	# "Alice, , Bob" or a trailing comma) are dropped so they never reach the
	# prompt as empty speaker bullets. An empty field yields an empty list.
	speakers_input = st.text_input(
	    "Known Speakers (comma-separated)",
	    help="List known speaker names. Leave empty if unknown.",
	)
	speakers = [
	    name.strip()
	    for name in (speakers_input or "").split(",")
	    if name.strip()
	]

	# Audio upload, restricted to the listed file extensions.
	audio_file = st.file_uploader(
	    "Upload Audio File",
	    type=["mp3", "wav", "m4a", "ogg", "mp4"],
	)

	# Button handler: validate the inputs, stage the upload in a temporary file,
	# send it to Gemini together with the transcription prompt, and render the
	# returned transcript. Reads `api_key`, `audio_file` and `speakers`, which
	# are defined earlier in the script.
	if st.button("Generate Transcript"):
	    if not api_key:
	        st.error("Please provide a Gemini API key.")
	    elif not audio_file:
	        st.error("Please upload an audio file.")
	    else:
	        # Single source for the model name so token counting and generation
	        # always target the same model.
	        model_name = "gemini-2.0-flash"
	        # The temp file keeps the original extension (passed as `suffix`).
	        original_extension = os.path.splitext(audio_file.name)[1]
	        tmp_file_path = None
	        uploaded_file = None

	        try:
	            # Created inside the try so the file is removed even when the
	            # write itself fails (delete=False leaves cleanup to us).
	            with tempfile.NamedTemporaryFile(
	                delete=False,
	                suffix=original_extension,
	            ) as tmp_file:
	                tmp_file_path = tmp_file.name
	                # getvalue() returns the whole buffer regardless of the
	                # current stream position, unlike read(), which returns
	                # nothing once the stream has already been consumed.
	                tmp_file.write(audio_file.getvalue())

	            # Initialize GenAI client
	            client = genai.Client(api_key=api_key)

	            uploaded_file = client.files.upload(file=tmp_file_path)

	            # Token count is informational only.
	            try:
	                token_info = client.models.count_tokens(
	                    model=model_name,
	                    contents=[uploaded_file],
	                )
	                st.info(f"File contains approximately {token_info.total_tokens} tokens")
	            except AttributeError:
	                st.warning("Token counting not available in current API version")

	            # Create prompt template.
	            # NOTE(review): with an empty `speakers` list the "Speakers are:"
	            # header is rendered with no entries below it.
	            prompt_template = Template("""Generate a transcript of the episode. Include timestamps and identify speakers.
	Speakers are:
	{% for speaker in speakers %}- {{ speaker }}{% if not loop.last %}\n{% endif %}{% endfor %}

	eg:
	[00:00] Brady: Hello there.
	[00:02] Tim: Hi Brady.

	It is important to include the correct speaker names. Use the names you identified earlier. If you really don't know the speaker's name, identify them with a letter of the alphabet, eg there may be an unknown speaker 'A' and another unknown speaker 'B'.

	If there is music or a short jingle playing, signify like so:
	[01:02] [MUSIC] or [01:02] [JINGLE]

	If you can identify the name of the music or jingle playing then use that instead, eg:
	[01:02] [Firework by Katy Perry] or [01:02] [The Sofa Shop jingle]

	If there is some other sound playing try to identify the sound, eg:
	[01:02] [Bell ringing]

	Each individual caption should be quite short, a few short sentences at most.

	Signify the end of the episode with [END].

	Don't use any markdown formatting, like bolding or italics.

	Only use characters from the English alphabet, unless you genuinely believe foreign characters are correct.

	It is important that you use the correct words and spell everything correctly. Use the context of the podcast to help.
	If the hosts discuss something like a movie, book or celebrity, make sure the movie, book, or celebrity name is spelled correctly.""")

	            prompt = prompt_template.render(speakers=speakers)

	            # Generate content
	            response = client.models.generate_content(
	                model=model_name,
	                contents=[prompt, uploaded_file],
	            )

	            # Display results. The response may carry no text, so guard
	            # before rendering instead of passing an empty value to st.code.
	            transcript = response.text
	            if transcript:
	                st.subheader("Transcript")
	                st.code(transcript, language="text")
	            else:
	                st.warning("The model returned no transcript text.")

	        except Exception as e:
	            # Top-level boundary of the handler: report the failure to the
	            # user rather than letting the script run crash.
	            st.error(f"An error occurred: {e}")
	        finally:
	            # Local cleanup: only if the temp file was actually created.
	            if tmp_file_path is not None and os.path.exists(tmp_file_path):
	                os.remove(tmp_file_path)
	            # Remote cleanup: best effort, so the audio does not linger on
	            # the service. A failure here must not hide the transcript, so
	            # it is surfaced as a warning only.
	            if uploaded_file is not None:
	                try:
	                    client.files.delete(name=uploaded_file.name)
	                except Exception as cleanup_error:
	                    st.warning(
	                        f"Could not delete the uploaded file from Gemini: {cleanup_error}"
	                    )

	# Sidebar: usage note and credits, rendered as a single markdown block.
	sidebar_notes = """
	Info
	- It works without key, but has limits. Use your own to upload bigger files.
	Credits
	- Transcription powered by [Gemini API](https://ai.google.dev/)
	- Heavy inspired by https://github.com/philschmid/gemini-samples/blob/main/examples/gemini-transcribe-with-timestamps.ipynb
	"""
	st.sidebar.markdown(sidebar_notes)