# AudioTranscribe / app.py
import gradio as gr
import torch
from transformers import pipeline
import librosa
import soundfile as sf
import spaces
import os


def split_audio(audio_data, sr, chunk_duration=30):
    """Split audio into chunks of chunk_duration seconds."""
    chunk_size = int(chunk_duration * sr)  # samples per chunk
    chunks = []
    for start in range(0, len(audio_data), chunk_size):
        chunks.append(audio_data[start:start + chunk_size])
    return chunks
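
# Example (illustration only, not executed by the app): how split_audio divides a clip.
# A 75-second mono recording at 16 kHz produces three chunks of 30 s, 30 s, and 15 s;
# the final chunk is simply whatever remains.
#
#     import numpy as np
#     sr = 16_000
#     audio = np.zeros(75 * sr, dtype=np.float32)    # 75 s of silence (synthetic)
#     chunks = split_audio(audio, sr, chunk_duration=30)
#     print([len(c) / sr for c in chunks])           # [30.0, 30.0, 15.0]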


def transcribe_long_audio(audio_input, transcriber, chunk_duration=30):
    """Transcribe long audio by splitting it into smaller chunks."""
    try:
        if isinstance(audio_input, tuple):  # Recorded audio: Gradio passes (sample_rate, data)
            sr, audio_data = audio_input
        elif isinstance(audio_input, str):  # Uploaded file path
            audio_data, sr = librosa.load(audio_input, sr=None)
        else:
            raise ValueError("Unsupported audio input format.")

        chunks = split_audio(audio_data, sr, chunk_duration)
        transcriptions = []
        for i, chunk in enumerate(chunks):
            temp_path = f"temp_chunk_{i}.wav"
            sf.write(temp_path, chunk, sr)  # Save chunk as WAV
            try:
                transcriptions.append(transcriber(temp_path)["text"])
            finally:
                os.remove(temp_path)  # Clean up the temp file even if transcription fails
        return " ".join(transcriptions)
    except Exception as e:
        return f"Error processing audio: {e}"


@spaces.GPU(duration=3)
def main():
    device = 0 if torch.cuda.is_available() else -1

    try:
        transcriber = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base", device=device
        )
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    except Exception as e:
        print(f"Error loading models: {e}")
        raise

    def process_audio(audio_input):
        try:
            transcription = transcribe_long_audio(audio_input, transcriber, chunk_duration=30)
            summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
            return transcription, summary
        except Exception as e:
            return f"Error processing audio: {e}", ""

    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column():
                # Enable recording or file upload
                audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")
                process_button = gr.Button("Process Audio")
                stop_button = gr.Button("Stop Recording")  # Stop button
            with gr.Column():
                transcription_output = gr.Textbox(label="Full Transcription", lines=10)
                summary_output = gr.Textbox(label="Summary", lines=5)

        def stop_microphone():
            """Dummy handler to simulate stopping the microphone; no outputs are wired up, so its return value is discarded."""
            return "Recording stopped."

        process_button.click(
            process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, summary_output],
        )
        stop_button.click(
            stop_microphone,
            inputs=[],
            outputs=[],
        )

    interface.launch(share=True)


if __name__ == "__main__":
    main()