import os
from math import floor
from typing import Optional
import spaces
import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
# config
model_name = "kotoba-tech/kotoba-whisper-v2.2"
example_file = "sample_diarization_japanese.mp3"
# Load the diarization + ASR pipeline: bf16 and SDPA attention on CUDA,
# otherwise fall back to the CPU defaults.
if torch.cuda.is_available():
    pipe = pipeline(
        model=model_name,
        chunk_length_s=15,
        batch_size=16,
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_kwargs={'attn_implementation': 'sdpa'},
        trust_remote_code=True
    )
else:
    pipe = pipeline(model=model_name, chunk_length_s=15, batch_size=16, trust_remote_code=True)
def format_time(start: Optional[float], end: Optional[float]):
    """Render a (start, end) timestamp pair as '[MM:SS.m -> MM:SS.m]:'."""

    def _format_time(seconds: Optional[float]):
        if seconds is None:
            return "[no timestamp available]"
        # Minutes may exceed 59 for long recordings; the label stays MM:SS.m.
        minutes = floor(seconds / 60)
        seconds = seconds - minutes * 60
        m_seconds = floor(round(seconds - floor(seconds), 1) * 10)
        seconds = floor(seconds)
        return f'{minutes:02}:{seconds:02}.{m_seconds:01}'

    return f"[{_format_time(start)} -> {_format_time(end)}]:"
# On Hugging Face Spaces, `@spaces.GPU` requests a ZeroGPU worker for the duration of the call.
@spaces.GPU
def get_prediction(inputs, **kwargs):
    return pipe(inputs, **kwargs)
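# The pipeline (loaded with trust_remote_code) returns a dict that the formatting code below
# consumes via "speaker_ids", "text/<speaker>" and "chunks/<speaker>", roughly of the shape
# (speaker id and values are illustrative):
#   {"speaker_ids": ["SPEAKER_00"],
#    "text/SPEAKER_00": "full transcript for that speaker",
#    "chunks/SPEAKER_00": [{"timestamp": (0.0, 5.0), "text": "..."}, ...]}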
def transcribe(inputs: str,
               add_punctuation: bool,
               num_speakers: float,
               min_speakers: float,
               max_speakers: float,
               chunk_length_s: float,
               add_silence_end: float,
               add_silence_start: float):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # Decode the uploaded/recorded file to a float array at the model's sampling rate.
    with open(inputs, "rb") as f:
        inputs = f.read()
    array = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    # Values left at their UI defaults (0, or 30 for chunk length) are passed as None so the
    # pipeline falls back to its own defaults / auto-detection.
    prediction = get_prediction(
        inputs={"array": array, "sampling_rate": pipe.feature_extractor.sampling_rate},
        add_punctuation=add_punctuation,
        num_speakers=int(num_speakers) if num_speakers != 0 else None,
        min_speakers=int(min_speakers) if min_speakers != 0 else None,
        max_speakers=int(max_speakers) if max_speakers != 0 else None,
        chunk_length_s=int(chunk_length_s) if chunk_length_s != 30 else None,
        add_silence_end=add_silence_end if add_silence_end != 0 else None,
        add_silence_start=add_silence_start if add_silence_start != 0 else None
    )
    # Build a Markdown report: one section per detected speaker, with the full transcript
    # followed by its timestamped chunks.
    output = ""
    for n, s in enumerate(prediction["speaker_ids"]):
        text_timestamped = "\n".join([f"- **{format_time(*c['timestamp'])}** {c['text']}" for c in prediction[f"chunks/{s}"]])
        output += f'### Speaker {n+1} \n{prediction[f"text/{s}"]}\n\n{text_timestamped}\n'
    return output
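# The returned Markdown renders roughly as (shape only; text and timestamps are illustrative):
#   ### Speaker 1
#   <full transcript for speaker 1>
#
#   - **[00:00.0 -> 00:05.0]:** <first chunk>
#   - **[00:05.0 -> 00:09.5]:** <second chunk>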
description = (f"Transcribe and diarize long-form microphone or audio inputs with the click of a button! Demo uses "
f"Kotoba-Whisper [{model_name}](https://huggingface.co/{model_name}).")
title = f"Audio Transcription and Diarization with {os.path.basename(model_name)}"
shared_config = {"fn": transcribe, "title": title, "description": description, "allow_flagging": "never", "examples": [
[example_file, True, 0, 0, 0, 30, 0.5, 0.5],
[example_file, True, 4, 0, 0, 30, 0.5, 0.5]
]}
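# Each example row matches the input widget order below:
# [audio file, add punctuation, num/min/max speakers, chunk length, silence end, silence start].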
# Markdown output components for the two interfaces below.
o_upload = gr.Markdown()
o_mic = gr.Markdown()
i_upload = gr.Interface(
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Checkbox(label="add punctuation", value=True),
        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(5, 30, label="chunk length for ASR", value=30, step=1),
        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
    ],
    outputs=o_upload,
    **shared_config
)
i_mic = gr.Interface(
    inputs=[
        gr.Audio(sources="microphone", type="filepath", label="Microphone input"),
        gr.Checkbox(label="add punctuation", value=True),
        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
        gr.Slider(5, 30, label="chunk length for ASR", value=30, step=1),
        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
    ],
    outputs=o_mic,
    **shared_config
)
with gr.Blocks() as demo:
    gr.TabbedInterface([i_upload, i_mic], ["Audio file", "Microphone"])

demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False, show_error=True)
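# For a quick check without the web UI, transcribe() could be called directly, e.g.
# (hypothetical; assumes the example MP3 sits next to this script):
#   print(transcribe(example_file, True, 0, 0, 0, 30, 0.5, 0.5))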