litagin's picture
Use token
07c2d2e
raw
history blame
5.88 kB
import os
import time
import warnings
from pathlib import Path
import gradio as gr
import huggingface_hub
import librosa
import spaces
import torch
from loguru import logger
from transformers import pipeline
# Suppress every Python warning process-wide.
warnings.filterwarnings("ignore")

# Log in only when a token is actually configured: login(token=None) falls
# back to an interactive prompt, which cannot be answered in a
# non-interactive run.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    huggingface_hub.login(token=_hf_token)
else:
    logger.warning("HF_TOKEN is not set; skipping Hugging Face Hub login")

# True when the SYSTEM environment variable is exactly "spaces".
is_hf = os.getenv("SYSTEM") == "spaces"
# Generation options forwarded unchanged to every pipeline call as
# ``generate_kwargs=...``. The demo page text repeats these values for
# display, so keep the two in sync when editing.
generate_kwargs: dict[str, object] = {
    "language": "Japanese",  # decode as Japanese
    "do_sample": False,  # no sampling
    "num_beams": 1,  # single beam
    "no_repeat_ngram_size": 0,  # 0 = no n-gram repetition constraint
    "max_new_tokens": 64,  # cap on generated tokens per request
}
# Short model name (used by the transcribe_* handlers) -> model id handed to
# ``pipeline(model=...)``.
model_dict: dict[str, str] = {
    # OpenAI Whisper checkpoints
    "whisper-large-v2": "openai/whisper-large-v2",
    "whisper-large-v3": "openai/whisper-large-v3",
    "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
    # Kotoba-Whisper checkpoints
    "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
    "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
    # The model this demo showcases
    "anime-whisper": "litagin/anime-whisper",
}
logger.info("Initializing pipelines...")

# Every pipeline is placed on the same device, so decide it once up front.
_device = "cuda" if torch.cuda.is_available() else "cpu"

# One speech-recognition pipeline per entry of model_dict, keyed by the same
# short name. All of them are built eagerly here, at import time.
pipe_dict = {
    short_name: pipeline(
        "automatic-speech-recognition",
        model=repo_id,
        device=_device,
    )
    for short_name, repo_id in model_dict.items()
}

logger.success("Pipelines initialized!")
@spaces.GPU
def transcribe_common(audio: str, model: str) -> tuple[str, float]:
    """Transcribe one audio file with the selected pipeline.

    Args:
        audio: Path to the audio file. A falsy value (nothing uploaded) is
            answered with a message instead of being processed.
        model: Key into ``pipe_dict`` selecting which pipeline to run.

    Returns:
        ``(text, seconds)`` with the transcription and the time spent in the
        pipeline call, or ``(message, 0)`` when there is no audio or the clip
        is longer than 15 seconds.

    Raises:
        KeyError: If ``model`` is not a key of ``pipe_dict``.
    """
    if not audio:
        return "No audio file", 0

    filename = Path(audio).name
    logger.info(f"Model: {model}")
    logger.info(f"Audio: {filename}")

    # Load as mono and resample to 16 kHz.
    y, sr = librosa.load(audio, mono=True, sr=16000)

    duration = librosa.get_duration(y=y, sr=sr)
    logger.info(f"Duration: {duration:.2f}s")
    if duration > 15:
        # Build the message once so the log line and the UI text cannot drift.
        message = f"Audio too long, limit is 15 seconds, got {duration:.2f}s"
        logger.error(message)
        return message, 0

    # perf_counter() is monotonic, so the measured interval is not distorted
    # by system clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
    elapsed = time.perf_counter() - start_time

    logger.success(f"Finished in {elapsed:.2f}s\n{result}")
    return result, elapsed
def transcribe_large_v2(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "whisper-large-v2" pipeline."""
    model_key = "whisper-large-v2"
    return transcribe_common(audio, model_key)
def transcribe_large_v3(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "whisper-large-v3" pipeline."""
    model_key = "whisper-large-v3"
    return transcribe_common(audio, model_key)
def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "whisper-large-v3-turbo" pipeline."""
    model_key = "whisper-large-v3-turbo"
    return transcribe_common(audio, model_key)
def transcribe_kotoba_v1(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "kotoba-whisper-v1.0" pipeline."""
    model_key = "kotoba-whisper-v1.0"
    return transcribe_common(audio, model_key)
def transcribe_kotoba_v2(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "kotoba-whisper-v2.0" pipeline."""
    model_key = "kotoba-whisper-v2.0"
    return transcribe_common(audio, model_key)
def transcribe_anime_whisper(audio) -> tuple[str, float]:
    """Run ``transcribe_common`` on *audio* with the "anime-whisper" pipeline."""
    model_key = "anime-whisper"
    return transcribe_common(audio, model_key)
# Markdown shown at the top of the demo page (passed to gr.Markdown below).
# The fenced snippet inside it repeats the generate_kwargs values purely for
# display; it is never evaluated, so it must be kept in sync with the real
# dict by hand.
initial_md = """
# Anime-Whisper Demo
- 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンしたモデルのお試し
- https://huggingface.co/litagin/anime-whisper
- デモでは**音声は15秒まで**しか受け付けません
- 日本語のみ対応 (Japanese only)
- 現在0.1エポックくらい
- 比較できるように他モデルもついでに試せる
pipeに渡しているkwargsは以下の最低限のもの:
```python
generate_kwargs = {
"language": "Japanese",
"do_sample": False,
"num_beams": 1,
"no_repeat_ngram_size": 0,
"max_new_tokens": 64,
}
```
"""
def _model_column(title: str):
    """Build one model's column: heading, button, and the two output boxes.

    Intended to be called while a ``gr.Row()`` context is open. Components are
    created in the order heading, button, time box, result box.

    Returns:
        ``(button, time_box, result_box)``.
    """
    with gr.Column():
        gr.Markdown(f"### {title}")
        trigger = gr.Button(f"Transcribe with {title}")
        time_box = gr.Textbox(label="Time taken")
        result_box = gr.Textbox(label="Result")
    return trigger, time_box, result_box


with gr.Blocks() as app:
    gr.Markdown(initial_md)
    # Single shared input; handlers receive the uploaded file's path.
    audio = gr.Audio(type="filepath")

    # Row 1: the showcased model on its own.
    with gr.Row():
        button_galgame, time_galgame, output_galgame = _model_column(
            "Anime-Whisper"
        )

    # Row 2: the OpenAI Whisper baselines.
    with gr.Row():
        button_v2, time_v2, output_v2 = _model_column("Whisper-Large-V2")
        button_v3, time_v3, output_v3 = _model_column("Whisper-Large-V3")
        button_v3_turbo, time_v3_turbo, output_v3_turbo = _model_column(
            "Whisper-Large-V3-Turbo"
        )

    # Row 3: the Kotoba-Whisper baselines.
    with gr.Row():
        button_kotoba_v1, time_kotoba_v1, output_kotoba_v1 = _model_column(
            "Kotoba-Whisper-V1.0"
        )
        button_kotoba_v2, time_kotoba_v2, output_kotoba_v2 = _model_column(
            "Kotoba-Whisper-V2.0"
        )

    # Each button feeds the shared audio input to its handler and writes the
    # handler's (text, seconds) result into that model's result / time boxes.
    for _button, _handler, _result_box, _time_box in (
        (button_v2, transcribe_large_v2, output_v2, time_v2),
        (button_v3, transcribe_large_v3, output_v3, time_v3),
        (button_v3_turbo, transcribe_large_v3_turbo, output_v3_turbo, time_v3_turbo),
        (button_kotoba_v1, transcribe_kotoba_v1, output_kotoba_v1, time_kotoba_v1),
        (button_kotoba_v2, transcribe_kotoba_v2, output_kotoba_v2, time_kotoba_v2),
        (button_galgame, transcribe_anime_whisper, output_galgame, time_galgame),
    ):
        _button.click(_handler, inputs=audio, outputs=[_result_box, _time_box])

# Disabled warm-up hook, kept for reference (`warmup` / `warmup_result` are
# not defined in the visible file):
# app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)
app.launch(inbrowser=True)