"""Gradio demo for Dolphin ASR on Hugging Face Spaces, running on ZeroGPU."""
import os

import gradio as gr
import spaces  # Hugging Face ZeroGPU helper; provides the @spaces.GPU decorator

import dolphin
from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES

# Cache model checkpoints in a "models" directory next to this file.
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)
# Dropdown entries of the form ("<code>: <name>", "<code>"), sorted by label.
language_options = [
    (f"{code}: {name[0]}", code) for code, name in LANGUAGE_CODES.items()
]
language_options.sort(key=lambda x: x[0])
MODELS = {
    "base (140M)": "base",
    "small (372M)": "small",
}
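# Build a language -> regions index for the dependent region dropdown below.
# This assumes LANGUAGE_REGION_CODES keys follow a "<lang>-<region>" pattern
# (e.g. "zh-CN"), which is what the split on "-" relies on.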
language_to_regions = {}
for lang_region, names in LANGUAGE_REGION_CODES.items():
    if "-" in lang_region:
        lang, region = lang_region.split("-", 1)
        if lang not in language_to_regions:
            language_to_regions[lang] = []
        language_to_regions[lang].append((f"{region}: {names[0]}", region))
def update_regions(language):
    """Show and populate the region dropdown for the selected language."""
    if language and language in language_to_regions:
        regions = language_to_regions[language]
        regions.sort(key=lambda x: x[0])
        # gr.update works in both Gradio 3 and 4; gr.Dropdown.update was
        # removed in Gradio 4.
        return gr.update(choices=regions, value=regions[0][1], visible=True)
    return gr.update(choices=[], value=None, visible=False)
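# For illustration, assuming "zh-CN"-style keys: update_regions("zh") would
# return gr.update(choices=[("CN: ...", "CN"), ...], value=<first region>,
# visible=True), while clearing the language selection hides the dropdown.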
@spaces.GPU  # required on ZeroGPU Spaces so the function runs with CUDA attached
def transcribe_audio(audio_file, model_name, language, region,
                     predict_timestamps, padding_speech):
    model_key = MODELS[model_name]
    model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
    waveform = dolphin.load_audio(audio_file)
    kwargs = {
        "predict_time": predict_timestamps,
        "padding_speech": padding_speech,
    }
    # Only pass language/region symbols when the user picked them; otherwise
    # the model auto-detects both.
    if language:
        kwargs["lang_sym"] = language
    if region:
        kwargs["region_sym"] = region
    result = model(waveform, **kwargs)
    detected_info = f"Detected language: {result.language}" + (
        f", region: {result.region}" if result.region else ""
    )
    return result.text, detected_info
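# A minimal standalone sketch of the same dolphin API outside Gradio, with the
# call signatures mirroring transcribe_audio above ("speech.wav" and the
# "zh"/"CN" symbols are illustrative placeholders):
#
#     model = dolphin.load_model("small", MODEL_DIR, "cuda")
#     waveform = dolphin.load_audio("speech.wav")
#     result = model(waveform, lang_sym="zh", region_sym="CN", predict_time=True)
#     print(result.text, result.language, result.region)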
with gr.Blocks(title="Dolphin Speech Recognition") as demo:
    gr.Markdown("# Dolphin ASR")
    gr.Markdown("""
    A multilingual, multitask ASR model supporting 40 Eastern languages and
    22 Chinese dialects. The model comes from
    [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin) and targets
    speech recognition in Eastern languages, including Chinese, Japanese,
    Korean, and many more.
    """)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                type="filepath", label="Upload or Record Audio")
            with gr.Row():
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value=list(MODELS.keys())[1],  # default to "small (372M)"
                    label="Model Size"
                )
            with gr.Row():
                language_dropdown = gr.Dropdown(
                    choices=language_options,
                    value=None,
                    label="Language (Optional)",
                    info="If not selected, the model will auto-detect the language"
                )
                region_dropdown = gr.Dropdown(
                    choices=[],
                    value=None,
                    label="Region (Optional)",
                    visible=False
                )
            with gr.Row():
                timestamp_checkbox = gr.Checkbox(
                    value=True,
                    label="Include Timestamps"
                )
                padding_checkbox = gr.Checkbox(
                    value=True,
                    label="Pad Speech to 30s"
                )
            transcribe_button = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Transcription", lines=10)
            language_info = gr.Textbox(label="Detected Language", lines=1)
    language_dropdown.change(
        fn=update_regions,
        inputs=[language_dropdown],
        outputs=[region_dropdown]
    )
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[
            audio_input,
            model_dropdown,
            language_dropdown,
            region_dropdown,
            timestamp_checkbox,
            padding_checkbox
        ],
        outputs=[output_text, language_info]
    )
    gr.Examples(
        # examples=[...]  # the list of sample audio files is not included in
        # this listing; gr.Examples requires it, and cache_examples=True runs
        # transcribe_audio on each entry at startup to cache the outputs.
        inputs=[
            audio_input,
            model_dropdown,
            language_dropdown,
            region_dropdown,
            timestamp_checkbox,
            padding_checkbox
        ],
        outputs=[output_text, language_info],
        fn=transcribe_audio,
        cache_examples=True,
    )
gr.Markdown(""" | |
- The model supports 40 Eastern languages and 22 Chinese dialects | |
- You can let the model auto-detect language or specify language and region | |
- Timestamps can be included in the output | |
- Speech can be padded to 30 seconds for better processing | |
- Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin) | |
- Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212) | |
""") | |
demo.launch()