abreza committed
Commit 8d36f34 · Parent: 5a2ffa2

Add initial implementation of Dolphin ASR with Gradio interface and dependencies

Files changed (3)
  1. app.py +162 -0
  2. packages.txt +1 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,162 @@
+ import os
+ import gradio as gr
+ import spaces
+ import dolphin
+ from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES
+
+ MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ # (label, value) choices for the language dropdown.
+ language_options = [(f"{code}: {name[0]}", code)
+                     for code, name in LANGUAGE_CODES.items()]
+ language_options.sort(key=lambda x: x[0])
+
+ MODELS = {
+     "base (140M)": "base",
+     "small (372M)": "small",
+ }
+
+ # Map each language code to its available (label, region-code) choices.
+ language_to_regions = {}
+ for lang_region, names in LANGUAGE_REGION_CODES.items():
+     if "-" in lang_region:
+         lang, region = lang_region.split("-", 1)
+         if lang not in language_to_regions:
+             language_to_regions[lang] = []
+         language_to_regions[lang].append((f"{region}: {names[0]}", region))
+
+
+ def update_regions(language):
+     if language and language in language_to_regions:
+         regions = language_to_regions[language]
+         regions.sort(key=lambda x: x[0])
+         # gr.Dropdown.update() was removed in Gradio 4; gr.update() works
+         # across Gradio 3 and 4+.
+         return gr.update(choices=regions, value=regions[0][1], visible=True)
+     return gr.update(choices=[], value=None, visible=False)
+
+
+ @spaces.GPU
+ def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
+     model_key = MODELS[model_name]
+     # The model is reloaded on every request; a caching sketch follows the diff.
+     model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
+
+     waveform = dolphin.load_audio(audio_file)
+
+     kwargs = {
+         "predict_time": predict_timestamps,
+         "padding_speech": padding_speech
+     }
+
+     if language:
+         kwargs["lang_sym"] = language
+     if region:
+         kwargs["region_sym"] = region
+
+     result = model(waveform, **kwargs)
+
+     detected_info = f"Detected language: {result.language}" + \
+         (f", region: {result.region}" if result.region else "")
+     return result.text, detected_info
+
+
+ with gr.Blocks(title="Dolphin Speech Recognition") as demo:
+     gr.Markdown("# Dolphin ASR")
+     gr.Markdown("""
+ A multilingual, multitask ASR model supporting 40 Eastern languages and 22 Chinese dialects.
+
+ The model comes from [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin) and performs speech recognition
+ in Eastern languages including Chinese, Japanese, Korean, and many more.
+ """)
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 type="filepath", label="Upload or Record Audio")
+
+             with gr.Row():
+                 model_dropdown = gr.Dropdown(
+                     choices=list(MODELS.keys()),
+                     value=list(MODELS.keys())[1],
+                     label="Model Size"
+                 )
+
+             with gr.Row():
+                 language_dropdown = gr.Dropdown(
+                     choices=language_options,
+                     value=None,
+                     label="Language (Optional)",
+                     info="If not selected, the model will auto-detect the language"
+                 )
+                 region_dropdown = gr.Dropdown(
+                     choices=[],
+                     value=None,
+                     label="Region (Optional)",
+                     visible=False
+                 )
+
+             with gr.Row():
+                 timestamp_checkbox = gr.Checkbox(
+                     value=True,
+                     label="Include Timestamps"
+                 )
+                 padding_checkbox = gr.Checkbox(
+                     value=True,
+                     label="Pad Speech to 30s"
+                 )
+
+             transcribe_button = gr.Button("Transcribe", variant="primary")
+
+         with gr.Column():
+             output_text = gr.Textbox(label="Transcription", lines=10)
+             language_info = gr.Textbox(label="Detected Language", lines=1)
+
+     # Show the region dropdown only for languages that have regional variants.
+     language_dropdown.change(
+         fn=update_regions,
+         inputs=[language_dropdown],
+         outputs=[region_dropdown]
+     )
+
+     transcribe_button.click(
+         fn=transcribe_audio,
+         inputs=[
+             audio_input,
+             model_dropdown,
+             language_dropdown,
+             region_dropdown,
+             timestamp_checkbox,
+             padding_checkbox
+         ],
+         outputs=[output_text, language_info]
+     )
+
+     # NOTE: gr.Examples requires an `examples` list of sample inputs, and no
+     # example audio is bundled with this commit, so the call below would fail
+     # at startup. It is left disabled until example files are added (the path
+     # in the first argument is a placeholder).
+     # gr.Examples(
+     #     examples=[["examples/sample.wav", "small (372M)", None, None, True, True]],
+     #     inputs=[
+     #         audio_input,
+     #         model_dropdown,
+     #         language_dropdown,
+     #         region_dropdown,
+     #         timestamp_checkbox,
+     #         padding_checkbox
+     #     ],
+     #     outputs=[output_text, language_info],
+     #     fn=transcribe_audio,
+     #     cache_examples=True,
+     # )
+
+     gr.Markdown("""
+ ### Notes
+ - The model supports 40 Eastern languages and 22 Chinese dialects
+ - You can let the model auto-detect the language or specify a language and region
+ - Timestamps can be included in the output
+ - Speech can be padded to 30 seconds for better processing
+
+ ### Links
+ - Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
+ - Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
+ """)
+
+ demo.launch()
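
`transcribe_audio` reloads the model on every request, so each click pays the full `dolphin.load_model` cost. A minimal sketch of one way to reuse a loaded model across requests, caching one instance per size with `functools.lru_cache`. This is an optional refinement, not part of the commit, and it assumes the loaded model is safe to keep around between calls (on ZeroGPU Spaces the per-request reload may be deliberate):

```python
import functools

import dolphin

MODEL_DIR = "./models"  # same directory app.py resolves next to the script


@functools.lru_cache(maxsize=2)  # at most one cached instance per model size
def get_model(model_key: str):
    # First call per key ("base" or "small") loads the model; later calls
    # return the cached instance instead of reloading it.
    return dolphin.load_model(model_key, MODEL_DIR, "cuda")
```

With this helper, the `dolphin.load_model(model_key, MODEL_DIR, "cuda")` line inside `transcribe_audio` becomes `get_model(model_key)`.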
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ dataoceanai-dolphin  # provides the `dolphin` package imported in app.py
+ gradio
+ espnet==202402
+ modelscope
+ torch
+ typeguard