Update app.py
app.py
CHANGED
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import gradio as gr
 import torch
 import torchaudio
@@ -7,20 +9,47 @@ from datetime import timedelta
 import os
 import shutil
 from pathlib import Path
+import logging
 
-# Load Silero VAD
-vad_model, utils = torch.hub.load(
-    repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
-)
-(get_speech_ts, _, _, _, _) = utils
-
-# Load Wav2Vec2 model
-model_name = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
-model = Wav2Vec2ForCTC.from_pretrained(model_name)
-processor = Wav2Vec2Processor.from_pretrained(model_name)
-model.eval()
-
+# Constants and Configuration
+SAMPLE_RATE = 16000
+MODEL_NAME = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
+
+title = "# Tibetan Speech-to-Text with Subtitles"
+
+description = """
+This application transcribes Tibetan audio files and generates subtitles using:
+- Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
+- Silero VAD for voice activity detection
+- Generates both SRT and WebVTT subtitle formats
+"""
+
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+.player-container {margin: 20px 0;}
+.player-container audio {width: 100%;}
+"""
+
+# Initialize models
+def init_models():
+    # Load Silero VAD
+    vad_model, utils = torch.hub.load(
+        repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
+    )
+    get_speech_ts = utils[0]
+
+    # Load Wav2Vec2 model
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+    model.eval()
+
+    return vad_model, get_speech_ts, model, processor
+
+# Initialize models globally
+vad_model, get_speech_ts, model, processor = init_models()
 
 def format_timestamp(seconds, format_type="srt"):
     """Convert seconds to SRT or WebVTT timestamp format"""
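A note on the `utils` handling above: the snakers4/silero-vad hub entry point returns a `(model, utils)` pair whose first element is `get_speech_timestamps`, so the new `utils[0]` and the old five-way unpacking `(get_speech_ts, _, _, _, _) = utils` retrieve the same function. A minimal sketch of that VAD call in isolation (the silent five-second tensor is a stand-in input, not part of the app):

import torch

vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
)
get_speech_ts = utils[0]  # get_speech_timestamps

wav = torch.zeros(16000 * 5)  # stand-in input: 5 s of silence at 16 kHz
# Each returned entry is a dict of sample offsets: {'start': ..., 'end': ...}
for seg in get_speech_ts(wav, vad_model, sampling_rate=16000):
    print(seg['start'] / 16000, seg['end'] / 16000)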
@@ -50,7 +79,16 @@ def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
         f.write(f"{format_timestamp(start_time/SAMPLE_RATE, 'vtt')} --> {format_timestamp(end_time/SAMPLE_RATE, 'vtt')}\n")
         f.write(f"{text}\n\n")
 
-def create_preview_html(audio_path, vtt_path):
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+            {s}
+        </div>
+    </div>
+    """
+
+def create_preview_player(audio_path, vtt_path):
     """Create an HTML preview with audio player and subtitles"""
     static_dir = Path("static")
     static_dir.mkdir(exist_ok=True)
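The body of `format_timestamp` sits outside every hunk, so the diff never shows it. For reference, a plausible implementation consistent with its call sites (the only difference between the two formats is the millisecond separator: SRT uses a comma, WebVTT a period):

def format_timestamp(seconds, format_type="srt"):
    # Plausible sketch of the unshown helper: HH:MM:SS,mmm for SRT,
    # HH:MM:SS.mmm for WebVTT.
    total = int(seconds)
    hours, rem = divmod(total, 3600)
    minutes, secs = divmod(rem, 60)
    millis = int((seconds - total) * 1000)
    sep = "," if format_type == "srt" else "."
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{sep}{millis:03d}"

print(format_timestamp(61.5, "srt"))  # 00:01:01,500
print(format_timestamp(61.5, "vtt"))  # 00:01:01.500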
@@ -68,9 +106,9 @@ def create_preview_html(audio_path, vtt_path):
     html_content = f"""
     <div class="player-container">
         <h3>Audio Player with Subtitles</h3>
-        <audio controls
+        <audio controls>
             <source src="file/{new_audio_path}" type="audio/wav">
-        <track label="
+            <track label="Tibetan" kind="subtitles" srclang="bo" src="file/{new_vtt_path}" default>
             Your browser does not support the audio element.
         </audio>
     </div>
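The new `<track>` element only works if the browser can fetch a well-formed WebVTT file, which is why the function copies both the audio and the `.vtt` into `static/` and references them through `file/` URLs. For reference, the kind of payload `create_subtitle_file(..., "vtt")` produces (cue text is illustrative):

from pathlib import Path

# A WebVTT file starts with the WEBVTT header, then blank-line-separated cues.
vtt = """WEBVTT

00:00:00.000 --> 00:00:02.500
first transcribed segment

00:00:03.000 --> 00:00:05.000
second transcribed segment
"""
Path("static").mkdir(exist_ok=True)
Path("static/example.vtt").write_text(vtt, encoding="utf-8")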
@@ -78,7 +116,21 @@ def create_preview_html(audio_path, vtt_path):
 
     return html_content
 
-def transcribe_with_vad(audio_path):
+def process_audio(audio_path: str):
+    if audio_path is None or audio_path == "":
+        return (
+            build_html_output(
+                "Please upload an audio file first",
+                "result_item_error",
+            ),
+            "",
+            "",
+            "",
+            "",
+        )
+
+    logging.info(f"Processing audio file: {audio_path}")
+
     # Load and resample audio to 16kHz mono
     wav, sr = torchaudio.load(audio_path)
     if sr != SAMPLE_RATE:
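The resample-and-downmix lines that follow (new lines 137-140) fall between hunks and are not shown. A sketch of what that step typically looks like with torchaudio, assuming the usual Resample transform (the file name is a placeholder):

import torchaudio

SAMPLE_RATE = 16000

wav, sr = torchaudio.load("input.wav")  # wav: (channels, samples)
if sr != SAMPLE_RATE:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(wav)
wav_np = wav.mean(dim=0).numpy()  # downmix to mono; VAD and CTC expect 1-D audio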
@@ -89,7 +141,13 @@ def transcribe_with_vad(audio_path):
     # Get speech timestamps using Silero VAD
     speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
     if not speech_timestamps:
-        return
+        return (
+            build_html_output("No speech detected", "result_item_error"),
+            "",
+            "",
+            "",
+            "",
+        )
 
     timestamps_with_text = []
     transcriptions = []
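The per-segment transcription loop (new lines 154-173) is likewise elided by the diff. It most likely follows the standard wav2vec2 CTC pattern: slice the waveform at each VAD span, run the model, argmax the logits, and decode. A self-contained sketch under that assumption (`transcribe_segments` is an illustrative name, not from the app):

import torch

def transcribe_segments(wav_np, speech_timestamps, model, processor, sample_rate=16000):
    """Decode each VAD segment with the CTC model; returns (spans, texts)."""
    timestamps_with_text, transcriptions = [], []
    for ts in speech_timestamps:
        # Slice the mono waveform at the VAD boundaries (sample offsets).
        segment = wav_np[ts['start']:ts['end']]
        inputs = processor(segment, sampling_rate=sample_rate, return_tensors="pt")
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        # Greedy CTC decoding: argmax over the vocabulary, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)[0]
        timestamps_with_text.append((ts['start'], ts['end'], text))
        transcriptions.append(text)
    return timestamps_with_text, transcriptions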
@@ -116,24 +174,61 @@ def transcribe_with_vad(audio_path):
     create_subtitle_file(timestamps_with_text, srt_path, "srt")
     create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
 
-    # Create preview
-    preview_html = create_preview_html(audio_path, vtt_path)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Create preview player
+    preview_html = create_preview_player(audio_path, vtt_path)
+    all_text = " ".join(transcriptions)
+
+    return (
+        build_html_output(
+            "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
+            "result_item_success"
+        ),
+        srt_path,
+        vtt_path,
+        preview_html,
+        all_text,
+    )
+
+demo = gr.Blocks(css=css)
+
+with demo:
+    gr.Markdown(title)
+
+    with gr.Tabs():
+        with gr.TabItem("Upload Audio"):
+            audio_input = gr.Audio(
+                sources=["upload"],
+                type="filepath",
+                label="Upload audio file",
+            )
+            process_button = gr.Button("Generate Subtitles")
+
+            with gr.Column():
+                info_output = gr.HTML(label="Status")
+                srt_output = gr.File(label="SRT Subtitle File")
+                vtt_output = gr.File(label="WebVTT Subtitle File")
+                preview_output = gr.HTML(label="Preview Player")
+                text_output = gr.Textbox(
+                    label="Full Transcription",
+                    placeholder="Transcribed text will appear here...",
+                    lines=5
+                )
+
+            process_button.click(
+                process_audio,
+                inputs=[audio_input],
+                outputs=[
+                    info_output,
+                    srt_output,
+                    vtt_output,
+                    preview_output,
+                    text_output,
+                ],
+            )
+
+    gr.Markdown(description)
 
 if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
     demo.launch(share=True, file_directories=["static"])
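`file_directories` is what exposes `static/` so the `file/...` URLs in the preview player resolve. On newer Gradio releases this launch parameter was replaced by `allowed_paths`, so an equivalent call there would be (a compatibility assumption, check against your Gradio version):

# Equivalent launch on newer Gradio, where allowed_paths replaced
# file_directories (assumption; verify for the installed version):
demo.launch(share=True, allowed_paths=["static"])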