ganga4364 committed on
Commit
3543a1c
·
verified ·
1 Parent(s): b25ec11

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -34
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
@@ -7,20 +9,47 @@ from datetime import timedelta
7
  import os
8
  import shutil
9
  from pathlib import Path
 
10
 
11
- # Load Silero VAD
12
- vad_model, utils = torch.hub.load(
13
- repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
14
- )
15
- (get_speech_ts, _, _, _, _) = utils
16
 
17
- # Load Wav2Vec2 model
18
- model_name = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
19
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
20
- processor = Wav2Vec2Processor.from_pretrained(model_name)
21
- model.eval()
22
 
23
- SAMPLE_RATE = 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def format_timestamp(seconds, format_type="srt"):
26
  """Convert seconds to SRT or WebVTT timestamp format"""
@@ -50,7 +79,16 @@ def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
50
  f.write(f"{format_timestamp(start_time/SAMPLE_RATE, 'vtt')} --> {format_timestamp(end_time/SAMPLE_RATE, 'vtt')}\n")
51
  f.write(f"{text}\n\n")
52
 
53
- def create_preview_html(audio_path, vtt_path):
 
 
 
 
 
 
 
 
 
54
  """Create an HTML preview with audio player and subtitles"""
55
  static_dir = Path("static")
56
  static_dir.mkdir(exist_ok=True)
@@ -68,9 +106,9 @@ def create_preview_html(audio_path, vtt_path):
68
  html_content = f"""
69
  <div class="player-container">
70
  <h3>Audio Player with Subtitles</h3>
71
- <audio controls style="width: 100%; margin: 10px 0;">
72
  <source src="file/{new_audio_path}" type="audio/wav">
73
- <track label="English" kind="subtitles" srclang="en" src="file/{new_vtt_path}" default>
74
  Your browser does not support the audio element.
75
  </audio>
76
  </div>
@@ -78,7 +116,21 @@ def create_preview_html(audio_path, vtt_path):
78
 
79
  return html_content
80
 
81
- def transcribe_with_vad(audio_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # Load and resample audio to 16kHz mono
83
  wav, sr = torchaudio.load(audio_path)
84
  if sr != SAMPLE_RATE:
@@ -89,7 +141,13 @@ def transcribe_with_vad(audio_path):
89
  # Get speech timestamps using Silero VAD
90
  speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
91
  if not speech_timestamps:
92
- return "No speech detected.", None, None, None
 
 
 
 
 
 
93
 
94
  timestamps_with_text = []
95
  transcriptions = []
@@ -116,24 +174,61 @@ def transcribe_with_vad(audio_path):
116
  create_subtitle_file(timestamps_with_text, srt_path, "srt")
117
  create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
118
 
119
- # Create preview HTML
120
- preview_html = create_preview_html(audio_path, vtt_path)
121
-
122
- return " ".join(transcriptions), srt_path, vtt_path, preview_html
123
-
124
- # Gradio Interface
125
- demo = gr.Interface(
126
- fn=transcribe_with_vad,
127
- inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload or Record"),
128
- outputs=[
129
- gr.Textbox(label="Transcription"),
130
- gr.File(label="SRT Subtitle File"),
131
- gr.File(label="WebVTT Subtitle File"),
132
- gr.HTML(label="Preview Player")
133
- ],
134
- title="Smart Speech-to-Text with VAD and Subtitles",
135
- description="Transcribe long audio using ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000 and Silero VAD. Generates SRT and WebVTT subtitle files."
136
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  if __name__ == "__main__":
 
 
139
  demo.launch(share=True, file_directories=["static"])
 
1
+ #!/usr/bin/env python3
2
+
3
  import gradio as gr
4
  import torch
5
  import torchaudio
 
9
  import os
10
  import shutil
11
  from pathlib import Path
12
+ import logging
13
 
14
+ # Constants and Configuration
15
+ SAMPLE_RATE = 16000
16
+ MODEL_NAME = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
 
 
17
 
18
+ title = "# Tibetan Speech-to-Text with Subtitles"
 
 
 
 
19
 
20
+ description = """
21
+ This application transcribes Tibetan audio files and generates subtitles using:
22
+ - Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
23
+ - Silero VAD for voice activity detection
24
+ - Generates both SRT and WebVTT subtitle formats
25
+ """
26
+
27
+ css = """
28
+ .result {display:flex;flex-direction:column}
29
+ .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
30
+ .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
31
+ .result_item_error {background-color:#ff7070;color:white;align-self:start}
32
+ .player-container {margin: 20px 0;}
33
+ .player-container audio {width: 100%;}
34
+ """
35
+
36
+ # Initialize models
37
+ def init_models():
38
+ # Load Silero VAD
39
+ vad_model, utils = torch.hub.load(
40
+ repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
41
+ )
42
+ get_speech_ts = utils[0]
43
+
44
+ # Load Wav2Vec2 model
45
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
46
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
47
+ model.eval()
48
+
49
+ return vad_model, get_speech_ts, model, processor
50
+
51
+ # Initialize models globally
52
+ vad_model, get_speech_ts, model, processor = init_models()
53
 
54
  def format_timestamp(seconds, format_type="srt"):
55
  """Convert seconds to SRT or WebVTT timestamp format"""
 
79
  f.write(f"{format_timestamp(start_time/SAMPLE_RATE, 'vtt')} --> {format_timestamp(end_time/SAMPLE_RATE, 'vtt')}\n")
80
  f.write(f"{text}\n\n")
81
 
82
+ def build_html_output(s: str, style: str = "result_item_success"):
83
+ return f"""
84
+ <div class='result'>
85
+ <div class='result_item {style}'>
86
+ {s}
87
+ </div>
88
+ </div>
89
+ """
90
+
91
+ def create_preview_player(audio_path, vtt_path):
92
  """Create an HTML preview with audio player and subtitles"""
93
  static_dir = Path("static")
94
  static_dir.mkdir(exist_ok=True)
 
106
  html_content = f"""
107
  <div class="player-container">
108
  <h3>Audio Player with Subtitles</h3>
109
+ <audio controls>
110
  <source src="file/{new_audio_path}" type="audio/wav">
111
+ <track label="Tibetan" kind="subtitles" srclang="bo" src="file/{new_vtt_path}" default>
112
  Your browser does not support the audio element.
113
  </audio>
114
  </div>
 
116
 
117
  return html_content
118
 
119
+ def process_audio(audio_path: str):
120
+ if audio_path is None or audio_path == "":
121
+ return (
122
+ build_html_output(
123
+ "Please upload an audio file first",
124
+ "result_item_error",
125
+ ),
126
+ "",
127
+ "",
128
+ "",
129
+ "",
130
+ )
131
+
132
+ logging.info(f"Processing audio file: {audio_path}")
133
+
134
  # Load and resample audio to 16kHz mono
135
  wav, sr = torchaudio.load(audio_path)
136
  if sr != SAMPLE_RATE:
 
141
  # Get speech timestamps using Silero VAD
142
  speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
143
  if not speech_timestamps:
144
+ return (
145
+ build_html_output("No speech detected", "result_item_error"),
146
+ "",
147
+ "",
148
+ "",
149
+ "",
150
+ )
151
 
152
  timestamps_with_text = []
153
  transcriptions = []
 
174
  create_subtitle_file(timestamps_with_text, srt_path, "srt")
175
  create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
176
 
177
+ # Create preview player
178
+ preview_html = create_preview_player(audio_path, vtt_path)
179
+ all_text = " ".join(transcriptions)
180
+
181
+ return (
182
+ build_html_output(
183
+ "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
184
+ "result_item_success"
185
+ ),
186
+ srt_path,
187
+ vtt_path,
188
+ preview_html,
189
+ all_text,
190
+ )
191
+
192
+ demo = gr.Blocks(css=css)
193
+
194
+ with demo:
195
+ gr.Markdown(title)
196
+
197
+ with gr.Tabs():
198
+ with gr.TabItem("Upload Audio"):
199
+ audio_input = gr.Audio(
200
+ sources=["upload"],
201
+ type="filepath",
202
+ label="Upload audio file",
203
+ )
204
+ process_button = gr.Button("Generate Subtitles")
205
+
206
+ with gr.Column():
207
+ info_output = gr.HTML(label="Status")
208
+ srt_output = gr.File(label="SRT Subtitle File")
209
+ vtt_output = gr.File(label="WebVTT Subtitle File")
210
+ preview_output = gr.HTML(label="Preview Player")
211
+ text_output = gr.Textbox(
212
+ label="Full Transcription",
213
+ placeholder="Transcribed text will appear here...",
214
+ lines=5
215
+ )
216
+
217
+ process_button.click(
218
+ process_audio,
219
+ inputs=[audio_input],
220
+ outputs=[
221
+ info_output,
222
+ srt_output,
223
+ vtt_output,
224
+ preview_output,
225
+ text_output,
226
+ ],
227
+ )
228
+
229
+ gr.Markdown(description)
230
 
231
  if __name__ == "__main__":
232
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
233
+ logging.basicConfig(format=formatter, level=logging.INFO)
234
  demo.launch(share=True, file_directories=["static"])