ganga4364 committed
Commit 0191635 · verified · 1 parent: 2a3adfa

Update app.py

Files changed (1):
  1. app.py +32 -163
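The commit strips the SRT/WebVTT subtitle pipeline (timestamp formatting, subtitle file writing, the HTML status and preview-player helpers, and the tabbed UI) from app.py, leaving a plain upload-and-transcribe Gradio app.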
app.py CHANGED
@@ -5,10 +5,6 @@ import torch
 import torchaudio
 import numpy as np
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from datetime import timedelta
-import os
-import shutil
-from pathlib import Path
 import logging
 
 # Constants and Configuration
@@ -17,22 +13,12 @@ CHUNK_SECONDS = 30 # Split audio into 30-second chunks
 CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_SECONDS
 MODEL_NAME = "openpecha/general_stt_base_model"
 
-title = "# Tibetan Speech-to-Text with Subtitles"
+title = "# Tibetan Speech-to-Text"
 
 description = """
-This application transcribes Tibetan audio files and generates subtitles using:
+This application transcribes Tibetan audio files using:
 - Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
 - 30-second fixed chunking for long audio processing
-- Generates both SRT and WebVTT subtitle formats
-"""
-
-css = """
-.result {display:flex;flex-direction:column}
-.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
-.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
-.result_item_error {background-color:#ff7070;color:white;align-self:start}
-.player-container {margin: 20px 0;}
-.player-container audio {width: 100%;}
 """
 
 # Initialize model
@@ -47,73 +33,9 @@ def init_model():
 # Initialize model globally
 model, processor = init_model()
 
-def format_timestamp(seconds, format_type="srt"):
-    """Convert seconds to SRT or WebVTT timestamp format"""
-    td = timedelta(seconds=seconds)
-    hours = td.seconds // 3600
-    minutes = (td.seconds % 3600) // 60
-    seconds = td.seconds % 60
-    milliseconds = round(td.microseconds / 1000)
-
-    if format_type == "srt":
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-    else:  # webvtt
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
-
-def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
-    """Create SRT or WebVTT subtitle file"""
-    with open(output_path, 'w', encoding='utf-8') as f:
-        if format_type == "vtt":
-            f.write("WEBVTT\n\n")
-
-        for i, (start_time, end_time, text) in enumerate(timestamps_with_text, 1):
-            if format_type == "srt":
-                f.write(f"{i}\n")
-                f.write(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n")
-                f.write(f"{text}\n\n")
-            else:
-                f.write(f"{format_timestamp(start_time, 'vtt')} --> {format_timestamp(end_time, 'vtt')}\n")
-                f.write(f"{text}\n\n")
-
-def build_html_output(s: str, style: str = "result_item_success"):
-    return f"""
-    <div class='result'>
-        <div class='result_item {style}'>
-            {s}
-        </div>
-    </div>
-    """
-
-def create_preview_player(audio_path, vtt_path):
-    # Create an HTML preview with audio player and subtitles
-    # Convert file paths to relative URLs that Gradio can serve
-    audio_url = f"file={audio_path}"
-    vtt_url = f"file={vtt_path}"
-
-    html_content = f"""
-    <div class="audio-player">
-        <audio controls style="width: 100%;">
-            <source src="{audio_url}" type="audio/wav">
-            <track kind="subtitles" src="{vtt_url}" default>
-            Your browser does not support the audio element.
-        </audio>
-    </div>
-    """
-
-    return html_content
-
 def process_audio(audio_path: str):
     if audio_path is None or audio_path == "":
-        return (
-            build_html_output(
-                "Please upload an audio file first",
-                "result_item_error",
-            ),
-            None,
-            None,
-            "",
-            "",
-        )
+        return "Please upload an audio file first"
 
     logging.info(f"Processing audio file: {audio_path}")
@@ -126,16 +48,11 @@ def process_audio(audio_path: str):
 
         # Split audio into 30-second chunks
         audio_length = wav.shape[0]
-        timestamps_with_text = []
         transcriptions = []
 
         for start_sample in range(0, audio_length, CHUNK_SAMPLES):
            end_sample = min(start_sample + CHUNK_SAMPLES, audio_length)
 
-            # Convert sample positions to seconds
-            start_time = start_sample / SAMPLE_RATE
-            end_time = end_sample / SAMPLE_RATE
-
            # Extract chunk
            chunk = wav[start_sample:end_sample]
@@ -153,92 +70,44 @@ def process_audio(audio_path: str):
            # Skip empty transcriptions
            if transcription.strip():
                transcriptions.append(transcription)
-                timestamps_with_text.append((start_time, end_time, transcription))
-
-        if not timestamps_with_text:
-            return (
-                build_html_output("No speech detected or recognized", "result_item_error"),
-                None,
-                None,
-                "",
-                "",
-            )
-
-        # Generate subtitle files
-        base_path = os.path.splitext(audio_path)[0]
-        srt_path = f"{base_path}.srt"
-        vtt_path = f"{base_path}.vtt"
-
-        create_subtitle_file(timestamps_with_text, srt_path, "srt")
-        create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
-
-        # Return the file paths directly
-        srt_file = srt_path
-        vtt_file = vtt_path
-
-        # Create preview player with the file paths
-        preview_html = create_preview_player(audio_path, vtt_path)
-        all_text = " ".join(transcriptions)
 
-        return (
-            build_html_output(
-                "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
-                "result_item_success"
-            ),
-            srt_file,
-            vtt_file,
-            preview_html,
-            all_text,
-        )
+        if not transcriptions:
+            return "No speech detected or recognized"
+
+        # Join all transcriptions
+        all_text = " ".join(transcriptions)
+        return all_text
+
     except Exception as e:
         logging.error(f"Error processing audio: {str(e)}")
-        return (
-            build_html_output(
-                f"Error processing audio: {str(e)}",
-                "result_item_error"
-            ),
-            None,
-            None,
-            "",
-            "",
-        )
+        return f"Error processing audio: {str(e)}"
 
-demo = gr.Blocks(css=css)
+demo = gr.Blocks()
 
 with demo:
     gr.Markdown(title)
 
-    with gr.Tabs():
-        with gr.TabItem("Upload Audio"):
-            audio_input = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Upload audio file",
-            )
-            process_button = gr.Button("Generate Subtitles")
-
-            with gr.Column():
-                info_output = gr.HTML(label="Status")
-                srt_output = gr.File(label="SRT Subtitle File")
-                vtt_output = gr.File(label="WebVTT Subtitle File")
-                preview_output = gr.HTML(label="Preview Player")
-                text_output = gr.Textbox(
-                    label="Full Transcription",
-                    placeholder="Transcribed text will appear here...",
-                    lines=5
-                )
-
-            process_button.click(
-                process_audio,
-                inputs=[audio_input],
-                outputs=[
-                    info_output,
-                    srt_output,
-                    vtt_output,
-                    preview_output,
-                    text_output,
-                ],
+    with gr.Row():
+        audio_input = gr.Audio(
+            sources=["upload"],
+            type="filepath",
+            label="Upload audio file",
         )
+
+    process_button = gr.Button("Transcribe Audio")
+
+    with gr.Row():
+        text_output = gr.Textbox(
+            label="Transcription",
+            placeholder="Transcribed text will appear here...",
+            lines=8
+        )
+
+    process_button.click(
+        process_audio,
+        inputs=[audio_input],
+        outputs=[text_output],
+    )
 
     gr.Markdown(description)
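The body of init_model() falls outside the diff context above. As orientation only, a minimal sketch of what a loader for this checkpoint typically looks like, assuming the standard transformers from_pretrained API; the actual code in app.py may differ:

```python
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_NAME = "openpecha/general_stt_base_model"

def init_model():
    # Sketch only: load the processor and CTC model once at startup,
    # move the model to the GPU when one is available, and set eval mode.
    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    return model, processor
```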
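The lines that produce `wav` (between the logging call and the chunking loop) are likewise hidden by the diff context. A plausible sketch, assuming torchaudio loading, mono downmix, and resampling to a 16 kHz SAMPLE_RATE (the constant's definition is also elided):

```python
import torchaudio

SAMPLE_RATE = 16000  # assumed value; the actual definition is not shown in the diff

def load_audio(audio_path: str):
    # Sketch only: read the file, collapse channels to mono, and
    # resample to the rate the Wav2Vec2 model expects.
    wav, source_rate = torchaudio.load(audio_path)
    wav = wav.mean(dim=0)
    if source_rate != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, source_rate, SAMPLE_RATE)
    return wav.numpy()
```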
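Similarly, the per-chunk inference between `chunk = wav[start_sample:end_sample]` and the empty-transcription check is not shown. Greedy CTC decoding for a Wav2Vec2 checkpoint conventionally looks like the following; treat it as an illustration, not the file's actual code:

```python
import torch

def transcribe_chunk(chunk, model, processor, sample_rate=16000):
    # Sketch only: featurize the chunk, run the CTC model, take the
    # argmax token path, and decode the token ids back to text.
    inputs = processor(chunk, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values.to(model.device)).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
```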