ganga4364 committed on
Commit
4ce0e75
·
verified ·
1 Parent(s): 3543a1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -71
app.py CHANGED
@@ -88,27 +88,15 @@ def build_html_output(s: str, style: str = "result_item_success"):
88
  </div>
89
  """
90
 
91
- def create_preview_player(audio_path, vtt_path):
92
  """Create an HTML preview with audio player and subtitles"""
93
- static_dir = Path("static")
94
- static_dir.mkdir(exist_ok=True)
95
-
96
- # Copy files to static directory with friendly names
97
- audio_filename = Path(audio_path).name
98
- vtt_filename = Path(vtt_path).name
99
- new_audio_path = static_dir / audio_filename
100
- new_vtt_path = static_dir / vtt_filename
101
-
102
- shutil.copy2(audio_path, new_audio_path)
103
- shutil.copy2(vtt_path, new_vtt_path)
104
-
105
- # Create direct HTML content
106
  html_content = f"""
107
  <div class="player-container">
108
  <h3>Audio Player with Subtitles</h3>
109
  <audio controls>
110
- <source src="file/{new_audio_path}" type="audio/wav">
111
- <track label="Tibetan" kind="subtitles" srclang="bo" src="file/{new_vtt_path}" default>
112
  Your browser does not support the audio element.
113
  </audio>
114
  </div>
@@ -123,72 +111,89 @@ def process_audio(audio_path: str):
123
  "Please upload an audio file first",
124
  "result_item_error",
125
  ),
126
- "",
127
- "",
128
  "",
129
  "",
130
  )
131
 
132
  logging.info(f"Processing audio file: {audio_path}")
133
 
134
- # Load and resample audio to 16kHz mono
135
- wav, sr = torchaudio.load(audio_path)
136
- if sr != SAMPLE_RATE:
137
- wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
138
- wav = wav.mean(dim=0) # convert to mono
139
- wav_np = wav.numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- # Get speech timestamps using Silero VAD
142
- speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
143
- if not speech_timestamps:
144
  return (
145
- build_html_output("No speech detected", "result_item_error"),
146
- "",
147
- "",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  "",
149
  "",
150
  )
151
 
152
- timestamps_with_text = []
153
- transcriptions = []
154
-
155
- for ts in speech_timestamps:
156
- start, end = ts['start'], ts['end']
157
- segment = wav[start:end]
158
- if segment.dim() > 1:
159
- segment = segment.squeeze()
160
-
161
- inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
162
- with torch.no_grad():
163
- logits = model(**inputs).logits
164
- predicted_ids = torch.argmax(logits, dim=-1)
165
- transcription = processor.decode(predicted_ids[0])
166
- transcriptions.append(transcription)
167
- timestamps_with_text.append((start, end, transcription))
168
-
169
- # Generate subtitle files
170
- base_path = os.path.splitext(audio_path)[0]
171
- srt_path = f"{base_path}.srt"
172
- vtt_path = f"{base_path}.vtt"
173
-
174
- create_subtitle_file(timestamps_with_text, srt_path, "srt")
175
- create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
176
-
177
- # Create preview player
178
- preview_html = create_preview_player(audio_path, vtt_path)
179
- all_text = " ".join(transcriptions)
180
-
181
- return (
182
- build_html_output(
183
- "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
184
- "result_item_success"
185
- ),
186
- srt_path,
187
- vtt_path,
188
- preview_html,
189
- all_text,
190
- )
191
-
192
  demo = gr.Blocks(css=css)
193
 
194
  with demo:
@@ -231,4 +236,4 @@ with demo:
231
  if __name__ == "__main__":
232
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
233
  logging.basicConfig(format=formatter, level=logging.INFO)
234
- demo.launch(share=True, file_directories=["static"])
 
88
  </div>
89
  """
90
 
91
+ def create_preview_player(audio_file, vtt_file):
92
  """Create an HTML preview with audio player and subtitles"""
93
+ # Create direct HTML content using the file components directly
 
 
 
 
 
 
 
 
 
 
 
 
94
  html_content = f"""
95
  <div class="player-container">
96
  <h3>Audio Player with Subtitles</h3>
97
  <audio controls>
98
+ <source src="{audio_file.name}" type="audio/wav">
99
+ <track label="Tibetan" kind="subtitles" srclang="bo" src="{vtt_file.name}" default>
100
  Your browser does not support the audio element.
101
  </audio>
102
  </div>
 
111
  "Please upload an audio file first",
112
  "result_item_error",
113
  ),
114
+ None,
115
+ None,
116
  "",
117
  "",
118
  )
119
 
120
  logging.info(f"Processing audio file: {audio_path}")
121
 
122
+ try:
123
+ # Load and resample audio to 16kHz mono
124
+ wav, sr = torchaudio.load(audio_path)
125
+ if sr != SAMPLE_RATE:
126
+ wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
127
+ wav = wav.mean(dim=0) # convert to mono
128
+ wav_np = wav.numpy()
129
+
130
+ # Get speech timestamps using Silero VAD
131
+ speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
132
+ if not speech_timestamps:
133
+ return (
134
+ build_html_output("No speech detected", "result_item_error"),
135
+ None,
136
+ None,
137
+ "",
138
+ "",
139
+ )
140
+
141
+ timestamps_with_text = []
142
+ transcriptions = []
143
+
144
+ for ts in speech_timestamps:
145
+ start, end = ts['start'], ts['end']
146
+ segment = wav[start:end]
147
+ if segment.dim() > 1:
148
+ segment = segment.squeeze()
149
+
150
+ inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
151
+ with torch.no_grad():
152
+ logits = model(**inputs).logits
153
+ predicted_ids = torch.argmax(logits, dim=-1)
154
+ transcription = processor.decode(predicted_ids[0])
155
+ transcriptions.append(transcription)
156
+ timestamps_with_text.append((start, end, transcription))
157
+
158
+ # Generate subtitle files
159
+ base_path = os.path.splitext(audio_path)[0]
160
+ srt_path = f"{base_path}.srt"
161
+ vtt_path = f"{base_path}.vtt"
162
+
163
+ create_subtitle_file(timestamps_with_text, srt_path, "srt")
164
+ create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
165
+
166
+ # Create file components for Gradio
167
+ srt_file = gr.File.update(value=srt_path)
168
+ vtt_file = gr.File.update(value=vtt_path)
169
+
170
+ # Create preview player
171
+ preview_html = create_preview_player(srt_file, vtt_file)
172
+ all_text = " ".join(transcriptions)
173
 
 
 
 
174
  return (
175
+ build_html_output(
176
+ "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
177
+ "result_item_success"
178
+ ),
179
+ srt_file,
180
+ vtt_file,
181
+ preview_html,
182
+ all_text,
183
+ )
184
+ except Exception as e:
185
+ logging.error(f"Error processing audio: {str(e)}")
186
+ return (
187
+ build_html_output(
188
+ f"Error processing audio: {str(e)}",
189
+ "result_item_error"
190
+ ),
191
+ None,
192
+ None,
193
  "",
194
  "",
195
  )
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  demo = gr.Blocks(css=css)
198
 
199
  with demo:
 
236
  if __name__ == "__main__":
237
  formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
238
  logging.basicConfig(format=formatter, level=logging.INFO)
239
+ demo.launch(share=True)