Spaces:

openpecha
/

stt_demo

Sleeping

App Files Files Community

ganga4364 committed on Jul 22

Commit

4ce0e75

verified ·

1 Parent(s): 3543a1c

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -71

app.py CHANGED Viewed

@@ -88,27 +88,15 @@ def build_html_output(s: str, style: str = "result_item_success"):
     </div>
     """
-def create_preview_player(audio_path, vtt_path):
     """Create an HTML preview with audio player and subtitles"""
-    static_dir = Path("static")
-    static_dir.mkdir(exist_ok=True)
-    # Copy files to static directory with friendly names
-    audio_filename = Path(audio_path).name
-    vtt_filename = Path(vtt_path).name
-    new_audio_path = static_dir / audio_filename
-    new_vtt_path = static_dir / vtt_filename
-    shutil.copy2(audio_path, new_audio_path)
-    shutil.copy2(vtt_path, new_vtt_path)
-    # Create direct HTML content
     html_content = f"""
     <div class="player-container">
         <h3>Audio Player with Subtitles</h3>
         <audio controls>
-            <source src="file/{new_audio_path}" type="audio/wav">
-            <track label="Tibetan" kind="subtitles" srclang="bo" src="file/{new_vtt_path}" default>
             Your browser does not support the audio element.
         </audio>
     </div>
@@ -123,72 +111,89 @@ def process_audio(audio_path: str):
                 "Please upload an audio file first",
                 "result_item_error",
             ),
-            "",
-            "",
             "",
             "",
         )
     logging.info(f"Processing audio file: {audio_path}")
-    # Load and resample audio to 16kHz mono
-    wav, sr = torchaudio.load(audio_path)
-    if sr != SAMPLE_RATE:
-        wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
-    wav = wav.mean(dim=0)  # convert to mono
-    wav_np = wav.numpy()
-    # Get speech timestamps using Silero VAD
-    speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
-    if not speech_timestamps:
         return (
-            build_html_output("No speech detected", "result_item_error"),
-            "",
-            "",
             "",
             "",
         )
-    timestamps_with_text = []
-    transcriptions = []
-    for ts in speech_timestamps:
-        start, end = ts['start'], ts['end']
-        segment = wav[start:end]
-        if segment.dim() > 1:
-            segment = segment.squeeze()
-        inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
-        with torch.no_grad():
-            logits = model(**inputs).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = processor.decode(predicted_ids[0])
-        transcriptions.append(transcription)
-        timestamps_with_text.append((start, end, transcription))
-    # Generate subtitle files
-    base_path = os.path.splitext(audio_path)[0]
-    srt_path = f"{base_path}.srt"
-    vtt_path = f"{base_path}.vtt"
-    create_subtitle_file(timestamps_with_text, srt_path, "srt")
-    create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
-    # Create preview player
-    preview_html = create_preview_player(audio_path, vtt_path)
-    all_text = " ".join(transcriptions)
-    return (
-        build_html_output(
-            "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
-            "result_item_success"
-        ),
-        srt_path,
-        vtt_path,
-        preview_html,
-        all_text,
-    )
 demo = gr.Blocks(css=css)
 with demo:
@@ -231,4 +236,4 @@ with demo:
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
-    demo.launch(share=True, file_directories=["static"])

     </div>
     """
+def create_preview_player(audio_file, vtt_file):
     """Create an HTML preview with audio player and subtitles"""
+    # Create direct HTML content using the file components directly
     html_content = f"""
     <div class="player-container">
         <h3>Audio Player with Subtitles</h3>
         <audio controls>
+            <source src="{audio_file.name}" type="audio/wav">
+            <track label="Tibetan" kind="subtitles" srclang="bo" src="{vtt_file.name}" default>
             Your browser does not support the audio element.
         </audio>
     </div>
                 "Please upload an audio file first",
                 "result_item_error",
             ),
+            None,
+            None,
             "",
             "",
         )
     logging.info(f"Processing audio file: {audio_path}")
+    try:
+        # Load and resample audio to 16kHz mono
+        wav, sr = torchaudio.load(audio_path)
+        if sr != SAMPLE_RATE:
+            wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
+        wav = wav.mean(dim=0)  # convert to mono
+        wav_np = wav.numpy()
+        # Get speech timestamps using Silero VAD
+        speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
+        if not speech_timestamps:
+            return (
+                build_html_output("No speech detected", "result_item_error"),
+                None,
+                None,
+                "",
+                "",
+            )
+        timestamps_with_text = []
+        transcriptions = []
+        for ts in speech_timestamps:
+            start, end = ts['start'], ts['end']
+            segment = wav[start:end]
+            if segment.dim() > 1:
+                segment = segment.squeeze()
+            inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
+            with torch.no_grad():
+                logits = model(**inputs).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            transcription = processor.decode(predicted_ids[0])
+            transcriptions.append(transcription)
+            timestamps_with_text.append((start, end, transcription))
+        # Generate subtitle files
+        base_path = os.path.splitext(audio_path)[0]
+        srt_path = f"{base_path}.srt"
+        vtt_path = f"{base_path}.vtt"
+        create_subtitle_file(timestamps_with_text, srt_path, "srt")
+        create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
+        # Create file components for Gradio
+        srt_file = gr.File.update(value=srt_path)
+        vtt_file = gr.File.update(value=vtt_path)
+        # Create preview player
+        preview_html = create_preview_player(srt_file, vtt_file)
+        all_text = " ".join(transcriptions)
         return (
+            build_html_output(
+                "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
+                "result_item_success"
+            ),
+            srt_file,
+            vtt_file,
+            preview_html,
+            all_text,
+        )
+    except Exception as e:
+        logging.error(f"Error processing audio: {str(e)}")
+        return (
+            build_html_output(
+                f"Error processing audio: {str(e)}",
+                "result_item_error"
+            ),
+            None,
+            None,
             "",
             "",
         )
 demo = gr.Blocks(css=css)
 with demo:
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
+    demo.launch(share=True)