Update app.py
app.py
CHANGED
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import gradio as gr
 import torch
 import torchaudio
@@ -7,20 +9,47 @@ from datetime import timedelta
 import os
 import shutil
 from pathlib import Path
+import logging
 
-# Load Silero VAD
-vad_model, utils = torch.hub.load(
-    repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
-)
-(get_speech_ts, _, _, _, _) = utils
-
-# Load Wav2Vec2 model
-model_name = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
-model = Wav2Vec2ForCTC.from_pretrained(model_name)
-processor = Wav2Vec2Processor.from_pretrained(model_name)
-model.eval()
-
+# Constants and Configuration
+SAMPLE_RATE = 16000
+MODEL_NAME = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
+
+title = "# Tibetan Speech-to-Text with Subtitles"
+
+description = """
+This application transcribes Tibetan audio files and generates subtitles using:
+- Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
+- Silero VAD for voice activity detection
+- Generates both SRT and WebVTT subtitle formats
+"""
+
+css = """
+.result {display:flex;flex-direction:column}
+.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
+.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
+.result_item_error {background-color:#ff7070;color:white;align-self:start}
+.player-container {margin: 20px 0;}
+.player-container audio {width: 100%;}
+"""
+
+# Initialize models
+def init_models():
+    # Load Silero VAD
+    vad_model, utils = torch.hub.load(
+        repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
+    )
+    get_speech_ts = utils[0]
+
+    # Load Wav2Vec2 model
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
+    processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+    model.eval()
+
+    return vad_model, get_speech_ts, model, processor
+
+# Initialize models globally
+vad_model, get_speech_ts, model, processor = init_models()
 
 def format_timestamp(seconds, format_type="srt"):
     """Convert seconds to SRT or WebVTT timestamp format"""
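A note on the `utils` handling above: the snakers4/silero-vad hub entry point returns a `(model, utils)` pair whose first element is `get_speech_timestamps`, so the new `utils[0]` and the old five-way unpacking `(get_speech_ts, _, _, _, _) = utils` retrieve the same function. A minimal sketch of that VAD call in isolation (the silent five-second tensor is a stand-in input, not part of the app):

import torch

vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad', model='silero_vad', trust_repo=True
)
get_speech_ts = utils[0]  # get_speech_timestamps

wav = torch.zeros(16000 * 5)  # stand-in input: 5 s of silence at 16 kHz
# Each returned entry is a dict of sample offsets: {'start': ..., 'end': ...}
for seg in get_speech_ts(wav, vad_model, sampling_rate=16000):
    print(seg['start'] / 16000, seg['end'] / 16000)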
@@ -50,7 +79,16 @@ def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
         f.write(f"{format_timestamp(start_time/SAMPLE_RATE, 'vtt')} --> {format_timestamp(end_time/SAMPLE_RATE, 'vtt')}\n")
         f.write(f"{text}\n\n")
 
-def create_preview_html(audio_path, vtt_path):
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+            {s}
+        </div>
+    </div>
+    """
+
+def create_preview_player(audio_path, vtt_path):
     """Create an HTML preview with audio player and subtitles"""
     static_dir = Path("static")
     static_dir.mkdir(exist_ok=True)
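The body of `format_timestamp` sits outside every hunk, so the diff never shows it. For reference, a plausible implementation consistent with its call sites (the only difference between the two formats is the millisecond separator: SRT uses a comma, WebVTT a period):

def format_timestamp(seconds, format_type="srt"):
    # Plausible sketch of the unshown helper: HH:MM:SS,mmm for SRT,
    # HH:MM:SS.mmm for WebVTT.
    total = int(seconds)
    hours, rem = divmod(total, 3600)
    minutes, secs = divmod(rem, 60)
    millis = int((seconds - total) * 1000)
    sep = "," if format_type == "srt" else "."
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{sep}{millis:03d}"

print(format_timestamp(61.5, "srt"))  # 00:01:01,500
print(format_timestamp(61.5, "vtt"))  # 00:01:01.500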
@@ -68,9 +106,9 @@ def create_preview_html(audio_path, vtt_path):
     html_content = f"""
     <div class="player-container">
         <h3>Audio Player with Subtitles</h3>
-        <audio controls
+        <audio controls>
             <source src="file/{new_audio_path}" type="audio/wav">
-        <track label="
+            <track label="Tibetan" kind="subtitles" srclang="bo" src="file/{new_vtt_path}" default>
             Your browser does not support the audio element.
         </audio>
     </div>
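The new `<track>` element only works if the browser can fetch a well-formed WebVTT file, which is why the function copies both the audio and the `.vtt` into `static/` and references them through `file/` URLs. For reference, the kind of payload `create_subtitle_file(..., "vtt")` produces (cue text is illustrative):

from pathlib import Path

# A WebVTT file starts with the WEBVTT header, then blank-line-separated cues.
vtt = """WEBVTT

00:00:00.000 --> 00:00:02.500
first transcribed segment

00:00:03.000 --> 00:00:05.000
second transcribed segment
"""
Path("static").mkdir(exist_ok=True)
Path("static/example.vtt").write_text(vtt, encoding="utf-8")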
@@ -78,7 +116,21 @@ def create_preview_html(audio_path, vtt_path):
 
     return html_content
 
-def transcribe_with_vad(audio_path):
+def process_audio(audio_path: str):
+    if audio_path is None or audio_path == "":
+        return (
+            build_html_output(
+                "Please upload an audio file first",
+                "result_item_error",
+            ),
+            "",
+            "",
+            "",
+            "",
+        )
+
+    logging.info(f"Processing audio file: {audio_path}")
+
     # Load and resample audio to 16kHz mono
     wav, sr = torchaudio.load(audio_path)
     if sr != SAMPLE_RATE:
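The resample-and-downmix lines that follow (new lines 137-140) fall between hunks and are not shown. A sketch of what that step typically looks like with torchaudio, assuming the usual Resample transform (the file name is a placeholder):

import torchaudio

SAMPLE_RATE = 16000

wav, sr = torchaudio.load("input.wav")  # wav: (channels, samples)
if sr != SAMPLE_RATE:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(wav)
wav_np = wav.mean(dim=0).numpy()  # downmix to mono; VAD and CTC expect 1-D audio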
@@ -89,7 +141,13 @@ def transcribe_with_vad(audio_path):
     # Get speech timestamps using Silero VAD
     speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
     if not speech_timestamps:
-        return
+        return (
+            build_html_output("No speech detected", "result_item_error"),
+            "",
+            "",
+            "",
+            "",
+        )
 
     timestamps_with_text = []
     transcriptions = []
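The per-segment transcription loop (new lines 154-173) is likewise elided by the diff. It most likely follows the standard wav2vec2 CTC pattern: slice the waveform at each VAD span, run the model, argmax the logits, and decode. A self-contained sketch under that assumption (`transcribe_segments` is an illustrative name, not from the app):

import torch

def transcribe_segments(wav_np, speech_timestamps, model, processor, sample_rate=16000):
    """Decode each VAD segment with the CTC model; returns (spans, texts)."""
    timestamps_with_text, transcriptions = [], []
    for ts in speech_timestamps:
        # Slice the mono waveform at the VAD boundaries (sample offsets).
        segment = wav_np[ts['start']:ts['end']]
        inputs = processor(segment, sampling_rate=sample_rate, return_tensors="pt")
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        # Greedy CTC decoding: argmax over the vocabulary, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)[0]
        timestamps_with_text.append((ts['start'], ts['end'], text))
        transcriptions.append(text)
    return timestamps_with_text, transcriptions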
@@ -116,24 +174,61 @@ def transcribe_with_vad(audio_path):
     create_subtitle_file(timestamps_with_text, srt_path, "srt")
     create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
 
-    # Create preview
-    preview_html = create_preview_html(audio_path, vtt_path)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Create preview player
+    preview_html = create_preview_player(audio_path, vtt_path)
+    all_text = " ".join(transcriptions)
+
+    return (
+        build_html_output(
+            "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
+            "result_item_success"
+        ),
+        srt_path,
+        vtt_path,
+        preview_html,
+        all_text,
+    )
+
+demo = gr.Blocks(css=css)
+
+with demo:
+    gr.Markdown(title)
+
+    with gr.Tabs():
+        with gr.TabItem("Upload Audio"):
+            audio_input = gr.Audio(
+                sources=["upload"],
+                type="filepath",
+                label="Upload audio file",
+            )
+            process_button = gr.Button("Generate Subtitles")
+
+            with gr.Column():
+                info_output = gr.HTML(label="Status")
+                srt_output = gr.File(label="SRT Subtitle File")
+                vtt_output = gr.File(label="WebVTT Subtitle File")
+                preview_output = gr.HTML(label="Preview Player")
+                text_output = gr.Textbox(
+                    label="Full Transcription",
+                    placeholder="Transcribed text will appear here...",
+                    lines=5
+                )
+
+            process_button.click(
+                process_audio,
+                inputs=[audio_input],
+                outputs=[
+                    info_output,
+                    srt_output,
+                    vtt_output,
+                    preview_output,
+                    text_output,
+                ],
+            )
+
+    gr.Markdown(description)
 
 if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
     demo.launch(share=True, file_directories=["static"])
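`file_directories` is what exposes `static/` so the `file/...` URLs in the preview player resolve. On newer Gradio releases this launch parameter was replaced by `allowed_paths`, so an equivalent call there would be (a compatibility assumption, check against your Gradio version):

# Equivalent launch on newer Gradio, where allowed_paths replaced
# file_directories (assumption; verify for the installed version):
demo.launch(share=True, allowed_paths=["static"])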