cantonese-call-transcriber

Running

App Files Files Community

terry-li-hm committed on Sep 6, 2024

Commit

103d57b

1 Parent(s): 794435e

Update

Browse files

Files changed (1) hide show

app.py +23 -55

app.py CHANGED Viewed

@@ -1,87 +1,56 @@
 # coding=utf-8
-import base64
-import io
-import os
-import re
-import tempfile
 import gradio as gr
-import librosa
 import numpy as np
 import soundfile as sf
 import spaces
 import torch
 import torchaudio
-from funasr import AutoModel
-from sv import clean_and_emoji_annotate_speech, process_audio
 @spaces.GPU
-def model_inference(input_wav, language, fs=16000):
-    language_abbr = {
-        "auto": "auto",
-        "zh": "zh",
-        "en": "en",
-        "yue": "yue",
-        "ja": "ja",
-        "ko": "ko",
-        "nospeech": "nospeech",
-    }
-    language = "auto" if len(language) < 1 else language
-    selected_language = language_abbr[language]
     # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
-        if len(input_wav.shape) > 1:
-            input_wav = input_wav.mean(-1)
         if fs != 16000:
             resampler = torchaudio.transforms.Resample(fs, 16000)
-            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
-            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
-    # Save the input audio to a temporary file
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-        sf.write(temp_audio.name, input_wav, 16000)
-        temp_audio_path = temp_audio.name
-    try:
-        # Process the audio using the function from sv.py
-        result = process_audio(temp_audio_path, language=selected_language)
-    finally:
-        # Remove the temporary audio file
-        os.remove(temp_audio_path)
     return result
-audio_examples = [
-    ["example/mtr.mp3", "auto"],
-]
 def launch():
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Row():
             with gr.Column():
                 audio_inputs = gr.Audio(label="Upload audio or use the microphone")
-                with gr.Accordion("Configuration"):
-                    language_inputs = gr.Dropdown(
-                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
-                        value="auto",
-                        label="Language",
-                    )
                 fn_button = gr.Button("Start", variant="primary")
                 text_outputs = gr.Textbox(label="Results")
-            gr.Examples(
-                examples=audio_examples,
-                inputs=[audio_inputs, language_inputs],
-                examples_per_page=20,
-            )
         fn_button.click(
             model_inference,
@@ -93,5 +62,4 @@ def launch():
 if __name__ == "__main__":
-    # iface.launch()
     launch()

 # coding=utf-8
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import spaces
 import torch
 import torchaudio
+from sv import process_audio
 @spaces.GPU
 def model_inference(input_wav, language):
     """Run ``process_audio`` on an audio clip and return its result.
     Args:
         input_wav: Either a ``(sample_rate, samples)`` tuple or an array of
             samples. Tuples are scaled by the int16 maximum, averaged over
             the last axis when multi-dimensional, and resampled to 16 kHz.
             Any other non-None value is written out unchanged.
         language: Language code forwarded to ``process_audio``; a falsy
             value is replaced by ``"auto"``.
     Returns:
         Whatever ``process_audio`` returns for the written WAV file.
     Raises:
         gr.Error: If ``input_wav`` is ``None`` (no audio was supplied).
     """
     # Imported locally so the function does not depend on edits to the
     # file-level import block.
     import os
     import tempfile
     if input_wav is None:
         # Surface a readable message instead of failing inside the WAV writer.
         raise gr.Error("No audio provided. Upload or record a clip first.")
     language = language if language else "auto"
     # Handle input_wav format
     if isinstance(input_wav, tuple):
         fs, input_wav = input_wav
         # NOTE(review): the scaling assumes int16 samples -- confirm against
         # what the gr.Audio component actually delivers.
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
         if input_wav.ndim > 1:
             # Collapse the last axis (presumably channels) to a single track.
             input_wav = input_wav.mean(-1)
         if fs != 16000:
             resampler = torchaudio.transforms.Resample(fs, 16000)
             waveform = torch.from_numpy(input_wav).float()[None, :]
             input_wav = resampler(waveform)[0].numpy()
     # A per-call temporary directory replaces the fixed "temp.wav" path:
     # concurrent requests can no longer overwrite each other's audio, and the
     # file is removed even when process_audio raises.
     with tempfile.TemporaryDirectory() as tmp_dir:
         wav_path = os.path.join(tmp_dir, "input.wav")
         with sf.SoundFile(wav_path, "w", samplerate=16000, channels=1) as f:
             f.write(input_wav)
         return process_audio(wav_path, language=language)
 def launch():
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Row():
             with gr.Column():
                 audio_inputs = gr.Audio(label="Upload audio or use the microphone")
+                language_inputs = gr.Dropdown(
+                    choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
+                    value="auto",
+                    label="Language",
+                )
                 fn_button = gr.Button("Start", variant="primary")
                 text_outputs = gr.Textbox(label="Results")
+        gr.Examples(
+            examples=[["example/mtr.mp3", "yue"]],
+            inputs=[audio_inputs, language_inputs],
+            examples_per_page=20,
+        )
         fn_button.click(
             model_inference,
 if __name__ == "__main__":
     launch()