cantonese-call-transcriber

Sleeping

App Files Files Community

terry-li-hm committed on Sep 6, 2024

Commit

a76b03e

1 Parent(s): a08029c

`sv.py`

Browse files

Files changed (1) hide show

sv.py +357 -0

sv.py ADDED Viewed

	@@ -0,0 +1,357 @@

+import datetime
+import math
+import os
+import numpy as np
+import torch
+import torchaudio
+from funasr import AutoModel
+from pyannote.audio import Audio, Pipeline
+from pyannote.core import Segment
# Load models
# Both models are created once, at import time, and shared by the functions
# below. They run on the GPU when torch reports CUDA as available, else on CPU.
_device = "cuda" if torch.cuda.is_available() else "cpu"

# SenseVoice speech-recognition model, fetched from the Hugging Face hub.
model = AutoModel(
    model="FunAudioLLM/SenseVoiceSmall",
    # Optional VAD front-end, currently disabled:
    # vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    # vad_kwargs={"max_single_segment_time": 30000},
    hub="hf",
    device=_device,
)

# pyannote speaker-diarization pipeline; the access token is read from the
# HF_TOKEN environment variable.
pyannote_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
if _device == "cuda":
    pyannote_pipeline.to(torch.device("cuda"))
# Emoji dictionaries and formatting functions
#
# The tables below map the special "<|...|>" tokens found in the recognizer's
# raw text to the emoji (or empty string) shown in the final transcript.

# Emotion token -> emoji appended at the END of a segment by format_str_v2.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Audio-event token -> emoji prepended at the START of a segment by
# format_str_v2.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    # Was "🤧" (the sneeze emoji); aligned with emoji_dict and event_set so a
    # cough is distinguishable from a sneeze and "😷" can actually be emitted.
    "<|Cough|>": "😷",
}

# Every known special token -> replacement text. Iteration order matters for
# format_str / format_str_v2: the combined "<|nospeech|><|Event_UNK|>" entry
# must stay first so it is matched before its two component tokens.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Language tokens; format_str_v3 collapses each of them into the generic
# "<|lang|>" marker and then splits the text on that marker.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Emoji that count as an emotion marker (checked at the end of a segment) and
# as an event marker (checked at the start of a segment) respectively.
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
def format_str(s):
    """Return ``s`` with every special token swapped for its emoji.

    Tokens are substituted one after another in ``emoji_dict`` iteration
    order; tokens mapped to an empty string are simply removed.
    """
    for token, replacement in emoji_dict.items():
        s = s.replace(token, replacement)
    return s
def format_str_v2(s):
    """Summarise the special tokens of one segment as leading/trailing emoji.

    All tokens listed in ``emoji_dict`` are counted and stripped from ``s``.
    The emoji of every event token that occurred is prepended, and the emoji
    of the most frequent emotion token is appended (nothing is appended unless
    some emotion occurs more often than ``<|NEUTRAL|>``). Spaces directly next
    to an emotion or event emoji are removed and the result is stripped.
    """
    # Count and delete token by token, in table order, so that the combined
    # "<|nospeech|><|Event_UNK|>" token is consumed before its components.
    counts = {}
    for token in emoji_dict:
        counts[token] = s.count(token)
        s = s.replace(token, "")

    # Strict ">" keeps the earliest emotion on ties and NEUTRAL by default.
    dominant = "<|NEUTRAL|>"
    for candidate in emo_dict:
        if counts[candidate] > counts[dominant]:
            dominant = candidate

    for event_token, event_emoji in event_dict.items():
        if counts[event_token] > 0:
            s = event_emoji + s
    s += emo_dict[dominant]

    for symbol in emo_set.union(event_set):
        s = s.replace(" " + symbol, symbol).replace(symbol + " ", symbol)
    return s.strip()
def format_str_v3(s):
    """Format raw recognizer text that may contain several language segments.

    The text is split at every language token, each piece is formatted with
    ``format_str_v2``, and the pieces are concatenated again. While joining,
    an event emoji that merely repeats the previous piece's event is dropped,
    and a trailing emotion emoji is dropped from the accumulated text when the
    next piece ends with the same emotion.

    Args:
        s: Raw text including ``<|...|>`` special tokens.

    Returns:
        The formatted, whitespace-stripped transcript text.
    """

    def get_emo(text):
        # Empty strings have no last character; treat them as "no emotion".
        return text[-1] if text and text[-1] in emo_set else None

    def get_event(text):
        # Empty strings have no first character; treat them as "no event".
        return text[0] if text and text[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    new_s = " " + s_list[0]
    # NOTE(review): new_s starts with a space, so this is always None and the
    # first piece's event is never used for de-duplication - kept as is.
    cur_ent_event = get_event(new_s)
    for piece in s_list[1:]:
        if not piece:
            continue
        piece_event = get_event(piece)
        if piece_event is not None and piece_event == cur_ent_event:
            # May leave an empty piece when it held nothing but the emoji;
            # the helpers above cope with that instead of raising IndexError.
            piece = piece[1:]
        cur_ent_event = get_event(piece)
        piece_emo = get_emo(piece)
        if piece_emo is not None and piece_emo == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += piece.strip()

    # NOTE(review): presumably removes a spurious "The." the recognizer
    # tends to emit - confirm before changing.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
def time_to_seconds(time_str):
    """Convert an ``H:M:S`` string to seconds, rounded to 9 decimal places.

    Hours and minutes must be integers; the seconds field may be fractional.
    Raises ``ValueError`` unless the string has exactly three ``:``-separated
    numeric fields.
    """
    hours, minutes, secs = time_str.split(":")
    total = int(hours) * 3600 + int(minutes) * 60 + float(secs)
    return round(total, 9)
+import datetime
def parse_time(time_str):
    """Parse ``"H:M:S"``, ``"M:S"`` or ``"S"`` (optional ``s`` suffix) to seconds.

    Any other number of ``:``-separated fields is treated like the single
    field form, i.e. only the first field is read, as seconds.
    """
    # Drop the trailing "s" unit marker(s) before splitting into fields.
    fields = time_str.rstrip("s").split(":")
    field_count = len(fields)

    if field_count == 3:
        h, m, s = fields
    elif field_count == 2:
        h = "0"
        m, s = fields
    else:
        h, m, s = "0", "0", fields[0]

    return int(h) * 3600 + int(m) * 60 + float(s)
def format_time(seconds, use_short_format=True):
    """Render a duration as text.

    Args:
        seconds: Duration in seconds, or a ``datetime.timedelta``.
        use_short_format: When true, always render the TOTAL number of
            seconds as ``"<seconds>s"`` (e.g. ``"75.500s"``). When false,
            render ``"SS.mmms"`` below one minute, ``"MM:SS.mmm"`` below one
            hour and ``"HH:MM:SS.mmm"`` otherwise.

    Returns:
        The formatted string, with millisecond precision.
    """
    if isinstance(seconds, datetime.timedelta):
        seconds = seconds.total_seconds()

    if use_short_format:
        # Format the total, not the remainder after removing whole minutes:
        # the old code turned 75.5 s into "15.500s", which made every value
        # at or above one minute wrap around when it was parsed back.
        return f"{seconds:05.3f}s"

    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(int(minutes), 60)
    if hours == 0 and minutes == 0:
        return f"{secs:05.3f}s"
    if hours == 0:
        return f"{minutes:02d}:{secs:06.3f}"
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
def format_time_with_leading_zeros(seconds):
    """Format ``seconds`` as ``"SS.mmms"``, zero-padded to at least six characters.

    Also prints a debug line showing the input and the formatted result.
    """
    formatted = format(seconds, "06.3f") + "s"
    print(f"Debug: Input seconds: {seconds}, Formatted output: {formatted}")
    return formatted
def generate_diarization(audio_path):
    """Run speaker diarization on an audio file and log the speaker turns.

    The file at ``audio_path`` is used when it exists; only otherwise are the
    bundled ``mtr.mp3`` sample locations tried. Every speaker turn is printed
    and written to ``mtr_dn.txt`` in the current working directory.

    Args:
        audio_path: Path of the audio file to diarize.

    Returns:
        A list of ``(start, end, duration, speaker)`` tuples, one per speaker
        turn, with the three times given as floats in seconds.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
        FileNotFoundError: If neither ``audio_path`` nor a sample file exists.
    """
    # The diarization model needs a Hugging Face token; fail with a clear
    # message instead of an obscure pipeline error when it is missing.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. Please set it with your Hugging Face token."
        )

    # Initialize the audio processor
    audio = Audio(sample_rate=16000, mono=True)

    # Reuse the pipeline already loaded (and moved to the GPU when available)
    # at module import instead of loading a second copy on every call.
    pipeline = pyannote_pipeline

    # The caller's file comes first; the sample locations are fallbacks only.
    # Previously the samples were checked first, so an existing sample file
    # silently replaced the requested audio.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    possible_paths = [audio_path] if audio_path else []
    possible_paths += [
        os.path.join(script_dir, "example", "mtr.mp3"),
        os.path.join(script_dir, "..", "example", "mtr.mp3"),
        os.path.join(script_dir, "mtr.mp3"),
        "mtr.mp3",
    ]
    file_path = next((p for p in possible_paths if os.path.exists(p)), None)
    if file_path is None:
        print("Debugging information:")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Script directory: {script_dir}")
        print("Attempted paths:")
        for path in possible_paths:
            print(f"  {path}")
        raise FileNotFoundError(
            "Could not find the audio file. Please ensure it's in the correct location."
        )
    print(f"Using audio file: {file_path}")

    # Load the audio and run the diarization on the in-memory waveform.
    waveform, sample_rate = audio(file_path)
    file = {"waveform": waveform, "sample_rate": sample_rate, "uri": "mtr"}
    output = pipeline(file)

    # Save results in human-readable format
    diarization_segments = []
    txt_file = "mtr_dn.txt"
    with open(txt_file, "w", encoding="utf-8") as f:
        for turn, _, speaker in output.itertracks(yield_label=True):
            start = float(turn.start)
            end = float(turn.end)
            duration = end - start
            # The long format keeps minutes and hours, so the log is correct
            # for turns beyond the first minute as well.
            line = (
                f"{format_time(start, use_short_format=False)} - "
                f"{format_time(end, use_short_format=False)} "
                f"({format_time(duration, use_short_format=False)}): {speaker}\n"
            )
            f.write(line)
            print(line.strip())
            # Keep the numeric values directly; parsing the formatted strings
            # back was lossy and broke for times of one minute or more.
            diarization_segments.append((start, end, duration, speaker))
    print(f"\nHuman-readable diarization results saved to {txt_file}")
    return diarization_segments
def _speaker_number(label):
    """Return the 1-based speaker number for a label such as ``SPEAKER_00``."""
    suffix = label.rpartition("_")[2]
    try:
        return str(int(suffix) + 1)
    except ValueError:
        # Unexpected label shape: show the label itself rather than guessing.
        return label


def process_audio(audio_path, language="yue", fs=16000):
    """Diarize and transcribe an audio file, one line per speaker turn.

    Args:
        audio_path: Path of the audio file to process.
        language: Language code passed to the recognizer.
        fs: Sample rate, in Hz, the audio is resampled to before recognition.

    Returns:
        The transcript as text. Each line has the form
        ``"<start> - <end> (<duration>) Speaker <n>: <text>"``, where
        ``SPEAKER_00`` is shown as speaker 1, ``SPEAKER_01`` as speaker 2, and
        so on. Turns too short to recognize are marked ``[too short]`` and
        turns with empty output ``[inaudible]``.
    """
    # Generate diarization segments
    diarization_segments = generate_diarization(audio_path)

    # Load the audio, resample it if needed and mix it down to mono.
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != fs:
        waveform = torchaudio.transforms.Resample(sample_rate, fs)(waveform)
    input_wav = waveform.mean(0).numpy()

    # Transcribe each speaker turn separately.
    results = []
    for start_time, end_time, duration, speaker in diarization_segments:
        # Convert the turn boundaries from seconds to sample indices.
        chunk = input_wav[int(start_time * fs):int(end_time * fs)]
        try:
            output = model.generate(
                input=chunk,
                cache={},
                language=language,
                use_itn=True,
                batch_size_s=500,
                merge_vad=True,
            )
        except AssertionError as e:
            # The recognizer rejects very short chunks with this assertion;
            # anything else is a genuine error and must propagate.
            if "choose a window size" not in str(e):
                raise
            print(
                f"Warning: Audio segment too short to process. Skipping. Error: {e}"
            )
            text = "[too short]"
        else:
            text = format_str_v3(output[0]["text"])
            # Handle empty transcriptions
            if not text.strip():
                text = "[inaudible]"
        results.append((speaker, start_time, end_time, duration, text))

    # Format the results
    lines = []
    for speaker, start, end, duration, text in results:
        start_str = format_time_with_leading_zeros(start)
        end_str = format_time_with_leading_zeros(end)
        duration_str = format_time_with_leading_zeros(duration)
        # Number every speaker individually; previously all speakers other
        # than SPEAKER_00 were collapsed into "Speaker 2".
        speaker_num = _speaker_number(speaker)
        line = f"{start_str} - {end_str} ({duration_str}) Speaker {speaker_num}: {text}"
        lines.append(line)
        print(f"Debug: Formatted line: {line}")

    formatted_text = "".join(f"{line}\n" for line in lines)
    print("Debug: Full formatted text:")
    print(formatted_text)
    return formatted_text.strip()
if __name__ == "__main__":
    # Script entry point: transcribe the sample recording as Cantonese and
    # store the transcript next to the current working directory.
    output_path = "mtr.txt"
    result = process_audio(
        "example/mtr.mp3",  # Replace with your audio file path
        "yue",  # Set language to Cantonese
    )

    # Save the result to mtr.txt
    with open(output_path, "w", encoding="utf-8") as transcript_file:
        transcript_file.write(result)
    print(f"Diarization and transcription result has been saved to {output_path}")