cantonese-call-transcriber

Sleeping

File size: 10,549 Bytes

a76b03e

import datetime
import math
import os

import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from pyannote.audio import Audio, Pipeline
from pyannote.core import Segment

# Load models
# Both models follow the same rule: CUDA when torch reports it, CPU otherwise.
_device = "cuda" if torch.cuda.is_available() else "cpu"

# SenseVoiceSmall is pulled from the Hugging Face hub ("hf").
# The optional VAD front-end (vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
# vad_kwargs={"max_single_segment_time": 30000}) is left disabled here.
model = AutoModel(model="FunAudioLLM/SenseVoiceSmall", hub="hf", device=_device)

# The diarization pipeline authenticates with the token in HF_TOKEN (None if unset).
pyannote_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
if _device == "cuda":
    pyannote_pipeline.to(torch.device("cuda"))

# Emoji dictionaries and formatting functions
#
# The keys below are the special "<|...|>" tokens that the formatting helpers
# look for in the recogniser's raw text (presumably emitted by the SenseVoice
# model loaded above -- confirm against the model documentation).

# Emotion token -> emoji. format_str_v2 iterates this dict in order to pick the
# dominant emotion (strictly greater count than the current pick), so the
# insertion order decides ties.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Audio-event token -> emoji. format_str_v2 prepends the emoji of every event
# that occurs at least once, in this iteration order.
# NOTE(review): "<|Cough|>" maps to "🤧" here but to "😷" in emoji_dict below;
# looks like an inconsistency -- confirm which one is intended.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# Every known special token -> replacement text. format_str substitutes the
# tokens in this order and format_str_v2 counts/removes them in this order, so
# the combined "<|nospeech|><|Event_UNK|>" entry must stay ahead of the
# individual "<|nospeech|>" and "<|Event_UNK|>" entries to be matched at all.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Language tokens. format_str_v3 only iterates the keys and replaces each one
# with the literal "<|lang|>" separator before splitting on it.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

# Emoji recognised as a trailing emotion marker / leading event marker by
# format_str_v2 (whitespace clean-up) and format_str_v3 (de-duplication).
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}


def format_str(s):
    """Return ``s`` with every token in ``emoji_dict`` swapped for its emoji.

    Tokens are substituted one after another in the dictionary's own order.
    """
    for token, replacement in emoji_dict.items():
        s = s.replace(token, replacement)
    return s


def format_str_v2(s):
    """Strip special tokens from ``s`` and summarise them as emoji.

    Every token listed in ``emoji_dict`` is counted and removed. The result is
    the remaining text, prefixed with the emoji of each audio event that
    occurred and suffixed with the emoji of the dominant emotion (neutral
    unless another emotion occurs strictly more often).
    """
    # Count then delete each token in turn; later counts see the text with the
    # earlier tokens already removed.
    occurrences = {}
    for token in emoji_dict:
        occurrences[token] = s.count(token)
        s = s.replace(token, "")

    dominant = "<|NEUTRAL|>"
    for candidate in emo_dict:
        if occurrences[candidate] > occurrences[dominant]:
            dominant = candidate

    for token, icon in event_dict.items():
        if occurrences[token] > 0:
            s = icon + s
    s += emo_dict[dominant]

    # Glue emoji to the neighbouring text: drop a single space on either side.
    for icon in emo_set | event_set:
        s = s.replace(" " + icon, icon).replace(icon + " ", icon)
    return s.strip()


def format_str_v3(s):
    """Convert raw tagged recogniser output into display text with emoji.

    The text is split at language tokens, each piece is cleaned with
    ``format_str_v2``, and the pieces are concatenated while collapsing
    repeated markers: a piece that starts with the same event emoji as the
    previous piece loses that emoji, and when a piece ends with the same
    emotion emoji as the text built so far, the earlier copy is dropped.

    Args:
        s: Raw text containing ``<|...|>`` special tokens.

    Returns:
        The cleaned string, stripped of surrounding whitespace.
    """

    def get_emo(text):
        # Empty text has no last character; treat it as "no emotion" instead
        # of raising IndexError.
        return text[-1] if text and text[-1] in emo_set else None

    def get_event(text):
        # Same guard for the leading event marker.
        return text[0] if text and text[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for segment in s_list[1:]:
        if not segment:
            continue
        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as the previous piece: drop the duplicate marker.
            # This can leave the piece empty, which the guarded helpers
            # above now tolerate.
            segment = segment[1:]
        cur_ent_event = get_event(segment)
        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Keep only the later of two identical emotion markers.
            new_s = new_s[:-1]
        new_s += segment.strip()
    # NOTE(review): "The." is blanked out unconditionally; presumably a known
    # spurious recogniser output -- confirm before relying on it.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()


def time_to_seconds(time_str):
    """Convert an ``"H:M:S"`` string to seconds, rounded to 9 decimal places.

    The string must contain exactly three colon-separated fields; hours and
    minutes are parsed as integers and seconds as a float.
    """
    hours, minutes, secs = time_str.split(":")
    whole_seconds = int(hours) * 3600 + int(minutes) * 60
    return round(whole_seconds + float(secs), 9)


import datetime


def parse_time(time_str):
    """Parse a time string into seconds.

    Accepted forms are ``"H:M:S"``, ``"M:S"`` and plain seconds, each with an
    optional trailing ``"s"`` (for example ``"1:02:03.5"``, ``"02:03"``,
    ``"12.345s"``). Surrounding whitespace is ignored.

    Args:
        time_str: The time string to parse.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If the string has more than three colon-separated fields
            or a field is not numeric.
    """
    # Strip whitespace first so a trailing space cannot hide the "s" suffix.
    parts = time_str.strip().rstrip("s").split(":")
    if len(parts) > 3:
        # Previously such input silently returned the first field as seconds.
        raise ValueError(f"Unrecognized time string: {time_str!r}")

    # Left-pad the missing hour/minute fields with zeros.
    hours, minutes, secs = ["0"] * (3 - len(parts)) + parts
    return int(hours) * 3600 + int(minutes) * 60 + float(secs)


def format_time(seconds, use_short_format=True):
    """Format a duration as a timestamp string with millisecond precision.

    Args:
        seconds: Duration in seconds, or a ``datetime.timedelta``.
        use_short_format: When true (the default), always return the total
            number of seconds followed by ``"s"`` (e.g. ``"75.000s"``).
            When false, durations under one minute still use that form,
            durations under one hour use ``"MM:SS.mmm"`` and longer ones
            use ``"HH:MM:SS.mmm"``.

    Returns:
        The formatted string.
    """
    if isinstance(seconds, datetime.timedelta):
        seconds = seconds.total_seconds()

    # Round to milliseconds up front so that e.g. 59.9996 carries into the
    # minutes field instead of being rendered as "60.000".
    total = round(seconds, 3)
    minutes, secs = divmod(total, 60)
    hours, minutes = divmod(int(minutes), 60)

    if use_short_format or (hours == 0 and minutes == 0):
        # The short form must show the *total* seconds; formatting the
        # remainder here would silently drop whole minutes.
        return f"{total:05.3f}s"
    if hours == 0:
        return f"{minutes:02d}:{secs:06.3f}"
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"


def format_time_with_leading_zeros(seconds):
    """Render ``seconds`` as ``"SS.mmms"``, zero-padded to at least two integer digits.

    Also prints a debug line showing the input and the formatted result.
    """
    formatted = format(seconds, "06.3f") + "s"
    print(f"Debug: Input seconds: {seconds}, Formatted output: {formatted}")
    return formatted


def generate_diarization(audio_path):
    """Run speaker diarization on an audio file and return the speaker turns.

    Each turn is printed and written, one per line, to ``mtr_dn.txt`` in the
    current working directory.

    Args:
        audio_path: Path of the audio file to diarize. If it does not exist,
            the sample file ``mtr.mp3`` is looked up in a few locations
            relative to this script and the working directory.

    Returns:
        A list of ``(start, end, duration, speaker)`` tuples. The three times
        are float seconds rounded to milliseconds; ``speaker`` is the label
        yielded by the pipeline.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
        RuntimeError: If the module-level diarization pipeline is not loaded.
        FileNotFoundError: If no candidate audio file exists.
    """
    # Get the Hugging Face token from the environment variable
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. Please set it with your Hugging Face token."
        )

    # Reuse the pipeline loaded once at import time (already moved to the GPU
    # when available) instead of re-loading the model on every call.
    # NOTE(review): Pipeline.from_pretrained is understood to return None when
    # the gated model cannot be fetched -- confirm against pyannote docs.
    pipeline = pyannote_pipeline
    if pipeline is None:
        raise RuntimeError(
            "The pyannote speaker-diarization pipeline could not be loaded."
        )

    # Initialize the audio processor
    audio = Audio(sample_rate=16000, mono=True)

    # The caller's path takes priority; the sample locations are only
    # fallbacks. (Trying the samples first would diarize the wrong file
    # whenever a sample happens to exist.)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    possible_paths = [
        audio_path,
        os.path.join(script_dir, "example", "mtr.mp3"),
        os.path.join(script_dir, "..", "example", "mtr.mp3"),
        os.path.join(script_dir, "mtr.mp3"),
        "mtr.mp3",
    ]
    file_path = next(
        (path for path in possible_paths if path and os.path.exists(path)), None
    )

    if file_path is None:
        print("Debugging information:")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Script directory: {script_dir}")
        print("Attempted paths:")
        for path in possible_paths:
            print(f"  {path}")
        raise FileNotFoundError(
            "Could not find the audio file. Please ensure it's in the correct location."
        )

    print(f"Using audio file: {file_path}")

    # Process the audio file
    waveform, sample_rate = audio(file_path)

    # Create a dictionary with the audio information
    file = {"waveform": waveform, "sample_rate": sample_rate, "uri": "mtr"}

    # Run the diarization
    output = pipeline(file)

    # Save results in human-readable format
    diarization_segments = []
    txt_file = "mtr_dn.txt"
    with open(txt_file, "w", encoding="utf-8") as f:
        for turn, _, speaker in output.itertracks(yield_label=True):
            start_time = format_time(turn.start)
            end_time = format_time(turn.end)
            duration = format_time(turn.end - turn.start)
            line = f"{start_time} - {end_time} ({duration}): {speaker}\n"
            f.write(line)
            print(line.strip())
            # Take the numeric values straight from the turn (rounded to
            # milliseconds) rather than re-parsing the display strings, so
            # the returned times never depend on the text formatting.
            diarization_segments.append(
                (
                    round(float(turn.start), 3),
                    round(float(turn.end), 3),
                    round(float(turn.end - turn.start), 3),
                    speaker,
                )
            )

    print(f"\nHuman-readable diarization results saved to {txt_file}")
    return diarization_segments


def _speaker_number(speaker):
    """Map a diarization label to a 1-based speaker number string.

    ``"SPEAKER_00"`` becomes ``"1"``, ``"SPEAKER_01"`` becomes ``"2"`` and so
    on. A label without a numeric suffix is returned unchanged.
    """
    suffix = str(speaker).rsplit("_", 1)[-1]
    return str(int(suffix) + 1) if suffix.isdigit() else str(speaker)


def process_audio(audio_path, language="yue", fs=16000):
    """Diarize and transcribe an audio file.

    The file is split into speaker turns with ``generate_diarization``; each
    turn is transcribed separately with the module-level ``model`` and cleaned
    with ``format_str_v3``.

    Args:
        audio_path: Path of the audio file.
        language: Language code passed to the recogniser (default ``"yue"``).
        fs: Sample rate, in Hz, that the audio is resampled to before
            transcription.

    Returns:
        One line per speaker turn, formatted as
        ``"<start> - <end> (<duration>) Speaker <n>: <text>"``, joined with
        newlines. Empty transcriptions are shown as ``[inaudible]`` and turns
        that are too short to transcribe as ``[too short]``.
    """
    # Generate diarization segments
    diarization_segments = generate_diarization(audio_path)

    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != fs:
        resampler = torchaudio.transforms.Resample(sample_rate, fs)
        waveform = resampler(waveform)

    # Down-mix to mono by averaging over the first (channel) axis.
    input_wav = waveform.mean(0).numpy()

    # Process the audio in chunks based on diarization segments
    results = []
    for start_time, end_time, duration, speaker in diarization_segments:
        # Convert time to sample indices
        start_sample = int(start_time * fs)
        end_sample = int(end_time * fs)
        chunk = input_wav[start_sample:end_sample]

        # A turn that rounds to zero samples has nothing to transcribe.
        if chunk.size == 0:
            results.append((speaker, start_time, end_time, duration, "[too short]"))
            continue

        try:
            output = model.generate(
                input=chunk,
                cache={},
                language=language,
                use_itn=True,
                batch_size_s=500,
                merge_vad=True,
            )
        except AssertionError as e:
            # Only the "segment too short" assertion is tolerated here;
            # anything else is a real failure and propagates.
            if "choose a window size" not in str(e):
                raise
            print(f"Warning: Audio segment too short to process. Skipping. Error: {e}")
            results.append((speaker, start_time, end_time, duration, "[too short]"))
        else:
            text = format_str_v3(output[0]["text"])

            # Handle empty transcriptions
            if not text.strip():
                text = "[inaudible]"

            results.append((speaker, start_time, end_time, duration, text))

    # Format the results
    lines = []
    for speaker, start, end, duration, text in results:
        start_str = format_time_with_leading_zeros(start)
        end_str = format_time_with_leading_zeros(end)
        duration_str = format_time_with_leading_zeros(duration)
        # Number speakers from their label so a third or later speaker is
        # not folded into "Speaker 2".
        speaker_num = _speaker_number(speaker)
        line = f"{start_str} - {end_str} ({duration_str}) Speaker {speaker_num}: {text}"
        lines.append(line)
        print(f"Debug: Formatted line: {line}")

    formatted_text = "".join(line + "\n" for line in lines)
    print("Debug: Full formatted text:")
    print(formatted_text)
    return formatted_text.strip()


if __name__ == "__main__":
    # Transcribe the sample recording as Cantonese ("yue"); replace the path
    # with your own audio file as needed.
    transcript = process_audio("example/mtr.mp3", "yue")

    # Save the result to mtr.txt
    output_path = "mtr.txt"
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(transcript)

    print(f"Diarization and transcription result has been saved to {output_path}")