File size: 5,336 Bytes
3fbd296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr
import numpy as np
import pandas as pd
import torch
import torchaudio
import time

from transformers import pipeline
# from speechbrain.inference.VAD import VAD
from speechbrain.inference.classifiers import EncoderClassifier

# Identifiers of the pretrained models this app loads at import time.
ASR_MODEL_ID = "openai/whisper-tiny"
LANG_ID_SOURCE = "speechbrain/lang-id-voxlingua107-ecapa"

# Speech-to-text pipeline used to transcribe each audio chunk.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_ID,
)
# Voice-activity detection is currently disabled; kept for reference:
# VAD = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir="pretrained_models/vad-crdnn-libriparty")

# Spoken-language identification model applied to each audio chunk.
language_id = EncoderClassifier.from_hparams(
    source=LANG_ID_SOURCE,
)

# Module-level state shared across calls to process_audio():
#   data          - one result row (dict) per processed chunk
#   current_chunk - list of arrays holding audio not yet long enough to process
data, current_chunk = [], []
# Language names in class-index order: position i in this tuple is the name
# reported for class index i of the language-identification output.
_LANGUAGE_NAMES = (
    'Abkhazian', 'Afrikaans', 'Amharic', 'Arabic', 'Assamese',                # 0-4
    'Azerbaijani', 'Bashkir', 'Belarusian', 'Bulgarian', 'Bengali',           # 5-9
    'Tibetan', 'Breton', 'Bosnian', 'Catalan', 'Cebuano',                     # 10-14
    'Czech', 'Welsh', 'Danish', 'German', 'Greek',                            # 15-19
    'English', 'Esperanto', 'Spanish', 'Estonian', 'Basque',                  # 20-24
    'Persian', 'Finnish', 'Faroese', 'French', 'Galician',                    # 25-29
    'Guarani', 'Gujarati', 'Manx', 'Hausa', 'Hawaiian',                       # 30-34
    'Hindi', 'Croatian', 'Haitian', 'Hungarian', 'Armenian',                  # 35-39
    'Interlingua', 'Indonesian', 'Icelandic', 'Italian', 'Hebrew',            # 40-44
    'Japanese', 'Javanese', 'Georgian', 'Kazakh', 'Central Khmer',            # 45-49
    'Kannada', 'Korean', 'Latin', 'Luxembourgish', 'Lingala',                 # 50-54
    'Lao', 'Lithuanian', 'Latvian', 'Malagasy', 'Maori',                      # 55-59
    'Macedonian', 'Malayalam', 'Mongolian', 'Marathi', 'Malay',               # 60-64
    'Maltese', 'Burmese', 'Nepali', 'Dutch', 'Norwegian Nynorsk',             # 65-69
    'Norwegian', 'Occitan', 'Panjabi', 'Polish', 'Pushto',                    # 70-74
    'Portuguese', 'Romanian', 'Russian', 'Sanskrit', 'Scots',                 # 75-79
    'Sindhi', 'Sinhala', 'Slovak', 'Slovenian', 'Shona',                      # 80-84
    'Somali', 'Albanian', 'Serbian', 'Sundanese', 'Swedish',                  # 85-89
    'Swahili', 'Tamil', 'Telugu', 'Tajik', 'Thai',                            # 90-94
    'Turkmen', 'Tagalog', 'Turkish', 'Tatar', 'Ukrainian',                    # 95-99
    'Urdu', 'Uzbek', 'Vietnamese', 'Waray', 'Yiddish',                        # 100-104
    'Yoruba', 'Chinese',                                                      # 105-106
)

# Class index -> human-readable language name.
index_to_lang = dict(enumerate(_LANGUAGE_NAMES))

# Class indices of the two languages compared head-to-head in process_audio().
lang_index_JA_EN = {
    code: _LANGUAGE_NAMES.index(name)
    for code, name in (('ja', 'Japanese'), ('en', 'English'))
}

def resample_audio(audio, orig_sr, target_sr=16000):
    """Resample a NumPy waveform from `orig_sr` to `target_sr`.

    Args:
        audio: NumPy array holding the waveform. A 1-D array is treated as a
            mono signal. A 2-D array is assumed to be laid out as
            (samples, channels) - TODO confirm against callers - and each
            channel is resampled independently along the sample axis.
        orig_sr: Sample rate of `audio`, in Hz.
        target_sr: Desired sample rate, in Hz.

    Returns:
        `audio` itself (unchanged, original dtype) when the two rates are
        equal; otherwise a new float32 array at `target_sr` with the same
        number of dimensions as the input.
    """
    if orig_sr == target_sr:
        return audio

    print(f"Resampling audio from {orig_sr} to {target_sr}")
    waveform = torch.from_numpy(audio.astype(np.float32))

    # The resampler works along the LAST dimension, so time must come last.
    # Previously a (samples, channels) array was resampled across channels.
    channels_last = waveform.dim() == 2
    if channels_last:
        waveform = waveform.T.contiguous()  # -> (channels, samples)
    else:
        waveform = waveform.unsqueeze(0)  # -> (1, samples)

    resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    resampled = resampler(waveform)

    if channels_last:
        resampled = resampled.T.contiguous()  # back to (samples, channels)
    else:
        resampled = resampled.squeeze(0)
    return resampled.numpy()


SAMPLING_RATE = 16000  # sample rate (Hz) all audio is converted to before analysis
CHUNK_DURATION = 5  # length of each analysed chunk, in seconds


def _to_mono_float32(audio_data):
    """Return `audio_data` as a 1-D float32 waveform.

    Integer PCM is scaled by the dtype's full-scale value so samples land in
    roughly [-1.0, 1.0]; floating-point input is only cast. Arrays with more
    than one dimension are assumed to be (samples, channels) - TODO confirm -
    and are averaged across channels.
    """
    if np.issubdtype(audio_data.dtype, np.integer):
        info = np.iinfo(audio_data.dtype)
        full_scale = max(abs(info.min), info.max)
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        audio_data = audio_data.astype(np.float32)
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    return audio_data


def process_audio(audio):
    """Analyse incoming audio in fixed-length chunks and stream the results.

    The new audio is converted to mono float32 at SAMPLING_RATE, appended to
    whatever was left over from earlier calls (module-level `current_chunk`),
    and consumed CHUNK_DURATION seconds at a time. For each full chunk the
    function runs language identification and transcription, appends one row
    to the module-level `data` list, and yields. Audio shorter than one chunk
    is kept in `current_chunk` for the next call.

    Args:
        audio: `(sample_rate, samples)` tuple where `samples` is a NumPy
            array, or None (e.g. when there is no input), in which case
            nothing is yielded.

    Yields:
        `((SAMPLING_RATE, chunk), df)` where `chunk` is the float32 waveform
        just analysed and `df` is a DataFrame of every row collected so far.
    """
    global data, current_chunk
    print("Process_audio")
    print(audio)
    if audio is None:
        # Nothing to process; unpacking None below would raise TypeError.
        return
    sr, audio_data = audio

    print(audio_data.shape)
    # Normalise dtype/channels first so the models never receive integer or
    # multi-channel input, then bring everything to a common sampling rate.
    audio_data = _to_mono_float32(audio_data)
    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
    # NOTE(review): this counter restarts at 0 on every call while `data`
    # keeps growing across calls - confirm that "Time" is meant to be
    # relative to the current call.
    audio_sec = 0
    chunk_len = SAMPLING_RATE * CHUNK_DURATION

    # Append the new audio to whatever was left over from previous calls.
    current_chunk.append(audio_data)
    total_chunk = np.concatenate(current_chunk)

    while len(total_chunk) >= chunk_len:
        chunk = total_chunk[:chunk_len]
        total_chunk = total_chunk[chunk_len:]  # drop the part being processed
        # Persist the remainder before yielding so that an abandoned
        # generator does not cause this chunk to be processed again.
        current_chunk = [total_chunk]
        audio_sec += CHUNK_DURATION

        print(f"Processing audio chunk of length {len(chunk)}")
        # RMS level of the chunk. The previous formula divided by the largest
        # float32 value, which made the reported volume effectively zero.
        volume_norm = float(np.sqrt(np.mean(np.square(chunk))))
        length = len(chunk) / SAMPLING_RATE  # chunk duration in seconds
        lang_guess = language_id.classify_batch(torch.from_numpy(chunk).unsqueeze(0))

        # Japanese vs. English. SpeechBrain documents the first element
        # returned by classify_batch as log posteriors, so exponentiate to
        # report real probabilities (the ordering is unaffected).
        log_posteriors = lang_guess[0][0]
        ja_prob = log_posteriors[lang_index_JA_EN['ja']].exp().item()
        en_prob = log_posteriors[lang_index_JA_EN['en']].exp().item()
        ja_en = 'ja' if ja_prob > en_prob else 'en'

        # Three most likely languages overall.
        top3_indices = torch.topk(lang_guess[0], 3, dim=1, largest=True).indices[0]
        top3_languages = [index_to_lang[idx.item()] for idx in top3_indices]

        # Speech-to-text for this chunk.
        transcript = transcriber(chunk)
        print(transcript)

        data.append({
            "Time": audio_sec,
            "Length (s)": length,
            "Volume": volume_norm,
            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
            "Language": top3_languages,
            "Text": transcript['text'],
        })

        df = pd.DataFrame(data)
        yield (SAMPLING_RATE, chunk), df

    # Keep the unprocessed tail (shorter than one chunk) for the next call.
    current_chunk = [total_chunk]

# Streaming input is currently disabled; the streaming variant would be:
# inputs = gr.Audio(sources=["microphone", "upload"], type="numpy", streaming=True)
inputs = gr.Audio(sources=["microphone", "upload"], type="numpy")

# Outputs mirror what process_audio() yields: the chunk just analysed and the
# table of all rows so far. The headers list every key of those rows, in the
# order they are written (previously only three were declared, out of order).
outputs = [
    gr.Audio(type="numpy"),
    gr.DataFrame(
        headers=[
            "Time",
            "Length (s)",
            "Volume",
            "Japanese_English",
            "Language",
            "Text",
        ],
    ),
]

# live=True re-runs process_audio whenever the input changes, without
# requiring a submit click.
demo = gr.Interface(
    fn=process_audio,
    inputs=inputs,
    outputs=outputs,
    live=True,
    title="Real-time Audio Processing",
    description="Speak into the microphone and see real-time audio processing results."
)

demo.launch()