File size: 1,829 Bytes
f481a94
 
 
 
08f9ba3
f481a94
85638c3
f481a94
 
 
 
 
 
 
08f9ba3
f481a94
 
 
 
 
988375c
 
 
 
 
 
 
 
f481a94
08f9ba3
f481a94
5eb8f47
988375c
f481a94
 
827455e
966d371
973c318
 
 
 
988375c
5eb8f47
88c750a
 
08f9ba3
f481a94
 
08f9ba3
5eb8f47
f481a94
 
 
08f9ba3
 
 
2ba320e
5eb8f47
f481a94
212a765
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# config
# Identifier of the pretrained checkpoint; passed to both from_pretrained calls below.
model_name = "vumichien/wav2vec2-large-xlsr-japanese-hiragana"
# Created once at import time and reused by the functions below.
# The processor turns raw audio samples into model inputs and decodes
# predicted token ids back into text.
processor = Wav2Vec2Processor.from_pretrained(model_name)
# CTC model whose output logits are arg-maxed into token ids in transcribe().
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def process_audio_file(file):
    """Load an audio file and turn it into model-ready input tensors.

    Args:
        file: Path to the audio file (anything ``librosa.load`` accepts).

    Returns:
        The result of calling ``processor`` on the 16 kHz waveform with
        ``return_tensors="pt"`` and ``padding=True``. ``transcribe`` reads
        ``input_values`` and ``attention_mask`` from it.
    """
    target_sr = 16000
    # Let librosa resample while loading. The previous code loaded at
    # librosa's default rate and then called ``librosa.resample`` with
    # positional rates, which newer librosa releases reject (the rates are
    # keyword-only there) and which resampled the signal twice.
    data, _ = librosa.load(file, sr=target_sr)
    print(data.shape)  # debug aid: number of samples after resampling
    inputs = processor(data, sampling_rate=target_sr, return_tensors="pt", padding=True)
    return inputs


def transcribe(micro, file):
    """Transcribe one audio clip to text.

    Args:
        micro: Path of the microphone recording, or ``None``/empty if absent.
        file: Path of the uploaded audio file, or ``None``/empty if absent.
            When both are provided, the uploaded file takes precedence.

    Returns:
        The decoded transcription of the selected clip, or an empty string
        when neither input holds a usable path.
    """
    # Truthiness (not ``is None``) so that an empty-string path, like the
    # placeholder used in this file's example rows, counts as "no input"
    # instead of shadowing a valid recording.
    input_audio = file or micro
    if not input_audio:
        # Nothing to transcribe (e.g. both inputs were cleared); return an
        # empty result rather than failing inside the audio loader.
        return ""

    inputs = process_audio_file(input_audio)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    # Greedy decoding: take the highest-scoring token id at each position.
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]


# Texts displayed alongside the demo.
description = "A simple interface to transcribe from spoken Japanese to Hiragana."
article = 'Author: <a href="https://huggingface.co/vumichien">Vu Minh Chien</a>.'

# Two optional audio inputs, microphone first and upload second, in the same
# order as the (micro, file) parameters of transcribe.
inputs = [
    gr.Audio(source=audio_source, type="filepath", optional=True)
    for audio_source in ("microphone", "upload")
]
outputs = ["textbox"]

# Each example row supplies one value per input component.
examples = [
    ["samples/BASIC5000_0001.wav", ""],
    ["samples/BASIC5000_0005.wav", ""],
]

iface = gr.Interface(
    fn=transcribe,
    inputs=inputs,
    outputs=outputs,
    examples=examples,
    title="Transcribe Japanese audio to Hiragana",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
    live=True,
)
iface.launch(enable_queue=True)