File size: 1,419 Bytes
6d3dc99
 
 
 
 
b615647
 
 
 
6d3dc99
 
b615647
6d3dc99
 
 
b615647
6d3dc99
 
 
 
 
 
 
b615647
 
 
6d3dc99
b615647
6d3dc99
 
 
 
b615647
 
 
6d3dc99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from speechbrain.pretrained import GraphemeToPhoneme
import datasets
import os
import torchaudio
from wav2vecasr.MispronounciationDetector import MispronounciationDetector
from wav2vecasr.PhonemeASRModel import Wav2Vec2PhonemeASRModel, Wav2Vec2OptimisedPhonemeASRModel, MultitaskPhonemeASRModel
import jiwer
import re

# Load sample data
# The sample recording and its transcript are looked up under ./data relative
# to the current working directory.
data_dir = os.path.join(os.getcwd(), "data")
audio_path = os.path.join(data_dir, "arctic_a0003.wav")
transcript_path = os.path.join(data_dir, "arctic_a0003.txt")

audio, org_sr = torchaudio.load(audio_path)
# Resample from the file's native rate to 16 kHz.
audio = torchaudio.functional.resample(audio, orig_freq=org_sr, new_freq=16000)
# Collapse the leading (channel) axis into a 1-D waveform. For a single-channel
# file this yields exactly the same samples as a reshape; for a multi-channel
# file it downmixes to mono instead of raising on an element-count mismatch.
audio = audio.mean(dim=0)
audio = audio.to("cpu")

# The with-statement closes the file on exit, so no explicit close() is needed.
with open(transcript_path) as f:
    text = f.read()
print("Done loading sample data")

# Load processors and models
# Single source of truth for the target device: both the ASR model and the
# detector receive this value, so changing it here moves them together.
device = "cpu"

# Checkpoint and vocabulary are looked up under ./model relative to the
# current working directory.
model_dir = os.path.join(os.getcwd(), "model")
path = os.path.join(model_dir, "multitask_best_ctc.pt")
vocab_path = os.path.join(model_dir, "vocab")

asr_model = MultitaskPhonemeASRModel(path, vocab_path, device)
g2p = GraphemeToPhoneme.from_hparams("speechbrain/soundchoice-g2p")
mispronounciation_detector = MispronounciationDetector(asr_model, g2p, device)
print("Done loading models and processors")

# Predict
# Run detection on the loaded waveform and transcript, then print the selected
# entries of the returned mapping one per line, followed by the PER value.
raw_info = mispronounciation_detector.detect(audio, text)
for key in ("ref", "hyp", "phoneme_errors"):
    print(raw_info[key])
print(f"PER: {raw_info['per']}\n")