Update README.md

**The recipe can be found in ESPnet:** https://github.com/espnet/espnet/tree/master/egs2/owsm_ctc_v3.1/s2t1

### Example script for batched inference
`Speech2TextGreedySearch` now provides a unified batched inference method, `batch_decode`. It performs CTC greedy decoding for a batch of short-form or long-form audio inputs. If an audio is shorter than 30s, it is padded to 30s; otherwise it is split into overlapping segments (the same approach as the "long-form ASR/ST" method below).

```python
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

s2t = Speech2TextGreedySearch.from_pretrained(
    "espnet/owsm_ctc_v3.1_1B",
    device="cuda",
    use_flash_attn=False,   # set to True for better efficiency if flash attn is installed and dtype is float16 or bfloat16
    lang_sym='<eng>',
    task_sym='<asr>',
)

res = s2t.batch_decode(
    "audio.wav",    # a single audio (path or 1-D array/tensor) as input
    batch_size=16,
    context_len_in_secs=4,
)   # res is a single str, i.e., the predicted text without special tokens

res = s2t.batch_decode(
    ["audio1.wav", "audio2.wav", "audio3.wav"],     # a list of audios as input
    batch_size=16,
    context_len_in_secs=4,
)   # res is a list of str

# Please check the code of `batch_decode` for all supported inputs
```
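As the comments above suggest, `batch_decode` also accepts in-memory waveforms rather than file paths. A minimal sketch, reusing the `s2t` object created above and assuming a list of 1-D arrays is accepted as input (the file names are placeholders):

```python
import librosa

# Load and resample each file to 16kHz, the sample rate the model expects
waves = [librosa.load(p, sr=16000)[0] for p in ["audio1.wav", "audio2.wav"]]

res = s2t.batch_decode(
    waves,  # a list of 1-D arrays instead of file paths
    batch_size=16,
    context_len_in_secs=4,
)  # res is a list of str, one per input waveform
```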
### Example script for short-form ASR/ST/LID

Our models are trained on 16kHz audio with a fixed duration of 30s. When using the pre-trained model, please ensure the input speech is 16kHz and pad or truncate it to 30s.

```python
import librosa
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

s2t = Speech2TextGreedySearch.from_pretrained(
    "espnet/owsm_ctc_v3.1_1B",
    device="cuda",
    generate_interctc_outputs=False,
    lang_sym='<eng>',
    task_sym='<asr>',
)

# NOTE: OWSM-CTC is trained on 16kHz audio with a fixed 30s duration. Please ensure your input has the correct sample rate; otherwise resample it to 16k before feeding it to the model
speech, rate = librosa.load("xxx.wav", sr=16000)
speech = librosa.util.fix_length(speech, size=(16000 * 30))

res = s2t(speech)[0]
print(res)
```
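For speech translation, the same short-form interface is used with a different task symbol. The sketch below is hypothetical: `<st_zho>` is only a placeholder for an ST task token, and the exact symbols should be checked against the model's token list before use.

```python
# Hypothetical ST setup: '<st_zho>' is a placeholder task token, not verified
# against this model's vocabulary; lang_sym remains the language spoken in the audio.
s2t_st = Speech2TextGreedySearch.from_pretrained(
    "espnet/owsm_ctc_v3.1_1B",
    device="cuda",
    generate_interctc_outputs=False,
    lang_sym='<eng>',     # source (spoken) language
    task_sym='<st_zho>',  # placeholder: translate into the target language
)

res = s2t_st(speech)[0]  # `speech` prepared exactly as in the ASR example above
print(res)
```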
### Example script for long-form ASR/ST

```python
import soundfile as sf
import torch
from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch

context_len_in_secs = 4   # left and right context when doing buffered inference
batch_size = 32           # depends on the GPU memory
s2t = Speech2TextGreedySearch.from_pretrained(
    "espnet/owsm_ctc_v3.1_1B",
    device='cuda' if torch.cuda.is_available() else 'cpu',
    generate_interctc_outputs=False,
    lang_sym='<eng>',
    task_sym='<asr>',
)

speech, rate = sf.read(
    "xxx.wav"
)

text = s2t.decode_long_batched_buffered(
    speech,
    batch_size=batch_size,
    context_len_in_secs=context_len_in_secs,
)
print(text)
```
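Note that `sf.read` does not resample, while the model expects 16kHz input (see the note in the short-form example). If the file may be at another rate, resample it first. A minimal sketch using librosa, reusing `s2t`, `batch_size`, and `context_len_in_secs` from above and assuming a mono file (the file name is a placeholder):

```python
import librosa
import soundfile as sf

speech, rate = sf.read("xxx.wav")  # assumed mono; may be at any sample rate
if rate != 16000:
    # resample to the 16kHz rate the model was trained on
    speech = librosa.resample(speech, orig_sr=rate, target_sr=16000)

text = s2t.decode_long_batched_buffered(
    speech,
    batch_size=batch_size,
    context_len_in_secs=context_len_in_secs,
)
```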
### Example of CTC forced alignment using `ctc-segmentation`

CTC segmentation can be efficiently applied to audio of an arbitrary length.

```python
import soundfile as sf
from espnet2.bin.s2t_ctc_align import CTCSegmentation
from espnet_model_zoo.downloader import ModelDownloader

# Download model first
d = ModelDownloader()
downloaded = d.download_and_unpack("espnet/owsm_ctc_v3.2_ft_1B")    # "espnet/owsm_ctc_v3.1_1B"

aligner = CTCSegmentation(
    **downloaded,
    fs=16000,
    ngpu=1,
    batch_size=32,          # batched parallel decoding; reduce it if your GPU memory is smaller
    kaldi_style_text=True,
    time_stamps="auto",     # "auto" can be more accurate than "fixed" when converting token index to timestamp
    lang_sym="<eng>",
    task_sym="<asr>",
    context_len_in_secs=2,  # left and right context in buffered decoding
)

speech, rate = sf.read(
    "./test_utils/ctc_align_test.wav"
)
print(f"speech duration: {len(speech) / rate : .2f} seconds")
text = """
utt1 THE SALE OF THE HOTELS
utt2 IS PART OF HOLIDAY'S STRATEGY
utt3 TO SELL OFF ASSETS
utt4 AND CONCENTRATE ON PROPERTY MANAGEMENT
"""

segments = aligner(speech, text)
print(segments)
```
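To keep the alignments, the result can be written to a file. This assumes, as in ESPnet's other CTC segmentation tooling, that the returned object stringifies to Kaldi-style segment lines (utterance name, start/end times in seconds, confidence score, text); the output path is a placeholder.

```python
# Save the alignment result; assumes `segments` stringifies to Kaldi-style
# segment lines, as in ESPnet's CTC segmentation tooling.
with open("aligned_segments", "w") as f:
    f.write(str(segments))
```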