diff --git a/app.py b/app.py index ed9189b408434182592e1b8699b08226f5d319c7..5b36fbe8c2c8fc99015ab3c0ffa0f8093a6c66e5 100644 --- a/app.py +++ b/app.py @@ -10,9 +10,9 @@ import os import sys +import yaml sys.path.insert(0, './') -from gxl_ai_utils.utils import utils_file from wenet.utils.init_tokenizer import init_tokenizer from wenet.utils.init_model import init_model import logging @@ -20,6 +20,14 @@ import librosa import torch import torchaudio import numpy as np +def makedir_for_file(filepath): + dirpath = os.path.dirname(filepath) + if not os.path.exists(dirpath): + os.makedirs(dirpath) +def load_dict_from_yaml(file_path: str): + with open(file_path, 'rt', encoding='utf-8') as f: + dict_1 = yaml.load(f, Loader=yaml.FullLoader) + return dict_1 # 将图片转换为 Base64 with open("lab.png", "rb") as image_file: @@ -53,7 +61,7 @@ def init_model_my(): args = SimpleNamespace(**{ "checkpoint": checkpoint_path, }) - configs = utils_file.load_dict_from_yaml(config_path) + configs = load_dict_from_yaml(config_path) model, configs = init_model(args, configs) model = model.cuda() tokenizer = init_tokenizer(configs) @@ -73,7 +81,7 @@ def do_resample(input_wav_path, output_wav_path): waveform = torch.mean(waveform, dim=0, keepdim=True) waveform = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=16000)(waveform) - utils_file.makedir_for_file(output_wav_path) + makedir_for_file(output_wav_path) torchaudio.save(output_wav_path, waveform, 16000) def true_decode_fuc(input_wav_path, input_prompt): diff --git a/wenet/LLM/causallm_model.py b/wenet/LLM/causallm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..adf37d71d5aa3abc3d76b10bf1546af81818c4d2 --- /dev/null +++ b/wenet/LLM/causallm_model.py @@ -0,0 +1,207 @@ +from typing import Dict, List, Optional, Union +import torch +from wenet.LLM.decoder import DecoderOnly +from wenet.LLM.sampler import sampler +from wenet.utils.common import IGNORE_ID, th_accuracy +from wenet.utils.mask import make_pad_mask, subsequent_mask + + +class CausalLM(torch.nn.Module): + + def __init__( + self, + vocab_size: int, + decoder: DecoderOnly, + special_tokens: dict, + tie_word_embedding: bool = False, + linear_bias: bool = False, + ignore_id: int = IGNORE_ID, + lsm_weight: float = 0.0, + reduction: str = 'mean', + ) -> None: + super().__init__() + del special_tokens + + self.embed = torch.nn.Embedding(vocab_size, decoder.hidden_size) + self.out = torch.nn.Linear(decoder.hidden_size, + vocab_size, + bias=linear_bias) + + self.decoder = decoder + self.vocab_size = vocab_size + self.criterion_att = torch.nn.CrossEntropyLoss( + ignore_index=ignore_id, + label_smoothing=lsm_weight, + reduction=reduction, + ) + self.tie_word_embedding = tie_word_embedding + self.ignore_id = ignore_id + + @torch.jit.unused + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + """ Forward for training + """ + text = batch['feats'].to(device) + target = batch['target'].to(device) + text_length = batch['feats_lengths'].to(device) + + mask = ~make_pad_mask(text_length, max_len=text.size(1)).unsqueeze( + 1) # (B,1,L) + causal_mask = subsequent_mask( + mask.size(-1), device=mask.device).unsqueeze(0) # (1,L,L) + att_mask = causal_mask & mask # (B, L, L) + + embeding = self.embed(text) + decoder_out = self.out(self.decoder(embeding, + att_mask)[0]) # (B, L, vocab_size) + loss = self.criterion_att(decoder_out.view(-1, self.vocab_size), + target.view(-1)) + acc = th_accuracy(decoder_out.view(-1, self.vocab_size), + target, + 
ignore_label=self.ignore_id) + + return { + "loss": loss, + "ppl": torch.exp(loss.detach()), + "th_accuracy": acc + } + + def tie_or_clone_weights(self, jit_mode: bool): + if not self.tie_word_embedding: + return + if jit_mode: + self.out.weight = torch.nn.Parameter(self.embed.weight.clone()) + else: + self.out.weight = self.embed.weight + # TODO(Mddct): whether to deal bias for other llm model + + @torch.jit.unused + @torch.inference_mode() + def generate( + self, + prompts_tokens: List[List[int]], + device: torch.device, + stop_tokens: List[int], + dtype: torch.dtype = torch.float32, + output_len: int = 100, + temperature: Union[float, None] = 0.95, + top_p: float = 1.0, + top_k: int = 100, + ) -> List[List[int]]: + """Generates responses for given prompts using Gemma model.""" + # If a single prompt is provided, treat it as a batch of 1. + batch_size = len(prompts_tokens) + min_prompt_len = min(len(p) for p in prompts_tokens) + max_prompt_len = max(len(p) for p in prompts_tokens) + max_seq_len = max_prompt_len + output_len + assert max_seq_len <= self.decoder.pos_enc.max_len + + # build KV caches + kv_caches = [] + for _ in range(len(self.decoder.decoders)): + size = (batch_size, 0, self.decoder.n_kv_head, + self.decoder.head_dim) + k_cache = torch.zeros(size=size, dtype=dtype, device=device) + v_cache = torch.zeros(size=size, dtype=dtype, device=device) + kv_caches.append((k_cache, v_cache)) + + # prepare inputs + token_ids_tensor = torch.full((batch_size, max_seq_len), + IGNORE_ID, + dtype=torch.int64, + device=device) + input_token_ids_tensor = torch.full((batch_size, min_prompt_len), + IGNORE_ID, + dtype=torch.int64, + device=device) + # right padding + for i, p in enumerate(prompts_tokens): + token_ids_tensor[i, :len(p)] = torch.tensor(p) + input_token_ids_tensor[i, :min_prompt_len] = torch.tensor( + p[:min_prompt_len]) + + prompt_mask_tensor = token_ids_tensor != IGNORE_ID + input_positions_tensor = torch.arange(0, + min_prompt_len, + dtype=torch.int64).to(device) + mask_tensor = torch.ones((1, 1, max_seq_len, max_seq_len), + dtype=torch.bool) + mask_tensor = torch.tril(mask_tensor).to(device) + curr_mask_tensor = mask_tensor.index_select(2, input_positions_tensor) + att_mask = curr_mask_tensor.squeeze( + 1)[:, :min_prompt_len, :min_prompt_len] + output_positions_tensor = torch.LongTensor([min_prompt_len - 1 + ]).to(device) + temperatures_tensor = None if not temperature else torch.FloatTensor( + [temperature] * batch_size).to(device) + top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device) + top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device) + output_index = torch.tensor(min_prompt_len, + dtype=torch.int64).to(device) + + input_token_embeding = self.embed(input_token_ids_tensor) + offset = torch.tensor([0] * len(prompts_tokens)).to(device) + input_offset = offset + + stop_tokens_tensor = torch.tensor(stop_tokens, device=device) + # Prefill up to min_prompt_len tokens, then treat other prefill as + # decode and ignore output. 
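+        # Each iteration below runs a single decoder step. For prompts longer
+        # than min_prompt_len, the token sampled at a prompt position is
+        # discarded: torch.where(curr_prompt_mask, ...) writes the original
+        # prompt token back into token_ids_tensor, so only genuine generation
+        # positions keep sampled tokens.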
+ for i in range(max_seq_len - min_prompt_len): + decoder_out, kv_caches, = self.decoder( + input_token_embeding, + att_mask, + input_offset, + kv_caches, + ) + decoder_out = self.out(decoder_out) + decoder_out = decoder_out.index_select(1, output_positions_tensor) + next_token_ids = sampler( + decoder_out, + temperatures_tensor, + top_ps_tensor, + top_ks_tensor, + ) + curr_prompt_mask = prompt_mask_tensor.index_select( + 1, output_index).squeeze(dim=1) + curr_token_ids = token_ids_tensor.index_select( + 1, output_index).squeeze(dim=1) + output_token_ids = torch.where(curr_prompt_mask, curr_token_ids, + next_token_ids).unsqueeze(dim=1) + token_ids_tensor.index_copy_(1, output_index, output_token_ids) + + input_token_ids_tensor = output_token_ids + input_token_embeding = self.embed(input_token_ids_tensor) + + input_positions_tensor = output_index.unsqueeze(dim=-1) + curr_mask_tensor = mask_tensor.index_select( + 2, input_positions_tensor) + att_mask = curr_mask_tensor.squeeze(1)[:, :output_index + + 1, :output_index + 1] + + output_positions_tensor = torch.tensor( + 0, dtype=torch.int64).to(device) + input_offset = offset + output_index.unsqueeze(-1) + output_index = output_index + 1 + + if all(torch.isin(next_token_ids, stop_tokens_tensor)): + break + + token_ids = token_ids_tensor.tolist() + results = [] + for i, tokens in enumerate(token_ids): + trimmed_output = tokens[len(prompts_tokens[i] + ):len(prompts_tokens[i]) + output_len] + for stop_token in stop_tokens: + try: + eos_index = trimmed_output.index(stop_token) + trimmed_output = trimmed_output[:eos_index] + break + except Exception: + continue + results.append(trimmed_output) + + return results diff --git a/wenet/LLM/decoder.py b/wenet/LLM/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b25ee75dd67c1cbce424568d5ef99176cb52ff8b --- /dev/null +++ b/wenet/LLM/decoder.py @@ -0,0 +1,161 @@ +from functools import partial +from typing import List, Optional, Tuple, Union +import torch +import torch.utils.checkpoint as ckpt +from wenet.transformer.attention import T_CACHE + +from wenet.transformer.encoder_layer import TransformerEncoderLayer +from wenet.utils.class_utils import (WENET_ACTIVATION_CLASSES, + WENET_ATTENTION_CLASSES, + WENET_EMB_CLASSES, WENET_MLP_CLASSES, + WENET_NORM_CLASSES) +from wenet.utils.common import mask_to_bias + + +class DecoderOnly(torch.nn.Module): + + def __init__( + self, + n_kv_head: int, + head_dim: int, + hidden_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + normalize_before: bool = True, + query_bias: bool = False, + key_bias: bool = False, + value_bias: bool = False, + mlp_bias: bool = False, + activation_type: str = "gelu", + gelu_approximate: Union[str, None] = None, + max_position_embeding: int = 8192, + mlp_type: str = 'gated', + layer_norm_type: str = 'rms_norm', + norm_eps: float = 1e-5, + rms_norm_offset: bool = True, + selfattention_layer_type: str = "rope_abs_selfattn", + use_sdpa: bool = False, + gradient_checkpointing: bool = False, + rope_theta: float = 10000.0, + rope_style: str = 'google', + scale_embed: bool = True, + ) -> None: + super().__init__() + + assert selfattention_layer_type in ['rope_abs_selfattn'] + self.pos_enc = WENET_EMB_CLASSES["rope_pos"]( + hidden_size, + head_dim, + max_len=max_position_embeding, + dropout_rate=positional_dropout_rate, + rope_theta=rope_theta, + scale=scale_embed) + if 
activation_type == "gelu" and gelu_approximate is not None: + activation = WENET_ACTIVATION_CLASSES['gelu']( + approximate=gelu_approximate) + else: + activation = WENET_ACTIVATION_CLASSES[activation_type]() + + mlp_class = WENET_MLP_CLASSES[mlp_type] + self.num_blocks = num_blocks + # TODO: support lora & refactor lora + self.decoders = torch.nn.ModuleList([ + TransformerEncoderLayer( + hidden_size, + WENET_ATTENTION_CLASSES[selfattention_layer_type]( + attention_heads, + hidden_size, + attention_dropout_rate, + query_bias, + key_bias, + value_bias, + use_sdpa, + n_kv_head, + head_dim, + style=rope_style), + mlp_class(hidden_size, linear_units, dropout_rate, activation, + mlp_bias), + dropout_rate, + normalize_before, + layer_norm_type=layer_norm_type, + norm_eps=norm_eps, + rms_norm_offset=rms_norm_offset, + ) for _ in range(self.num_blocks) + ]) + self.pre_norm = normalize_before + self.final_norm: Optional[torch.nn.Module] = None + if self.pre_norm: + norm_class = WENET_NORM_CLASSES[layer_norm_type] + if layer_norm_type == "rms_norm": + norm_class = partial( + norm_class, + add_unit_offset=rms_norm_offset, + ) + self.final_norm = norm_class(hidden_size, eps=norm_eps) + + self.n_kv_head = n_kv_head + self.head_dim = head_dim + self._hidden_size = hidden_size + self.use_sdpa = use_sdpa + self.gradient_checkpointing = gradient_checkpointing + + def forward( + self, + input: torch.Tensor, + att_mask: torch.Tensor, + input_position: Union[int, torch.Tensor] = 0, + kv_caches: Optional[List[T_CACHE]] = None, + ) -> Tuple[torch.Tensor, Union[List[T_CACHE], None]]: + xs, pos_emb = self.pos_enc(input, offset=input_position) + if self.use_sdpa: + att_mask = mask_to_bias(att_mask, xs.dtype) + + if self.gradient_checkpointing and self.training: + xs = self.forward_layers_checkpointed(xs, att_mask, pos_emb) + else: + xs, kv_caches = self.forward_layers(xs, att_mask, pos_emb, + kv_caches) + if self.pre_norm and self.final_norm is not None: + xs = self.final_norm(xs) + return xs, kv_caches + + def forward_layers( + self, + xs: torch.Tensor, + att_mask: torch.Tensor, + pos_emb: torch.Tensor, + kv_caches: Optional[List[T_CACHE]] = None, + ) -> Tuple[torch.Tensor, Union[List[T_CACHE], None]]: + if self.training: + for (i, layer) in enumerate(self.decoders): + xs, _, _, _ = layer(xs, att_mask, pos_emb) + new_kv_caches = kv_caches + else: + assert kv_caches is not None + new_kv_caches = [] + for (i, layer) in enumerate(self.decoders): + xs, _, new_kv_cache, _ = layer(xs, + att_mask, + pos_emb, + att_cache=(kv_caches[i][0], + kv_caches[i][1])) + new_kv_caches.append(new_kv_cache) + + return xs, new_kv_caches + + @torch.jit.ignore(drop=True) + def forward_layers_checkpointed(self, xs: torch.Tensor, + att_mask: torch.Tensor, + pos_emb: torch.Tensor) -> torch.Tensor: + for layer in self.decoders: + xs, _, _, _ = ckpt.checkpoint(layer.__call__, xs, att_mask, + pos_emb) + return xs + + @property + def hidden_size(self): + return self._hidden_size diff --git a/wenet/LLM/sampler.py b/wenet/LLM/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..19f0d5cdaffd11cc2faf1fdbf2e61771635efa4b --- /dev/null +++ b/wenet/LLM/sampler.py @@ -0,0 +1,43 @@ +from typing import Union +import torch + + +# modified from https://github.com/google/gemma_pytorch/blob/main/gemma/model.py#L26 +@torch.no_grad() +def sampler( + logits: torch.Tensor, + temperatures: Union[torch.Tensor, None], + top_ps: torch.Tensor, + top_ks: torch.Tensor, +) -> torch.Tensor: + assert logits.size(1) == 1 + logits = 
logits.squeeze(1) # (batch_size, vocab_size) + if temperatures is None: + return torch.argmax(logits, dim=-1).squeeze(dim=-1) + + # Apply temperature scaling. + logits.div_(temperatures.unsqueeze(dim=1)) + + # Calculate probabilities with softmax. + probs = torch.softmax(logits, dim=-1, dtype=torch.float) + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + + # Apply top-p, top-k. + probs_sum = torch.cumsum(probs_sort, dim=-1) + top_ps_mask = (probs_sum - probs_sort) > top_ps.unsqueeze(dim=1) + probs_sort = torch.where(top_ps_mask, 0, probs_sort) + + top_ks_mask = torch.arange(probs_idx.shape[-1], device=probs_idx.device) + top_ks_mask = top_ks_mask.expand(probs_idx.shape[0], -1) + top_ks_mask = top_ks_mask >= top_ks.unsqueeze(dim=1) + probs_sort = torch.where(top_ks_mask, 0, probs_sort) + + # Re-normalization. + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + probs = torch.gather(probs_sort, + dim=-1, + index=torch.argsort(probs_idx, dim=-1)) + + next_token_ids = torch.multinomial(probs, num_samples=1, + replacement=True).squeeze(dim=-1) + return next_token_ids diff --git a/wenet/__init__.py b/wenet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..820ad3180b78bec00478ba4e30ce4b515967e405 --- /dev/null +++ b/wenet/__init__.py @@ -0,0 +1 @@ +from wenet.cli.model import load_model # noqa diff --git a/wenet/bin/alignment.py b/wenet/bin/alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..12c272a2bd5829f35d767eac558bea3dbdffdf5f --- /dev/null +++ b/wenet/bin/alignment.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Di Wu) +# 2022 Tinnove Inc (authors: Wei Ren) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
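+
+# Generate CTC forced alignments for each utterance in --input_file and write
+# them to --result_file; with --gen_praat, per-utterance .lab and Praat
+# TextGrid files are also written next to the result file. Illustrative
+# invocation (paths are examples only):
+#   python wenet/bin/alignment.py --config train.yaml --checkpoint final.pt \
+#       --dict units.txt --input_file data.list --result_file align.txt --gen_praat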
+ +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader +from textgrid import TextGrid, IntervalTier +import math + +from wenet.dataset.dataset import Dataset +from wenet.utils.ctc_utils import force_align +from wenet.utils.common import get_subsample +from wenet.utils.init_model import init_model +from wenet.utils.init_tokenizer import init_tokenizer + + +def generator_textgrid(maxtime, lines, output): + # Download Praat: https://www.fon.hum.uva.nl/praat/ + interval = maxtime / (len(lines) + 1) + margin = 0.0001 + + tg = TextGrid(maxTime=maxtime) + linetier = IntervalTier(name="line", maxTime=maxtime) + + i = 0 + for l in lines: + s, e, w = l.split() + linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w) + + tg.append(linetier) + print("successfully generator {}".format(output)) + tg.write(output) + + +def get_frames_timestamp(alignment, + prob, + blank_thres=0.999, + thres=0.0000000001): + # convert alignment to a praat format, which is a doing phonetics + # by computer and helps analyzing alignment + timestamp = [] + # get frames level duration for each token + start = 0 + end = 0 + local_start = 0 + while end < len(alignment): + while end < len(alignment) and alignment[end] == 0: + end += 1 + if end == len(alignment): + timestamp[-1] += alignment[start:] + break + end += 1 + while end < len(alignment) and alignment[end - 1] == alignment[end]: + end += 1 + local_start = end - 1 + # find the possible front border for current token + while local_start >= start and ( + prob[local_start][0] < math.log(blank_thres) + or prob[local_start][alignment[end - 1]] > math.log(thres)): + alignment[local_start] = alignment[end - 1] + local_start -= 1 + cur_alignment = alignment[start:end] + timestamp.append(cur_alignment) + start = end + return timestamp + + +def get_labformat(timestamp, subsample): + begin = 0 + begin_time = 0 + duration = 0 + labformat = [] + for idx, t in enumerate(timestamp): + # 25ms frame_length,10ms hop_length, 1/subsample + subsample = get_subsample(configs) + # time duration + i = 0 + while t[i] == 0: + i += 1 + begin = i + dur = 0 + while i < len(t) and t[i] != 0: + i += 1 + dur += 1 + begin = begin_time + begin * 0.01 * subsample + duration = dur * 0.01 * subsample + if idx < len(timestamp) - 1: + print("{:.2f} {:.2f} {}".format(begin, begin + duration, + char_dict[t[-1]])) + labformat.append("{:.2f} {:.2f} {}\n".format( + begin, begin + duration, char_dict[t[-1]])) + else: # last token + non_blank = 0 + for i in t: + if i != 0: + token = i + break + print("{:.2f} {:.2f} {}".format(begin, begin + duration, + char_dict[token])) + labformat.append("{:.2f} {:.2f} {}\n".format( + begin, begin + duration, char_dict[token])) + begin_time += len(t) * 0.01 * subsample + return labformat + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='use ctc to generate alignment') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--input_file', required=True, help='format data file') + parser.add_argument('--data_type', + default='raw', + choices=['raw', 'shard'], + help='train and cv data type') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--device', + type=str, + default="cpu", + choices=["cpu", "npu", "cuda"], + help='accelerator to use') + parser.add_argument('--blank_thres', + default=0.999999, + 
type=float, + help='ctc blank thes') + parser.add_argument('--thres', + default=0.000001, + type=float, + help='ctc non blank thes') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument( + '--non_lang_syms', + help="non-linguistic symbol file. One symbol per line.") + parser.add_argument('--result_file', + required=True, + help='alignment result file') + parser.add_argument('--batch_size', type=int, default=1, help='batch size') + parser.add_argument('--gen_praat', + action='store_true', + help='convert alignment to a praat format') + parser.add_argument('--bpe_model', + default=None, + type=str, + help='bpe model for english part') + + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + if args.gpu != -1: + # remain the original usage of gpu + args.device = "cuda" + if "cuda" in args.device: + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + if args.batch_size > 1: + logging.fatal('alignment mode must be running with batch_size == 1') + sys.exit(1) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + # Init dataset and data loader + ali_conf = copy.deepcopy(configs['dataset_conf']) + + ali_conf['filter_conf']['max_length'] = 102400 + ali_conf['filter_conf']['min_length'] = 0 + ali_conf['filter_conf']['token_max_length'] = 102400 + ali_conf['filter_conf']['token_min_length'] = 0 + ali_conf['filter_conf']['max_output_input_ratio'] = 102400 + ali_conf['filter_conf']['min_output_input_ratio'] = 0 + ali_conf['speed_perturb'] = False + ali_conf['spec_aug'] = False + ali_conf['spec_trim'] = False + ali_conf['shuffle'] = False + ali_conf['sort'] = False + ali_conf['fbank_conf']['dither'] = 0.0 + ali_conf['batch_conf']['batch_type'] = "static" + ali_conf['batch_conf']['batch_size'] = args.batch_size + + tokenizer = init_tokenizer(configs) + ali_dataset = Dataset(args.data_type, + args.input_file, + tokenizer, + ali_conf, + partition=False) + + ali_data_loader = DataLoader(ali_dataset, batch_size=None, num_workers=0) + + # Init asr model from configs + model, configs = init_model(args, configs) + + device = torch.device(args.device) + model = model.to(device) + + model.eval() + with torch.no_grad(), open(args.result_file, 'w', + encoding='utf-8') as fout: + for batch_idx, batch in enumerate(ali_data_loader): + print("#" * 80) + key, feat, target, feats_length, target_length = batch + + feat = feat.to(device) + target = target.to(device) + feats_length = feats_length.to(device) + target_length = target_length.to(device) + # Let's assume B = batch_size and N = beam_size + # 1. 
Encoder + encoder_out, encoder_mask = model._forward_encoder( + feat, feats_length) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + ctc_probs = model.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + # print(ctc_probs.size(1)) + ctc_probs = ctc_probs.squeeze(0) + target = target.squeeze(0) + alignment = force_align(ctc_probs, target) + fout.write('{} {}\n'.format(key[0], alignment)) + + if args.gen_praat: + timestamp = get_frames_timestamp(alignment, ctc_probs, + args.blank_thres, args.thres) + subsample = get_subsample(configs) + labformat = get_labformat(timestamp, subsample) + + lab_path = os.path.join(os.path.dirname(args.result_file), + key[0] + ".lab") + with open(lab_path, 'w', encoding='utf-8') as f: + f.writelines(labformat) + + textgrid_path = os.path.join(os.path.dirname(args.result_file), + key[0] + ".TextGrid") + generator_textgrid(maxtime=(len(alignment) + 1) * 0.01 * + subsample, + lines=labformat, + output=textgrid_path) diff --git a/wenet/bin/average_model.py b/wenet/bin/average_model.py new file mode 100644 index 0000000000000000000000000000000000000000..1e97b059ce54b3fc0c27099c030239198f7ebf9d --- /dev/null +++ b/wenet/bin/average_model.py @@ -0,0 +1,125 @@ +# Copyright (c) 2020 Mobvoi Inc (Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
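+
+# Average the parameters of several checkpoints into one model: with
+# --val_best, the --num checkpoints with the lowest validation loss (read
+# from the per-checkpoint yaml files) are used; otherwise the --num most
+# recently written *.pt files under --src_path are used. Illustrative
+# invocation (paths are examples only):
+#   python wenet/bin/average_model.py --src_path exp/ --dst_model exp/avg_5.pt \
+#       --num 5 --val_best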
+ +import os +import argparse +import glob +import sys + +import yaml +import torch + + +def get_args(): + parser = argparse.ArgumentParser(description='average model') + parser.add_argument('--dst_model', required=True, help='averaged model') + parser.add_argument('--src_path', + required=True, + help='src model path for average') + parser.add_argument('--val_best', + action="store_true", + help='averaged model') + parser.add_argument('--num', + default=5, + type=int, + help='nums for averaged model') + parser.add_argument('--min_epoch', + default=0, + type=int, + help='min epoch used for averaging model') + parser.add_argument('--max_epoch', + default=sys.maxsize, + type=int, + help='max epoch used for averaging model') + parser.add_argument('--min_step', + default=0, + type=int, + help='min step used for averaging model') + parser.add_argument('--max_step', + default=sys.maxsize, + type=int, + help='max step used for averaging model') + parser.add_argument('--mode', + default="hybrid", + choices=["hybrid", "epoch", "step"], + type=str, + help='average mode') + + args = parser.parse_args() + print(args) + return args + + +def main(): + args = get_args() + checkpoints = [] + val_scores = [] + if args.val_best: + if args.mode == "hybrid": + yamls = glob.glob('{}/*.yaml'.format(args.src_path)) + yamls = [ + f for f in yamls + if not (os.path.basename(f).startswith('train') + or os.path.basename(f).startswith('init')) + ] + elif args.mode == "step": + yamls = glob.glob('{}/step_*.yaml'.format(args.src_path)) + else: + yamls = glob.glob('{}/epoch_*.yaml'.format(args.src_path)) + for y in yamls: + with open(y, 'r') as f: + dic_yaml = yaml.load(f, Loader=yaml.FullLoader) + loss = dic_yaml['loss_dict']['loss'] + epoch = dic_yaml['epoch'] + step = dic_yaml['step'] + tag = dic_yaml['tag'] + if epoch >= args.min_epoch and epoch <= args.max_epoch \ + and step >= args.min_step and step <= args.max_step: + val_scores += [[epoch, step, loss, tag]] + sorted_val_scores = sorted(val_scores, + key=lambda x: x[2], + reverse=False) + print("best val (epoch, step, loss, tag) = " + + str(sorted_val_scores[:args.num])) + path_list = [ + args.src_path + '/{}.pt'.format(score[-1]) + for score in sorted_val_scores[:args.num] + ] + else: + path_list = glob.glob('{}/[!init]*.pt'.format(args.src_path)) + path_list = sorted(path_list, key=os.path.getmtime) + path_list = path_list[-args.num:] + print(path_list) + avg = {} + num = args.num + assert num == len(path_list) + for path in path_list: + print('Processing {}'.format(path)) + states = torch.load(path, map_location=torch.device('cpu')) + for k in states.keys(): + if k not in avg.keys(): + avg[k] = states[k].clone() + else: + avg[k] += states[k] + # average + for k in avg.keys(): + if avg[k] is not None: + # pytorch 1.6 use true_divide instead of /= + avg[k] = torch.true_divide(avg[k], num) + print('Saving to {}'.format(args.dst_model)) + torch.save(avg, args.dst_model) + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/export_ipex.py b/wenet/bin/export_ipex.py new file mode 100644 index 0000000000000000000000000000000000000000..1d3ff181cf5233932ed5641abc231407c815c2d5 --- /dev/null +++ b/wenet/bin/export_ipex.py @@ -0,0 +1,95 @@ +# Copyright (C) 2021-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import print_function + +import argparse +import logging +import os + +import torch +import yaml + +from wenet.utils.init_model import init_model +import intel_extension_for_pytorch as ipex +from 
intel_extension_for_pytorch.quantization import prepare, convert + + +def get_args(): + parser = argparse.ArgumentParser(description='export your script model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--output_file', default=None, help='output file') + parser.add_argument('--dtype', + default="fp32", + help='choose the dtype to run:[fp32,bf16]') + parser.add_argument('--output_quant_file', + default=None, + help='output quantized model file') + args = parser.parse_args() + return args + + +def scripting(model): + with torch.inference_mode(): + script_model = torch.jit.script(model) + script_model = torch.jit.freeze( + script_model, + preserved_attrs=[ + "forward_encoder_chunk", "ctc_activation", + "forward_attention_decoder", "subsampling_rate", + "right_context", "sos_symbol", "eos_symbol", + "is_bidirectional_decoder" + ]) + return script_model + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + # No need gpu for model export + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + model, configs = init_model(args, configs) + print(model) + + # Apply IPEX optimization + model.eval() + torch._C._jit_set_texpr_fuser_enabled(False) + model.to(memory_format=torch.channels_last) + if args.dtype == "fp32": + ipex_model = ipex.optimize(model) + elif args.dtype == "bf16": # For Intel 4th generation Xeon (SPR) + ipex_model = ipex.optimize(model, + dtype=torch.bfloat16, + weights_prepack=False) + + # Export jit torch script model + if args.output_file: + if args.dtype == "fp32": + script_model = scripting(ipex_model) + elif args.dtype == "bf16": + torch._C._jit_set_autocast_mode(True) + with torch.cpu.amp.autocast(): + script_model = scripting(ipex_model) + script_model.save(args.output_file) + print('Export model successfully, see {}'.format(args.output_file)) + + # Export quantized jit torch script model + if args.output_quant_file: + dynamic_qconfig = ipex.quantization.default_dynamic_qconfig + dummy_data = (torch.zeros(1, 67, 80), 16, -16, + torch.zeros(12, 4, 32, 128), torch.zeros(12, 1, 256, 7)) + model = prepare(model, dynamic_qconfig, dummy_data) + model = convert(model) + script_quant_model = scripting(model) + script_quant_model.save(args.output_quant_file) + print('Export quantized model successfully, ' + 'see {}'.format(args.output_quant_file)) + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/export_jit.py b/wenet/bin/export_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..98eadd61cb813c9deb97e8351d17d83789e389a9 --- /dev/null +++ b/wenet/bin/export_jit.py @@ -0,0 +1,71 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
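+
+# Export a trained checkpoint to TorchScript via torch.jit.script
+# (--output_file) and, optionally, to a dynamically quantized TorchScript
+# model (--output_quant_file). Illustrative invocation (paths are examples only):
+#   python wenet/bin/export_jit.py --config train.yaml --checkpoint final.pt \
+#       --output_file final.zip --output_quant_file final_quant.zip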
+ +from __future__ import print_function + +import argparse +import logging +import os + +import torch +import yaml + +from wenet.utils.init_model import init_model + + +def get_args(): + parser = argparse.ArgumentParser(description='export your script model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--output_file', default=None, help='output file') + parser.add_argument('--output_quant_file', + default=None, + help='output quantized model file') + args = parser.parse_args() + return args + + +def main(): + args = get_args() + args.jit = True + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + # No need gpu for model export + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + model, configs = init_model(args, configs) + model.eval() + print(model) + # Export jit torch script model + + if args.output_file: + script_model = torch.jit.script(model) + script_model.save(args.output_file) + print('Export model successfully, see {}'.format(args.output_file)) + + # Export quantized jit torch script model + if args.output_quant_file: + quantized_model = torch.quantization.quantize_dynamic( + model, {torch.nn.Linear}, dtype=torch.qint8) + print(quantized_model) + script_quant_model = torch.jit.script(quantized_model) + script_quant_model.save(args.output_quant_file) + print('Export quantized model successfully, ' + 'see {}'.format(args.output_quant_file)) + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/export_onnx_bpu.py b/wenet/bin/export_onnx_bpu.py new file mode 100644 index 0000000000000000000000000000000000000000..a1d93a022e865afbf424faabd552404b7bcdd4ef --- /dev/null +++ b/wenet/bin/export_onnx_bpu.py @@ -0,0 +1,1065 @@ +# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""NOTE(xcsong): Currently, we only support +1. specific conformer encoder architecture, see: + encoder: conformer + encoder_conf: + activation_type: **must be** relu + attention_heads: 2 or 4 or 8 or any number divisible by output_size + causal: **must be** true + cnn_module_kernel: 1 ~ 7 + cnn_module_norm: **must be** batch_norm + input_layer: **must be** conv2d8 + linear_units: 1 ~ 2048 + normalize_before: **must be** true + num_blocks: 1 ~ 12 + output_size: 1 ~ 512 + pos_enc_layer_type: **must be** no_pos + selfattention_layer_type: **must be** selfattn + use_cnn_module: **must be** true + use_dynamic_chunk: **must be** true + use_dynamic_left_chunk: **must be** true + +2. 
specific decoding method: ctc_greedy_search +""" + +from __future__ import print_function + +import os +import sys +import copy +import math +import yaml +import logging +from typing import Tuple + +import torch +import numpy as np + +from wenet.transformer.embedding import NoPositionalEncoding +from wenet.utils.init_model import init_model +from wenet.bin.export_onnx_cpu import (get_args, to_numpy, + print_input_output_info) + +try: + import onnx + import onnxruntime +except ImportError: + print('Please install onnx and onnxruntime!') + sys.exit(1) + +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +class BPULayerNorm(torch.nn.Module): + """Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" + + def __init__(self, module, chunk_size=8, run_on_bpu=False): + super().__init__() + original = copy.deepcopy(module) + self.hidden = module.weight.size(0) + self.chunk_size = chunk_size + self.run_on_bpu = run_on_bpu + + if self.run_on_bpu: + self.weight = torch.nn.Parameter( + module.weight.reshape(1, self.hidden, 1, + 1).repeat(1, 1, 1, chunk_size)) + self.bias = torch.nn.Parameter( + module.bias.reshape(1, self.hidden, 1, + 1).repeat(1, 1, 1, chunk_size)) + self.negtive = torch.nn.Parameter( + torch.ones((1, self.hidden, 1, chunk_size)) * -1.0) + self.eps = torch.nn.Parameter( + torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps) + self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) + self.mean_conv_1.weight = torch.nn.Parameter( + torch.ones(self.hidden, self.hidden, 1, 1) / + (1.0 * self.hidden)) + self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) + self.mean_conv_2.weight = torch.nn.Parameter( + torch.ones(self.hidden, self.hidden, 1, 1) / + (1.0 * self.hidden)) + else: + self.norm = module + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, self.chunk_size, self.hidden) + orig_out = module(random_data) + new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) + np.testing.assert_allclose(to_numpy(orig_out), + to_numpy( + new_out.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.run_on_bpu: + u = self.mean_conv_1(x) # (1, h, 1, c) + numerator = x + u * self.negtive # (1, h, 1, c) + s = torch.pow(numerator, 2) # (1, h, 1, c) + s = self.mean_conv_2(s) # (1, h, 1, c) + denominator = torch.sqrt(s + self.eps) # (1, h, 1, c) + x = torch.div(numerator, denominator) # (1, h, 1, c) + x = x * self.weight + self.bias + else: + x = x.squeeze(2).transpose(1, 2).contiguous() + x = self.norm(x) + x = x.transpose(1, 2).contiguous().unsqueeze(2) + return x + + +class BPUIdentity(torch.nn.Module): + """Refactor torch.nn.Identity(). + For inserting BPU node whose input == output. + """ + + def __init__(self, channels): + super().__init__() + self.channels = channels + self.identity_conv = torch.nn.Conv2d(channels, + channels, + 1, + groups=channels, + bias=False) + torch.nn.init.dirac_(self.identity_conv.weight.data, groups=channels) + + self.check_equal() + + def check_equal(self): + random_data = torch.randn(1, self.channels, 1, 10) + result = self.forward(random_data) + np.testing.assert_allclose(to_numpy(random_data), + to_numpy(result), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Identity with 4-D dataflow, input == output. + Args: + x (torch.Tensor): (batch, in_channel, 1, time) + + Returns: + (torch.Tensor): (batch, in_channel, 1, time). 
+ """ + return self.identity_conv(x) + + +class BPULinear(torch.nn.Module): + """Refactor torch.nn.Linear or pointwise_conv""" + + def __init__(self, module, is_pointwise_conv=False): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.idim = module.weight.size(1) + self.odim = module.weight.size(0) + self.is_pointwise_conv = is_pointwise_conv + + # Modify weight & bias + self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) + if is_pointwise_conv: + # (odim, idim, kernel=1) -> (odim, idim, 1, 1) + self.linear.weight = torch.nn.Parameter( + module.weight.unsqueeze(-1)) + else: + # (odim, idim) -> (odim, idim, 1, 1) + self.linear.weight = torch.nn.Parameter( + module.weight.unsqueeze(2).unsqueeze(3)) + self.linear.bias = module.bias + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, 8, self.idim) + if self.is_pointwise_conv: + random_data = random_data.transpose(1, 2) + original_result = module(random_data) + if self.is_pointwise_conv: + random_data = random_data.transpose(1, 2) + original_result = original_result.transpose(1, 2) + random_data = random_data.transpose(1, 2).unsqueeze(2) + new_result = self.forward(random_data) + np.testing.assert_allclose(to_numpy(original_result), + to_numpy( + new_result.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Linear with 4-D dataflow. + Args: + x (torch.Tensor): (batch, in_channel, 1, time) + Returns: + (torch.Tensor): (batch, out_channel, 1, time). + """ + return self.linear(x) + + +class BPUGlobalCMVN(torch.nn.Module): + """Refactor wenet/transformer/cmvn.py::GlobalCMVN""" + + def __init__(self, module): + super().__init__() + # Unchanged submodules and attributes + self.norm_var = module.norm_var + + # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1) + self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) + self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """CMVN with 4-D dataflow. + Args: + x (torch.Tensor): (batch, 1, mel_dim, time) + Returns: + (torch.Tensor): normalized feature with same shape. + """ + x = x - self.mean + if self.norm_var: + x = x * self.istd + return x + + +class BPUConv2dSubsampling8(torch.nn.Module): + """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 + + NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding + """ + + def __init__(self, module): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.right_context = module.right_context + self.subsampling_rate = module.subsampling_rate + assert isinstance(module.pos_enc, NoPositionalEncoding) + + # 1. Modify self.conv + # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim) + # to (1, 1, mel_dim, frames) for more efficient computation. + self.conv = module.conv + for idx in [0, 2, 4]: + self.conv[idx].weight = torch.nn.Parameter( + module.conv[idx].weight.transpose(2, 3)) + + # 2. 
Modify self.linear + # NOTE(xcsong): Split final projection to meet the requirment of + # maximum kernel_size (7 for XJ3) + self.linear = torch.nn.ModuleList() + odim = module.linear.weight.size(0) # 512, in this case + freq = module.linear.weight.size(1) // odim # 4608 // 512 == 9 + self.odim, self.freq = odim, freq + weight = module.linear.weight.reshape( + odim, odim, freq, + 1) # (odim, odim * freq) -> (odim, odim, freq, 1) + self.split_size = [] + num_split = (freq - 1) // 7 + 1 # XJ3 requires kernel_size <= 7 + slice_begin = 0 + for idx in range(num_split): + kernel_size = min(freq, (idx + 1) * 7) - idx * 7 + conv_ele = torch.nn.Conv2d(odim, odim, (kernel_size, 1), + (kernel_size, 1)) + conv_ele.weight = torch.nn.Parameter( + weight[:, :, slice_begin:slice_begin + kernel_size, :]) + conv_ele.bias = torch.nn.Parameter(torch.zeros_like(conv_ele.bias)) + self.linear.append(conv_ele) + self.split_size.append(kernel_size) + slice_begin += kernel_size + self.linear[0].bias = torch.nn.Parameter(module.linear.bias) + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, 67, 80) + mask = torch.zeros(1, 1, 67) + original_result, _, _ = module(random_data, mask) # (1, 8, 512) + random_data = random_data.transpose(1, + 2).unsqueeze(0) # (1, 1, 80, 67) + new_result = self.forward(random_data) # (1, 512, 1, 8) + np.testing.assert_allclose(to_numpy(original_result), + to_numpy( + new_result.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Subsample x with 4-D dataflow. + Args: + x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), + where time' = time // 8. + """ + x = self.conv(x) # (1, odim, freq, time') + x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) + x = torch.split(x, self.split_size, dim=2) + for idx, (x_part, layer) in enumerate(zip(x, self.linear)): + x_out += layer(x_part) + return x_out + + +class BPUMultiHeadedAttention(torch.nn.Module): + """Refactor wenet/transformer/attention.py::MultiHeadedAttention + + NOTE(xcsong): Only support attention_class == MultiHeadedAttention, + we do not consider RelPositionMultiHeadedAttention currently. + """ + + def __init__(self, module, chunk_size, left_chunks): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.d_k = module.d_k + self.h = module.h + n_feat = self.d_k * self.h + self.chunk_size = chunk_size + self.left_chunks = left_chunks + self.time = chunk_size * (left_chunks + 1) + self.activation = torch.nn.Softmax(dim=-1) + + # 1. Modify self.linear_x + self.linear_q = BPULinear(module.linear_q) + self.linear_k = BPULinear(module.linear_k) + self.linear_v = BPULinear(module.linear_v) + self.linear_out = BPULinear(module.linear_out) + # 2. 
denom + self.register_buffer( + "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k))) + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) + mask = torch.ones((1, self.h, self.chunk_size, self.time), + dtype=torch.bool) + cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, + self.d_k * 2) + original_out, original_cache = module(random_data, random_data, + random_data, mask[:, 0, :, :], + torch.empty(0), cache) + random_data = random_data.transpose(1, 2).unsqueeze(2) + cache = cache.reshape(1, self.h, self.d_k * 2, + self.chunk_size * self.left_chunks) + new_out, new_cache = self.forward(random_data, random_data, + random_data, mask, cache) + np.testing.assert_allclose(to_numpy(original_out), + to_numpy( + new_out.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose(to_numpy(original_cache), + to_numpy(new_cache.transpose(2, 3)), + rtol=1e-02, + atol=1e-03) + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: torch.Tensor, + cache: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute scaled dot product attention. + + Args: + q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). + k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). + v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). + mask (torch.Tensor): Mask tensor, + (#batch, head, chunk_size, cache_t + chunk_size). + cache (torch.Tensor): Cache tensor + (1, head, d_k * 2, cache_t), + where `cache_t == chunk_size * left_chunks`. + + + Returns: + torch.Tensor: Output tensor (#batch, size, 1, chunk_size). + torch.Tensor: Cache tensor + (1, head, d_k * 2, cache_t + chunk_size) + where `cache_t == chunk_size * left_chunks` + """ + # 1. Forward QKV + q = self.linear_q(q) # (1, d, 1, c) d == size, c == chunk_size + k = self.linear_k(k) # (1, d, 1, c) + v = self.linear_v(v) # (1, d, 1, c) + q = q.view(1, self.h, self.d_k, self.chunk_size) + k = k.view(1, self.h, self.d_k, self.chunk_size) + v = v.view(1, self.h, self.d_k, self.chunk_size) + q = q.transpose(2, 3) # (batch, head, time1, d_k) + k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) + k = torch.cat((k_cache, k), dim=3) + v = torch.cat((v_cache, v), dim=3) + new_cache = torch.cat((k, v), dim=2) + # 2. (Q^T)K + scores = torch.matmul(q, k) * self.denom # (#b, n_head, time1, time2) + # 3. Forward attention + mask = mask.eq(0) + scores = scores.masked_fill(mask, -float('inf')) + attn = self.activation(scores).masked_fill(mask, 0.0) + attn = attn.transpose(2, 3) + x = torch.matmul(v, attn) + x = x.view(1, self.d_k * self.h, 1, self.chunk_size) + x_out = self.linear_out(x) + return x_out, new_cache + + +class BPUConvolution(torch.nn.Module): + """Refactor wenet/transformer/convolution.py::ConvolutionModule + + NOTE(xcsong): Only suport use_layer_norm == False + """ + + def __init__(self, module): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.lorder = module.lorder + self.use_layer_norm = False + self.activation = module.activation + channels = module.pointwise_conv1.weight.size(1) + self.channels = channels + kernel_size = module.depthwise_conv.weight.size(2) + assert module.use_layer_norm is False + + # 1. Modify self.pointwise_conv1 + self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) + + # 2. 
Modify self.depthwise_conv + self.depthwise_conv = torch.nn.Conv2d(channels, + channels, (1, kernel_size), + stride=1, + groups=channels) + self.depthwise_conv.weight = torch.nn.Parameter( + module.depthwise_conv.weight.unsqueeze(-2)) + self.depthwise_conv.bias = torch.nn.Parameter( + module.depthwise_conv.bias) + + # 3. Modify self.norm, Only support batchnorm2d + self.norm = torch.nn.BatchNorm2d(channels) + self.norm.training = False + self.norm.num_features = module.norm.num_features + self.norm.eps = module.norm.eps + self.norm.momentum = module.norm.momentum + self.norm.weight = torch.nn.Parameter(module.norm.weight) + self.norm.bias = torch.nn.Parameter(module.norm.bias) + self.norm.running_mean = module.norm.running_mean + self.norm.running_var = module.norm.running_var + + # 4. Modify self.pointwise_conv2 + self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) + + # 5. Identity conv, for running `concat` on BPU + self.identity = BPUIdentity(channels) + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, 8, self.channels) + cache = torch.zeros((1, self.channels, self.lorder)) + original_out, original_cache = module(random_data, cache=cache) + random_data = random_data.transpose(1, 2).unsqueeze(2) + cache = cache.unsqueeze(2) + new_out, new_cache = self.forward(random_data, cache) + np.testing.assert_allclose(to_numpy(original_out), + to_numpy( + new_out.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose(to_numpy(original_cache), + to_numpy(new_cache.squeeze(2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor, + cache: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute convolution module. + Args: + x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, 1, cache_t). + Returns: + torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). + torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). + """ + # Concat cache + x = torch.cat((self.identity(cache), self.identity(x)), dim=3) + new_cache = x[:, :, :, -self.lorder:] + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, 1, dim) + x = torch.nn.functional.glu(x, dim=1) # (b, channel, 1, dim) + + # Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + x = self.pointwise_conv2(x) + return x, new_cache + + +class BPUFFN(torch.nn.Module): + """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward + """ + + def __init__(self, module): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.activation = module.activation + + # 1. Modify self.w_x + self.w_1 = BPULinear(module.w_1) + self.w_2 = BPULinear(module.w_2) + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, 8, self.w_1.idim) + original_out = module(random_data) + random_data = random_data.transpose(1, 2).unsqueeze(2) + new_out = self.forward(random_data) + np.testing.assert_allclose(to_numpy(original_out), + to_numpy( + new_out.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function. 
+ + Args: + xs: input tensor (B, D, 1, L) + Returns: + output tensor, (B, D, 1, L) + """ + return self.w_2(self.activation(self.w_1(x))) + + +class BPUConformerEncoderLayer(torch.nn.Module): + """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer + """ + + def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.size = module.size + assert module.normalize_before is True + assert module.concat_after is False + + # 1. Modify submodules + self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) + self.self_attn = BPUMultiHeadedAttention(module.self_attn, chunk_size, + left_chunks) + self.conv_module = BPUConvolution(module.conv_module) + self.feed_forward = BPUFFN(module.feed_forward) + + # 2. Modify norms + self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) + self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, + ln_run_on_bpu) + self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, chunk_size, + ln_run_on_bpu) + self.norm_conv = BPULayerNorm(module.norm_conv, chunk_size, + ln_run_on_bpu) + self.norm_final = BPULayerNorm(module.norm_final, chunk_size, + ln_run_on_bpu) + + # 3. 4-D ff_scale + self.register_buffer("ff_scale", + torch.full((1, self.size, 1, 1), module.ff_scale)) + + self.check_equal(original) + + def check_equal(self, module): + time1 = self.self_attn.chunk_size + time2 = self.self_attn.time + h, d_k = self.self_attn.h, self.self_attn.d_k + random_x = torch.randn(1, time1, self.size) + att_mask = torch.ones(1, h, time1, time2) + att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) + cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) + original_x, _, original_att_cache, original_cnn_cache = module( + random_x, + att_mask[:, 0, :, :], + torch.empty(0), + att_cache=att_cache, + cnn_cache=cnn_cache) + random_x = random_x.transpose(1, 2).unsqueeze(2) + att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) + cnn_cache = cnn_cache.unsqueeze(2) + new_x, new_att_cache, new_cnn_cache = self.forward( + random_x, att_mask, att_cache, cnn_cache) + np.testing.assert_allclose(to_numpy(original_att_cache), + to_numpy(new_att_cache.transpose(2, 3)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose(to_numpy(original_x), + to_numpy(new_x.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose(to_numpy(original_cnn_cache), + to_numpy(new_cnn_cache.squeeze(2)), + rtol=1e-02, + atol=1e-03) + + def forward( + self, x: torch.Tensor, att_mask: torch.Tensor, att_cache: torch.Tensor, + cnn_cache: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + + Args: + x (torch.Tensor): (#batch, size, 1, chunk_size) + att_mask (torch.Tensor): Mask tensor for the input + (#batch, head, chunk_size, cache_t1 + chunk_size), + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, d_k * 2, cache_t1), head * d_k == size. + cnn_cache (torch.Tensor): Convolution cache in conformer layer + (#batch=1, size, 1, cache_t2) + Returns: + torch.Tensor: Output tensor (#batch, size, 1, chunk_size). + torch.Tensor: att_cache tensor, + (1, head, d_k * 2, cache_t1 + chunk_size). + torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2). + """ + # 1. ffn_macaron + residual = x + x = self.norm_ff_macron(x) + x = residual + self.ff_scale * self.feed_forward_macaron(x) + + # 2. 
attention + residual = x + x = self.norm_mha(x) + x_att, new_att_cache = self.self_attn(x, x, x, att_mask, att_cache) + x = residual + x_att + + # 3. convolution + residual = x + x = self.norm_conv(x) + x, new_cnn_cache = self.conv_module(x, cnn_cache) + x = residual + x + + # 4. ffn + residual = x + x = self.norm_ff(x) + x = residual + self.ff_scale * self.feed_forward(x) + + # 5. final post-norm + x = self.norm_final(x) + + return x, new_att_cache, new_cnn_cache + + +class BPUConformerEncoder(torch.nn.Module): + """Refactor wenet/transformer/encoder.py::ConformerEncoder + """ + + def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + output_size = module.output_size() + self._output_size = module.output_size() + self.after_norm = module.after_norm + self.chunk_size = chunk_size + self.left_chunks = left_chunks + self.head = module.encoders[0].self_attn.h + self.layers = len(module.encoders) + + # 1. Modify submodules + self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) + self.embed = BPUConv2dSubsampling8(module.embed) + self.encoders = torch.nn.ModuleList() + for layer in module.encoders: + self.encoders.append( + BPUConformerEncoderLayer(layer, chunk_size, left_chunks, + ln_run_on_bpu)) + + # 2. Auxiliary conv + self.identity_cnncache = BPUIdentity(output_size) + + self.check_equal(original) + + def check_equal(self, module): + time1 = self.encoders[0].self_attn.chunk_size + time2 = self.encoders[0].self_attn.time + layers = self.layers + h, d_k = self.head, self.encoders[0].self_attn.d_k + decoding_window = (self.chunk_size - 1) * \ + module.embed.subsampling_rate + \ + module.embed.right_context + 1 + lorder = self.encoders[0].conv_module.lorder + random_x = torch.randn(1, decoding_window, 80) + att_mask = torch.ones(1, h, time1, time2) + att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) + cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) + orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( + random_x, + 0, + time2 - time1, + att_mask=att_mask[:, 0, :, :], + att_cache=att_cache, + cnn_cache=cnn_cache) + random_x = random_x.unsqueeze(0) + att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) + cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) + new_x, new_att_cache, new_cnn_cache = self.forward( + random_x, att_cache, cnn_cache, att_mask) + caches = torch.split(new_att_cache, h, dim=1) + caches = [c.transpose(2, 3) for c in caches] + np.testing.assert_allclose(to_numpy(orig_att_cache), + to_numpy(torch.cat(caches, dim=0)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose(to_numpy(orig_x), + to_numpy(new_x.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + np.testing.assert_allclose( + to_numpy(orig_cnn_cache), + to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward( + self, xs: torch.Tensor, att_cache: torch.Tensor, + cnn_cache: torch.Tensor, att_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ Forward just one chunk + + Args: + xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (1, head * elayers, d_k * 2, cache_t1), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * left_chunks`. 
+ cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (1, hidden-dim, elayers, cache_t2), where + `cache_t2 == cnn.lorder - 1` + att_mask (torch.Tensor): Mask tensor for the input + (#batch, head, chunk_size, cache_t1 + chunk_size), + + Returns: + torch.Tensor: output of current input xs, + with shape (b=1, hidden-dim, 1, chunk_size). + torch.Tensor: new attention cache required for next chunk, with + same shape as the original att_cache. + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. + """ + # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time) + xs = xs.transpose(2, 3) + xs = self.global_cmvn(xs) + # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size) + xs = self.embed(xs) + + att_cache = torch.split(att_cache, self.head, dim=1) + cnn_cache = self.identity_cnncache(cnn_cache) + cnn_cache = torch.split(cnn_cache, 1, dim=2) + r_att_cache = [] + r_cnn_cache = [] + for i, layer in enumerate(self.encoders): + xs, new_att_cache, new_cnn_cache = layer(xs, + att_mask, + att_cache=att_cache[i], + cnn_cache=cnn_cache[i]) + r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:]) + r_cnn_cache.append(new_cnn_cache) + r_att_cache = torch.cat(r_att_cache, dim=1) + r_cnn_cache = self.identity_cnncache(torch.cat(r_cnn_cache, dim=2)) + + xs = xs.squeeze(2).transpose(1, 2).contiguous() + xs = self.after_norm(xs) + # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input. + xs = xs.transpose(1, 2).contiguous().unsqueeze(2) # (B, C, 1, T) + + return (xs, r_att_cache, r_cnn_cache) + + +class BPUCTC(torch.nn.Module): + """Refactor wenet/transformer/ctc.py::CTC + """ + + def __init__(self, module): + super().__init__() + # Unchanged submodules and attributes + original = copy.deepcopy(module) + self.idim = module.ctc_lo.weight.size(1) + num_class = module.ctc_lo.weight.size(0) + + # 1. Modify self.ctc_lo, Split final projection to meet the + # requirment of maximum in/out channels (2048 for XJ3) + self.ctc_lo = torch.nn.ModuleList() + self.split_size = [] + num_split = (num_class - 1) // 2048 + 1 + for idx in range(num_split): + out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 + conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) + self.ctc_lo.append(conv_ele) + self.split_size.append(out_channel) + orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) + orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) + for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): + w = w.unsqueeze(2).unsqueeze(3) + self.ctc_lo[i].weight = torch.nn.Parameter(w) + self.ctc_lo[i].bias = torch.nn.Parameter(b) + + self.check_equal(original) + + def check_equal(self, module): + random_data = torch.randn(1, 100, self.idim) + original_result = module.ctc_lo(random_data) + random_data = random_data.transpose(1, 2).unsqueeze(2) + new_result = self.forward(random_data) + np.testing.assert_allclose(to_numpy(original_result), + to_numpy( + new_result.squeeze(2).transpose(1, 2)), + rtol=1e-02, + atol=1e-03) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """frame activations, without softmax. 
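+        The final projection is evaluated as several 1x1 Conv2d slices
+        (each capped at 2048 output channels, see __init__) and the
+        partial results are concatenated along the class dimension.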
+ + Args: + Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) + Returns: + torch.Tensor: (B, num_class, 1, chunk_size) + """ + out = [] + for i, layer in enumerate(self.ctc_lo): + out.append(layer(x)) + out = torch.cat(out, dim=1) + return out + + +def export_encoder(asr_model, args): + logger.info("Stage-1: export encoder") + decode_window, mel_dim = args.decoding_window, args.feature_size + encoder = BPUConformerEncoder(asr_model.encoder, args.chunk_size, + args.num_decoding_left_chunks, + args.ln_run_on_bpu) + encoder.eval() + encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx') + + logger.info("Stage-1.1: prepare inputs for encoder") + chunk = torch.randn((1, 1, decode_window, mel_dim)) + required_cache_size = encoder.chunk_size * encoder.left_chunks + kv_time = required_cache_size + encoder.chunk_size + hidden, layers = encoder._output_size, len(encoder.encoders) + head = encoder.encoders[0].self_attn.h + d_k = hidden // head + lorder = encoder.encoders[0].conv_module.lorder + att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) + att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) + att_mask[:, :, :, :required_cache_size] = 0 + cnn_cache = torch.zeros((1, hidden, layers, lorder)) + inputs = (chunk, att_cache, cnn_cache, att_mask) + logger.info("chunk.size(): {} att_cache.size(): {} " + "cnn_cache.size(): {} att_mask.size(): {}".format( + list(chunk.size()), list(att_cache.size()), + list(cnn_cache.size()), list(att_mask.size()))) + + logger.info("Stage-1.2: torch.onnx.export") + # NOTE(xcsong): Below attributes will be used in + # onnx2horizonbin.py::generate_config() + attributes = {} + attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask" + attributes['output_name'] = "output;r_att_cache;r_cnn_cache" + attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap" + attributes['norm_type'] = \ + "no_preprocess;no_preprocess;no_preprocess;no_preprocess" + attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW" + attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW" + attributes['input_shape'] = \ + "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( + chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3), + att_cache.size(0), att_cache.size(1), att_cache.size(2), + att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1), + cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0), + att_mask.size(1), att_mask.size(2), att_mask.size(3) + ) + torch.onnx.export( # NOTE(xcsong): only support opset==11 + encoder, + inputs, + encoder_outpath, + opset_version=11, + export_params=True, + do_constant_folding=True, + input_names=attributes['input_name'].split(';'), + output_names=attributes['output_name'].split(';'), + dynamic_axes=None, + verbose=False) + onnx_encoder = onnx.load(encoder_outpath) + for k in vars(args): + meta = onnx_encoder.metadata_props.add() + meta.key, meta.value = str(k), str(getattr(args, k)) + for k in attributes: + meta = onnx_encoder.metadata_props.add() + meta.key, meta.value = str(k), str(attributes[k]) + onnx.checker.check_model(onnx_encoder) + onnx.helper.printable_graph(onnx_encoder.graph) + onnx.save(onnx_encoder, encoder_outpath) + print_input_output_info(onnx_encoder, "onnx_encoder") + logger.info('Export onnx_encoder, done! 
see {}'.format(encoder_outpath)) + + logger.info("Stage-1.3: check onnx_encoder and torch_encoder") + torch_output = [] + torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) + torch_att_cache = copy.deepcopy(att_cache) + torch_cnn_cache = copy.deepcopy(cnn_cache) + for i in range(10): + logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" + ", att_mask: {}".format(i, list(torch_chunk.size()), + list(torch_att_cache.size()), + list(torch_cnn_cache.size()), + list(torch_att_mask.size()))) + torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 + out, torch_att_cache, torch_cnn_cache = encoder( + torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask) + torch_output.append(out) + torch_output = torch.cat(torch_output, dim=-1) + + onnx_output = [] + onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) + onnx_att_cache = to_numpy(att_cache) + onnx_cnn_cache = to_numpy(cnn_cache) + ort_session = onnxruntime.InferenceSession(encoder_outpath) + input_names = [node.name for node in onnx_encoder.graph.input] + for i in range(10): + logger.info("onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," + " att_mask: {}".format(i, onnx_chunk.shape, + onnx_att_cache.shape, + onnx_cnn_cache.shape, + onnx_att_mask.shape)) + onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1 + ort_inputs = { + 'chunk': onnx_chunk, + 'att_cache': onnx_att_cache, + 'cnn_cache': onnx_cnn_cache, + 'att_mask': onnx_att_mask, + } + ort_outs = ort_session.run(None, ort_inputs) + onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] + onnx_output.append(ort_outs[0]) + onnx_output = np.concatenate(onnx_output, axis=-1) + + np.testing.assert_allclose(to_numpy(torch_output), + onnx_output, + rtol=1e-03, + atol=1e-04) + meta = ort_session.get_modelmeta() + logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) + logger.info("Check onnx_encoder, pass!") + return encoder, ort_session + + +def export_ctc(asr_model, args): + logger.info("Stage-2: export ctc") + ctc = BPUCTC(asr_model.ctc).eval() + ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx') + + logger.info("Stage-2.1: prepare inputs for ctc") + hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) + + logger.info("Stage-2.2: torch.onnx.export") + # NOTE(xcsong): Below attributes will be used in + # onnx2horizonbin.py::generate_config() + attributes = {} + attributes['input_name'], attributes['input_type'] = "hidden", "featuremap" + attributes['norm_type'] = "no_preprocess" + attributes['input_layout_train'] = "NCHW" + attributes['input_layout_rt'] = "NCHW" + attributes['input_shape'] = "{}x{}x{}x{}".format( + hidden.size(0), + hidden.size(1), + hidden.size(2), + hidden.size(3), + ) + torch.onnx.export(ctc, + hidden, + ctc_outpath, + opset_version=11, + export_params=True, + do_constant_folding=True, + input_names=['hidden'], + output_names=['probs'], + dynamic_axes=None, + verbose=False) + onnx_ctc = onnx.load(ctc_outpath) + for k in vars(args): + meta = onnx_ctc.metadata_props.add() + meta.key, meta.value = str(k), str(getattr(args, k)) + for k in attributes: + meta = onnx_ctc.metadata_props.add() + meta.key, meta.value = str(k), str(attributes[k]) + onnx.checker.check_model(onnx_ctc) + onnx.helper.printable_graph(onnx_ctc.graph) + onnx.save(onnx_ctc, ctc_outpath) + print_input_output_info(onnx_ctc, "onnx_ctc") + logger.info('Export onnx_ctc, done! 
see {}'.format(ctc_outpath)) + + logger.info("Stage-2.3: check onnx_ctc and torch_ctc") + torch_output = ctc(hidden) + ort_session = onnxruntime.InferenceSession(ctc_outpath) + onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) + + np.testing.assert_allclose(to_numpy(torch_output), + onnx_output[0], + rtol=1e-03, + atol=1e-04) + meta = ort_session.get_modelmeta() + logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) + logger.info("Check onnx_ctc, pass!") + return ctc, ort_session + + +def export_decoder(asr_model, args): + logger.info("Currently, Decoder is not supported.") + + +if __name__ == '__main__': + torch.manual_seed(777) + args = get_args() + args.ln_run_on_bpu = False + # NOTE(xcsong): XJ3 BPU only support static shapes + assert args.chunk_size > 0 + assert args.num_decoding_left_chunks > 0 + os.system("mkdir -p " + args.output_dir) + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + model, configs = init_model(args, configs) + model.eval() + print(model) + + args.feature_size = configs['input_dim'] + args.output_size = model.encoder.output_size() + args.decoding_window = (args.chunk_size - 1) * \ + model.encoder.embed.subsampling_rate + \ + model.encoder.embed.right_context + 1 + + export_encoder(model, args) + export_ctc(model, args) + export_decoder(model, args) diff --git a/wenet/bin/export_onnx_cpu.py b/wenet/bin/export_onnx_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..f382545a072bb8babcb69c65913ad548d830e8a0 --- /dev/null +++ b/wenet/bin/export_onnx_cpu.py @@ -0,0 +1,470 @@ +# Copyright (c) 2022, Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
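+
+# Usage sketch (paths and values below are only illustrative; the flags
+# mirror get_args() defined in this file):
+#
+#   python wenet/bin/export_onnx_cpu.py \
+#       --config exp/train.yaml \
+#       --checkpoint exp/final.pt \
+#       --output_dir exp/onnx \
+#       --chunk_size 16 \
+#       --num_decoding_left_chunks 4 \
+#       --reverse_weight 0.5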
+ +from __future__ import print_function + +import argparse +import logging +import os +import copy +import sys + +import torch +import yaml +import numpy as np + +from wenet.utils.init_model import init_model + +try: + import onnx + import onnxruntime + from onnxruntime.quantization import quantize_dynamic, QuantType +except ImportError: + print('Please install onnx and onnxruntime!') + sys.exit(1) + + +def get_args(): + parser = argparse.ArgumentParser(description='export your script model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--output_dir', required=True, help='output directory') + parser.add_argument('--chunk_size', + required=True, + type=int, + help='decoding chunk size') + parser.add_argument('--num_decoding_left_chunks', + required=True, + type=int, + help='cache chunks') + parser.add_argument('--reverse_weight', + default=0.5, + type=float, + help='reverse_weight in attention_rescoing') + args = parser.parse_args() + return args + + +def to_numpy(tensor): + if tensor.requires_grad: + return tensor.detach().cpu().numpy() + else: + return tensor.cpu().numpy() + + +def print_input_output_info(onnx_model, name, prefix="\t\t"): + input_names = [node.name for node in onnx_model.graph.input] + input_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] + for node in onnx_model.graph.input] + output_names = [node.name for node in onnx_model.graph.output] + output_shapes = [[d.dim_value for d in node.type.tensor_type.shape.dim] + for node in onnx_model.graph.output] + print("{}{} inputs : {}".format(prefix, name, input_names)) + print("{}{} input shapes : {}".format(prefix, name, input_shapes)) + print("{}{} outputs: {}".format(prefix, name, output_names)) + print("{}{} output shapes : {}".format(prefix, name, output_shapes)) + + +def export_encoder(asr_model, args): + print("Stage-1: export encoder") + encoder = asr_model.encoder + encoder.forward = encoder.forward_chunk + encoder_outpath = os.path.join(args['output_dir'], 'encoder.onnx') + + print("\tStage-1.1: prepare inputs for encoder") + chunk = torch.randn( + (args['batch'], args['decoding_window'], args['feature_size'])) + offset = 0 + # NOTE(xcsong): The uncertainty of `next_cache_start` only appears + # in the first few chunks, this is caused by dynamic att_cache shape, i,e + # (0, 0, 0, 0) for 1st chunk and (elayers, head, ?, d_k*2) for subsequent + # chunks. One way to ease the ONNX export is to keep `next_cache_start` + # as a fixed value. To do this, for the **first** chunk, if + # left_chunks > 0, we feed real cache & real mask to the model, otherwise + # fake cache & fake mask. In this way, we get: + # 1. 16/-1 mode: next_cache_start == 0 for all chunks + # 2. 16/4 mode: next_cache_start == chunk_size for all chunks + # 3. 16/0 mode: next_cache_start == chunk_size for all chunks + # 4. -1/-1 mode: next_cache_start == 0 for all chunks + # NO MORE DYNAMIC CHANGES!! + # + # NOTE(Mddct): We retain the current design for the convenience of supporting some + # inference frameworks without dynamic shapes. 
If you're interested in all-in-one + # model that supports different chunks please see: + # https://github.com/wenet-e2e/wenet/pull/1174 + + if args['left_chunks'] > 0: # 16/4 + required_cache_size = args['chunk_size'] * args['left_chunks'] + offset = required_cache_size + # Real cache + att_cache = torch.zeros( + (args['num_blocks'], args['head'], required_cache_size, + args['output_size'] // args['head'] * 2)) + # Real mask + att_mask = torch.ones( + (args['batch'], 1, required_cache_size + args['chunk_size']), + dtype=torch.bool) + att_mask[:, :, :required_cache_size] = 0 + elif args['left_chunks'] <= 0: # 16/-1, -1/-1, 16/0 + required_cache_size = -1 if args['left_chunks'] < 0 else 0 + # Fake cache + att_cache = torch.zeros((args['num_blocks'], args['head'], 0, + args['output_size'] // args['head'] * 2)) + # Fake mask + att_mask = torch.ones((0, 0, 0), dtype=torch.bool) + cnn_cache = torch.zeros( + (args['num_blocks'], args['batch'], args['output_size'], + args['cnn_module_kernel'] - 1)) + inputs = (chunk, offset, required_cache_size, att_cache, cnn_cache, + att_mask) + print("\t\tchunk.size(): {}\n".format(chunk.size()), + "\t\toffset: {}\n".format(offset), + "\t\trequired_cache: {}\n".format(required_cache_size), + "\t\tatt_cache.size(): {}\n".format(att_cache.size()), + "\t\tcnn_cache.size(): {}\n".format(cnn_cache.size()), + "\t\tatt_mask.size(): {}\n".format(att_mask.size())) + + print("\tStage-1.2: torch.onnx.export") + dynamic_axes = { + 'chunk': { + 1: 'T' + }, + 'att_cache': { + 2: 'T_CACHE' + }, + 'att_mask': { + 2: 'T_ADD_T_CACHE' + }, + 'output': { + 1: 'T' + }, + 'r_att_cache': { + 2: 'T_CACHE' + }, + } + # NOTE(xcsong): We keep dynamic axes even if in 16/4 mode, this is + # to avoid padding the last chunk (which usually contains less + # frames than required). For users who want static axes, just pop + # out specific axis. + # if args['chunk_size'] > 0: # 16/4, 16/-1, 16/0 + # dynamic_axes.pop('chunk') + # dynamic_axes.pop('output') + # if args['left_chunks'] >= 0: # 16/4, 16/0 + # # NOTE(xsong): since we feed real cache & real mask into the + # # model when left_chunks > 0, the shape of cache will never + # # be changed. + # dynamic_axes.pop('att_cache') + # dynamic_axes.pop('r_att_cache') + torch.onnx.export(encoder, + inputs, + encoder_outpath, + opset_version=13, + export_params=True, + do_constant_folding=True, + input_names=[ + 'chunk', 'offset', 'required_cache_size', + 'att_cache', 'cnn_cache', 'att_mask' + ], + output_names=['output', 'r_att_cache', 'r_cnn_cache'], + dynamic_axes=dynamic_axes, + verbose=False) + onnx_encoder = onnx.load(encoder_outpath) + for (k, v) in args.items(): + meta = onnx_encoder.metadata_props.add() + meta.key, meta.value = str(k), str(v) + onnx.checker.check_model(onnx_encoder) + onnx.helper.printable_graph(onnx_encoder.graph) + # NOTE(xcsong): to add those metadatas we need to reopen + # the file and resave it. + onnx.save(onnx_encoder, encoder_outpath) + print_input_output_info(onnx_encoder, "onnx_encoder") + # Dynamic quantization + model_fp32 = encoder_outpath + model_quant = os.path.join(args['output_dir'], 'encoder.quant.onnx') + quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) + print('\t\tExport onnx_encoder, done! 
see {}'.format(encoder_outpath)) + + print("\tStage-1.3: check onnx_encoder and torch_encoder") + torch_output = [] + torch_chunk = copy.deepcopy(chunk) + torch_offset = copy.deepcopy(offset) + torch_required_cache_size = copy.deepcopy(required_cache_size) + torch_att_cache = copy.deepcopy(att_cache) + torch_cnn_cache = copy.deepcopy(cnn_cache) + torch_att_mask = copy.deepcopy(att_mask) + for i in range(10): + print("\t\ttorch chunk-{}: {}, offset: {}, att_cache: {}," + " cnn_cache: {}, att_mask: {}".format( + i, list(torch_chunk.size()), torch_offset, + list(torch_att_cache.size()), list(torch_cnn_cache.size()), + list(torch_att_mask.size()))) + # NOTE(xsong): att_mask of the first few batches need changes if + # we use 16/4 mode. + if args['left_chunks'] > 0: # 16/4 + torch_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 + out, torch_att_cache, torch_cnn_cache = encoder( + torch_chunk, torch_offset, torch_required_cache_size, + torch_att_cache, torch_cnn_cache, torch_att_mask) + torch_output.append(out) + torch_offset += out.size(1) + torch_output = torch.cat(torch_output, dim=1) + + onnx_output = [] + onnx_chunk = to_numpy(chunk) + onnx_offset = np.array((offset)).astype(np.int64) + onnx_required_cache_size = np.array((required_cache_size)).astype(np.int64) + onnx_att_cache = to_numpy(att_cache) + onnx_cnn_cache = to_numpy(cnn_cache) + onnx_att_mask = to_numpy(att_mask) + ort_session = onnxruntime.InferenceSession( + encoder_outpath, providers=['CPUExecutionProvider']) + input_names = [node.name for node in onnx_encoder.graph.input] + for i in range(10): + print("\t\tonnx chunk-{}: {}, offset: {}, att_cache: {}," + " cnn_cache: {}, att_mask: {}".format(i, onnx_chunk.shape, + onnx_offset, + onnx_att_cache.shape, + onnx_cnn_cache.shape, + onnx_att_mask.shape)) + # NOTE(xsong): att_mask of the first few batches need changes if + # we use 16/4 mode. + if args['left_chunks'] > 0: # 16/4 + onnx_att_mask[:, :, -(args['chunk_size'] * (i + 1)):] = 1 + ort_inputs = { + 'chunk': onnx_chunk, + 'offset': onnx_offset, + 'required_cache_size': onnx_required_cache_size, + 'att_cache': onnx_att_cache, + 'cnn_cache': onnx_cnn_cache, + 'att_mask': onnx_att_mask + } + # NOTE(xcsong): If we use 16/-1, -1/-1 or 16/0 mode, `next_cache_start` + # will be hardcoded to 0 or chunk_size by ONNX, thus + # required_cache_size and att_mask are no more needed and they will + # be removed by ONNX automatically. 
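+        # Keep only feeds that are still inputs of the exported graph;
+        # onnxruntime rejects a run() call whose feed dict contains a name
+        # the model does not declare.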
+ for k in list(ort_inputs): + if k not in input_names: + ort_inputs.pop(k) + ort_outs = ort_session.run(None, ort_inputs) + onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] + onnx_output.append(ort_outs[0]) + onnx_offset += ort_outs[0].shape[1] + onnx_output = np.concatenate(onnx_output, axis=1) + + np.testing.assert_allclose(to_numpy(torch_output), + onnx_output, + rtol=1e-03, + atol=1e-05) + meta = ort_session.get_modelmeta() + print("\t\tcustom_metadata_map={}".format(meta.custom_metadata_map)) + print("\t\tCheck onnx_encoder, pass!") + + +def export_ctc(asr_model, args): + print("Stage-2: export ctc") + ctc = asr_model.ctc + ctc.forward = ctc.log_softmax + ctc_outpath = os.path.join(args['output_dir'], 'ctc.onnx') + + print("\tStage-2.1: prepare inputs for ctc") + hidden = torch.randn( + (args['batch'], args['chunk_size'] if args['chunk_size'] > 0 else 16, + args['output_size'])) + + print("\tStage-2.2: torch.onnx.export") + dynamic_axes = {'hidden': {1: 'T'}, 'probs': {1: 'T'}} + torch.onnx.export(ctc, + hidden, + ctc_outpath, + opset_version=13, + export_params=True, + do_constant_folding=True, + input_names=['hidden'], + output_names=['probs'], + dynamic_axes=dynamic_axes, + verbose=False) + onnx_ctc = onnx.load(ctc_outpath) + for (k, v) in args.items(): + meta = onnx_ctc.metadata_props.add() + meta.key, meta.value = str(k), str(v) + onnx.checker.check_model(onnx_ctc) + onnx.helper.printable_graph(onnx_ctc.graph) + onnx.save(onnx_ctc, ctc_outpath) + print_input_output_info(onnx_ctc, "onnx_ctc") + # Dynamic quantization + model_fp32 = ctc_outpath + model_quant = os.path.join(args['output_dir'], 'ctc.quant.onnx') + quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) + print('\t\tExport onnx_ctc, done! see {}'.format(ctc_outpath)) + + print("\tStage-2.3: check onnx_ctc and torch_ctc") + torch_output = ctc(hidden) + ort_session = onnxruntime.InferenceSession( + ctc_outpath, providers=['CPUExecutionProvider']) + onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)}) + + np.testing.assert_allclose(to_numpy(torch_output), + onnx_output[0], + rtol=1e-03, + atol=1e-05) + print("\t\tCheck onnx_ctc, pass!") + + +def export_decoder(asr_model, args): + print("Stage-3: export decoder") + decoder = asr_model + # NOTE(lzhin): parameters of encoder will be automatically removed + # since they are not used during rescoring. + decoder.forward = decoder.forward_attention_decoder + decoder_outpath = os.path.join(args['output_dir'], 'decoder.onnx') + + print("\tStage-3.1: prepare inputs for decoder") + # hardcode time->200 nbest->10 len->20, they are dynamic axes. 
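+    # The dummy tensors below only fix example shapes for tracing; the
+    # dynamic_axes mapping in Stage-3.2 keeps T (encoder frames), NBEST and
+    # L (hypothesis length) flexible at inference time.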
+ encoder_out = torch.randn((1, 200, args['output_size'])) + hyps = torch.randint(low=0, high=args['vocab_size'], size=[10, 20]) + hyps[:, 0] = args['vocab_size'] - 1 # + hyps_lens = torch.randint(low=15, high=21, size=[10]) + + print("\tStage-3.2: torch.onnx.export") + dynamic_axes = { + 'hyps': { + 0: 'NBEST', + 1: 'L' + }, + 'hyps_lens': { + 0: 'NBEST' + }, + 'encoder_out': { + 1: 'T' + }, + 'score': { + 0: 'NBEST', + 1: 'L' + }, + 'r_score': { + 0: 'NBEST', + 1: 'L' + } + } + inputs = (hyps, hyps_lens, encoder_out, args['reverse_weight']) + torch.onnx.export( + decoder, + inputs, + decoder_outpath, + opset_version=13, + export_params=True, + do_constant_folding=True, + input_names=['hyps', 'hyps_lens', 'encoder_out', 'reverse_weight'], + output_names=['score', 'r_score'], + dynamic_axes=dynamic_axes, + verbose=False) + onnx_decoder = onnx.load(decoder_outpath) + for (k, v) in args.items(): + meta = onnx_decoder.metadata_props.add() + meta.key, meta.value = str(k), str(v) + onnx.checker.check_model(onnx_decoder) + onnx.helper.printable_graph(onnx_decoder.graph) + onnx.save(onnx_decoder, decoder_outpath) + print_input_output_info(onnx_decoder, "onnx_decoder") + model_fp32 = decoder_outpath + model_quant = os.path.join(args['output_dir'], 'decoder.quant.onnx') + quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8) + print('\t\tExport onnx_decoder, done! see {}'.format(decoder_outpath)) + + print("\tStage-3.3: check onnx_decoder and torch_decoder") + torch_score, torch_r_score = decoder(hyps, hyps_lens, encoder_out, + args['reverse_weight']) + ort_session = onnxruntime.InferenceSession( + decoder_outpath, providers=['CPUExecutionProvider']) + input_names = [node.name for node in onnx_decoder.graph.input] + ort_inputs = { + 'hyps': to_numpy(hyps), + 'hyps_lens': to_numpy(hyps_lens), + 'encoder_out': to_numpy(encoder_out), + 'reverse_weight': np.array((args['reverse_weight'])), + } + for k in list(ort_inputs): + if k not in input_names: + ort_inputs.pop(k) + onnx_output = ort_session.run(None, ort_inputs) + + np.testing.assert_allclose(to_numpy(torch_score), + onnx_output[0], + rtol=1e-03, + atol=1e-05) + if args['is_bidirectional_decoder'] and args['reverse_weight'] > 0.0: + np.testing.assert_allclose(to_numpy(torch_r_score), + onnx_output[1], + rtol=1e-03, + atol=1e-05) + print("\t\tCheck onnx_decoder, pass!") + + +def main(): + torch.manual_seed(777) + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + output_dir = args.output_dir + os.system("mkdir -p " + output_dir) + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + model, configs = init_model(args, configs) + model.eval() + print(model) + + arguments = {} + arguments['output_dir'] = output_dir + arguments['batch'] = 1 + arguments['chunk_size'] = args.chunk_size + arguments['left_chunks'] = args.num_decoding_left_chunks + arguments['reverse_weight'] = args.reverse_weight + arguments['output_size'] = configs['encoder_conf']['output_size'] + arguments['num_blocks'] = configs['encoder_conf']['num_blocks'] + arguments['cnn_module_kernel'] = configs['encoder_conf'].get( + 'cnn_module_kernel', 1) + arguments['head'] = configs['encoder_conf']['attention_heads'] + arguments['feature_size'] = configs['input_dim'] + arguments['vocab_size'] = configs['output_dim'] + # NOTE(xcsong): if chunk_size == -1, hardcode to 67 + arguments['decoding_window'] = (args.chunk_size - 1) * \ + 
model.encoder.embed.subsampling_rate + \ + model.encoder.embed.right_context + 1 if args.chunk_size > 0 else 67 + arguments['encoder'] = configs['encoder'] + arguments['decoder'] = configs['decoder'] + arguments['subsampling_rate'] = model.subsampling_rate() + arguments['right_context'] = model.right_context() + arguments['sos_symbol'] = model.sos_symbol() + arguments['eos_symbol'] = model.eos_symbol() + arguments['is_bidirectional_decoder'] = 1 \ + if model.is_bidirectional_decoder() else 0 + + # NOTE(xcsong): Please note that -1/-1 means non-streaming model! It is + # not a [16/4 16/-1 16/0] all-in-one model and it should not be used in + # streaming mode (i.e., setting chunk_size=16 in `decoder_main`). If you + # want to use 16/-1 or any other streaming mode in `decoder_main`, + # please export onnx in the same config. + if arguments['left_chunks'] > 0: + assert arguments['chunk_size'] > 0 # -1/4 not supported + + export_encoder(model, arguments) + export_ctc(model, arguments) + export_decoder(model, arguments) + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/export_onnx_gpu.py b/wenet/bin/export_onnx_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6c1dbe0584889c96f1c4556d989149e34b2774 --- /dev/null +++ b/wenet/bin/export_onnx_gpu.py @@ -0,0 +1,1263 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import os +import sys + +import torch +import yaml +import logging + +import torch.nn.functional as F +from wenet.transformer.ctc import CTC +from wenet.transformer.decoder import TransformerDecoder +from wenet.transformer.encoder import BaseEncoder +from wenet.utils.init_model import init_model +from wenet.utils.mask import make_pad_mask + +try: + import onnxruntime +except ImportError: + print("Please install onnxruntime-gpu!") + sys.exit(1) + +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +class Encoder(torch.nn.Module): + + def __init__(self, encoder: BaseEncoder, ctc: CTC, beam_size: int = 10): + super().__init__() + self.encoder = encoder + self.ctc = ctc + self.beam_size = beam_size + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + ): + """Encoder + Args: + speech: (Batch, Length, ...) 
+ speech_lengths: (Batch, ) + Returns: + encoder_out: B x T x F + encoder_out_lens: B + ctc_log_probs: B x T x V + beam_log_probs: B x T x beam_size + beam_log_probs_idx: B x T x beam_size + """ + encoder_out, encoder_mask = self.encoder(speech, speech_lengths, -1, + -1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + ctc_log_probs = self.ctc.log_softmax(encoder_out) + encoder_out_lens = encoder_out_lens.int() + beam_log_probs, beam_log_probs_idx = torch.topk(ctc_log_probs, + self.beam_size, + dim=2) + return ( + encoder_out, + encoder_out_lens, + ctc_log_probs, + beam_log_probs, + beam_log_probs_idx, + ) + + +class StreamingEncoder(torch.nn.Module): + + def __init__( + self, + model, + required_cache_size, + beam_size, + transformer=False, + return_ctc_logprobs=False, + ): + super().__init__() + self.ctc = model.ctc + self.subsampling_rate = model.encoder.embed.subsampling_rate + self.embed = model.encoder.embed + self.global_cmvn = model.encoder.global_cmvn + self.required_cache_size = required_cache_size + self.beam_size = beam_size + self.encoder = model.encoder + self.transformer = transformer + self.return_ctc_logprobs = return_ctc_logprobs + + def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache, + cache_mask): + """Streaming Encoder + Args: + xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + offset (torch.Tensor): offset with shape (b, 1) + 1 is retained for triton deployment + required_cache_size (int): cache size required for next chunk + compuation + > 0: actual cache size + <= 0: not allowed in streaming gpu encoder ` + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (b, elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * num_decoding_left_chunks`. + cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (b, elayers, b, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) + in a batch of request, each request may have different + history cache. Cache mask is used to indidate the effective + cache for each request + Returns: + torch.Tensor: log probabilities of ctc output and cutoff by beam size + with shape (b, chunk_size, beam) + torch.Tensor: index of top beam size probabilities for each timestep + with shape (b, chunk_size, beam) + torch.Tensor: output of current input xs, + with shape (b, chunk_size, hidden-dim). + torch.Tensor: new attention cache required for next chunk, with + same shape (b, elayers, head, cache_t1, d_k * 2) + as the original att_cache + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. 
+ torch.Tensor: new cache mask, with same shape as the original + cache mask + """ + offset = offset.squeeze(1) + T = chunk_xs.size(1) + chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) + # B X 1 X T + chunk_mask = chunk_mask.to(chunk_xs.dtype) + # transpose batch & num_layers dim + att_cache = torch.transpose(att_cache, 0, 1) + cnn_cache = torch.transpose(cnn_cache, 0, 1) + + # rewrite encoder.forward_chunk + # <---------forward_chunk START---------> + xs = self.global_cmvn(chunk_xs) + # chunk mask is important for batch inferencing since + # different sequence in a batch has different length + xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) + cache_size = att_cache.size(3) # required cache size + masks = torch.cat((cache_mask, chunk_mask), dim=2) + index = offset - cache_size + + pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) + pos_emb = pos_emb.to(dtype=xs.dtype) + + next_cache_start = -self.required_cache_size + r_cache_mask = masks[:, :, next_cache_start:] + + r_att_cache = [] + r_cnn_cache = [] + for i, layer in enumerate(self.encoder.encoders): + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + masks, + pos_emb, + att_cache=att_cache[i], + cnn_cache=cnn_cache[i], + ) + # shape(new_att_cache) is (B, head, attention_key_size, d_k * 2), + # shape(new_cnn_cache) is (B, hidden-dim, cache_t2) + r_att_cache.append( + new_att_cache[:, :, next_cache_start:, :].unsqueeze(1)) + if not self.transformer: + r_cnn_cache.append(new_cnn_cache.unsqueeze(1)) + if self.encoder.normalize_before: + chunk_out = self.encoder.after_norm(xs) + else: + chunk_out = xs + + r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx + if not self.transformer: + r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers + + # <---------forward_chunk END---------> + + log_ctc_probs = self.ctc.log_softmax(chunk_out) + log_probs, log_probs_idx = torch.topk(log_ctc_probs, + self.beam_size, + dim=2) + log_probs = log_probs.to(chunk_xs.dtype) + + r_offset = offset + chunk_out.shape[1] + # the below ops not supported in Tensorrt + # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, + # rounding_mode='floor') + chunk_out_lens = chunk_lens // self.subsampling_rate + r_offset = r_offset.unsqueeze(1) + if self.return_ctc_logprobs: + return ( + log_ctc_probs, + chunk_out, + chunk_out_lens, + r_offset, + r_att_cache, + r_cnn_cache, + r_cache_mask, + ) + else: + return ( + log_probs, + log_probs_idx, + chunk_out, + chunk_out_lens, + r_offset, + r_att_cache, + r_cnn_cache, + r_cache_mask, + ) + + +class StreamingSqueezeformerEncoder(torch.nn.Module): + + def __init__(self, model, required_cache_size, beam_size): + super().__init__() + self.ctc = model.ctc + self.subsampling_rate = model.encoder.embed.subsampling_rate + self.embed = model.encoder.embed + self.global_cmvn = model.encoder.global_cmvn + self.required_cache_size = required_cache_size + self.beam_size = beam_size + self.encoder = model.encoder + self.reduce_idx = model.encoder.reduce_idx + self.recover_idx = model.encoder.recover_idx + if self.reduce_idx is None: + self.time_reduce = None + else: + if self.recover_idx is None: + self.time_reduce = "normal" # no recovery at the end + else: + self.time_reduce = "recover" # recovery at the end + assert len(self.reduce_idx) == len(self.recover_idx) + + def calculate_downsampling_factor(self, i: int) -> int: + if self.reduce_idx is None: + return 1 + else: + reduce_exp, recover_exp = 0, 0 + for exp, rd_idx in enumerate(self.reduce_idx): + if i >= rd_idx: + 
reduce_exp = exp + 1 + if self.recover_idx is not None: + for exp, rc_idx in enumerate(self.recover_idx): + if i >= rc_idx: + recover_exp = exp + 1 + return int(2**(reduce_exp - recover_exp)) + + def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache, + cache_mask): + """Streaming Encoder + Args: + xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + offset (torch.Tensor): offset with shape (b, 1) + 1 is retained for triton deployment + required_cache_size (int): cache size required for next chunk + compuation + > 0: actual cache size + <= 0: not allowed in streaming gpu encoder ` + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (b, elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * num_decoding_left_chunks`. + cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (b, elayers, b, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) + in a batch of request, each request may have different + history cache. Cache mask is used to indidate the effective + cache for each request + Returns: + torch.Tensor: log probabilities of ctc output and cutoff by beam size + with shape (b, chunk_size, beam) + torch.Tensor: index of top beam size probabilities for each timestep + with shape (b, chunk_size, beam) + torch.Tensor: output of current input xs, + with shape (b, chunk_size, hidden-dim). + torch.Tensor: new attention cache required for next chunk, with + same shape (b, elayers, head, cache_t1, d_k * 2) + as the original att_cache + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. 
+ torch.Tensor: new cache mask, with same shape as the original + cache mask + """ + offset = offset.squeeze(1) + T = chunk_xs.size(1) + chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) + # B X 1 X T + chunk_mask = chunk_mask.to(chunk_xs.dtype) + # transpose batch & num_layers dim + att_cache = torch.transpose(att_cache, 0, 1) + cnn_cache = torch.transpose(cnn_cache, 0, 1) + + # rewrite encoder.forward_chunk + # <---------forward_chunk START---------> + xs = self.global_cmvn(chunk_xs) + # chunk mask is important for batch inferencing since + # different sequence in a batch has different length + xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) + elayers, cache_size = att_cache.size(0), att_cache.size(3) + att_mask = torch.cat((cache_mask, chunk_mask), dim=2) + index = offset - cache_size + + pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) + pos_emb = pos_emb.to(dtype=xs.dtype) + + next_cache_start = -self.required_cache_size + r_cache_mask = att_mask[:, :, next_cache_start:] + + r_att_cache = [] + r_cnn_cache = [] + mask_pad = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + mask_pad = mask_pad.unsqueeze(1) + max_att_len: int = 0 + recover_activations: List[Tuple[torch.Tensor, torch.Tensor, + torch.Tensor, torch.Tensor]] = [] + index = 0 + xs_lens = torch.tensor([xs.size(1)], device=xs.device, dtype=torch.int) + xs = self.encoder.preln(xs) + for i, layer in enumerate(self.encoder.encoders): + if self.reduce_idx is not None: + if self.time_reduce is not None and i in self.reduce_idx: + recover_activations.append( + (xs, att_mask, pos_emb, mask_pad)) + ( + xs, + xs_lens, + att_mask, + mask_pad, + ) = self.encoder.time_reduction_layer( + xs, xs_lens, att_mask, mask_pad) + pos_emb = pos_emb[:, ::2, :] + if self.encoder.pos_enc_layer_type == "rel_pos_repaired": + pos_emb = pos_emb[:, :xs.size(1) * 2 - 1, :] + index += 1 + + if self.recover_idx is not None: + if self.time_reduce == "recover" and i in self.recover_idx: + index -= 1 + ( + recover_tensor, + recover_att_mask, + recover_pos_emb, + recover_mask_pad, + ) = recover_activations[index] + # recover output length for ctc decode + xs = xs.unsqueeze(2).repeat(1, 1, 2, 1).flatten(1, 2) + xs = self.encoder.time_recover_layer(xs) + recoverd_t = recover_tensor.size(1) + xs = recover_tensor + xs[:, :recoverd_t, :].contiguous() + att_mask = recover_att_mask + pos_emb = recover_pos_emb + mask_pad = recover_mask_pad + + factor = self.calculate_downsampling_factor(i) + + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + att_mask, + pos_emb, + att_cache=att_cache[i][:, :, ::factor, :] + [:, :, :pos_emb.size(1) - xs.size(1), :] + if elayers > 0 else att_cache[:, :, ::factor, :], + cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache, + ) + cached_att = new_att_cache[:, :, next_cache_start // factor:, :] + cached_cnn = new_cnn_cache.unsqueeze(1) + cached_att = (cached_att.unsqueeze(3).repeat(1, 1, 1, factor, + 1).flatten(2, 3)) + if i == 0: + # record length for the first block as max length + max_att_len = cached_att.size(2) + r_att_cache.append(cached_att[:, :, :max_att_len, :].unsqueeze(1)) + r_cnn_cache.append(cached_cnn) + + chunk_out = xs + r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx + r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers + + # <---------forward_chunk END---------> + + log_ctc_probs = self.ctc.log_softmax(chunk_out) + log_probs, log_probs_idx = torch.topk(log_ctc_probs, + self.beam_size, + dim=2) + log_probs = 
log_probs.to(chunk_xs.dtype) + + r_offset = offset + chunk_out.shape[1] + # the below ops not supported in Tensorrt + # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, + # rounding_mode='floor') + chunk_out_lens = chunk_lens // self.subsampling_rate + r_offset = r_offset.unsqueeze(1) + + return ( + log_probs, + log_probs_idx, + chunk_out, + chunk_out_lens, + r_offset, + r_att_cache, + r_cnn_cache, + r_cache_mask, + ) + + +class StreamingEfficientConformerEncoder(torch.nn.Module): + + def __init__(self, model, required_cache_size, beam_size): + super().__init__() + self.ctc = model.ctc + self.subsampling_rate = model.encoder.embed.subsampling_rate + self.embed = model.encoder.embed + self.global_cmvn = model.encoder.global_cmvn + self.required_cache_size = required_cache_size + self.beam_size = beam_size + self.encoder = model.encoder + + # Efficient Conformer + self.stride_layer_idx = model.encoder.stride_layer_idx + self.stride = model.encoder.stride + self.num_blocks = model.encoder.num_blocks + self.cnn_module_kernel = model.encoder.cnn_module_kernel + + def calculate_downsampling_factor(self, i: int) -> int: + factor = 1 + for idx, stride_idx in enumerate(self.stride_layer_idx): + if i > stride_idx: + factor *= self.stride[idx] + return factor + + def forward(self, chunk_xs, chunk_lens, offset, att_cache, cnn_cache, + cache_mask): + """Streaming Encoder + Args: + chunk_xs (torch.Tensor): chunk input, with shape (b, time, mel-dim), + where `time == (chunk_size - 1) * subsample_rate + \ + subsample.right_context + 1` + chunk_lens (torch.Tensor): + offset (torch.Tensor): offset with shape (b, 1) + 1 is retained for triton deployment + att_cache (torch.Tensor): cache tensor for KEY & VALUE in + transformer/conformer attention, with shape + (b, elayers, head, cache_t1, d_k * 2), where + `head * d_k == hidden-dim` and + `cache_t1 == chunk_size * num_decoding_left_chunks`. + cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, + (b, elayers, hidden-dim, cache_t2), where + `cache_t2 == cnn.lorder - 1` + cache_mask: (torch.Tensor): cache mask with shape (b, required_cache_size) + in a batch of request, each request may have different + history cache. Cache mask is used to indidate the effective + cache for each request + Returns: + torch.Tensor: log probabilities of ctc output and cutoff by beam size + with shape (b, chunk_size, beam) + torch.Tensor: index of top beam size probabilities for each timestep + with shape (b, chunk_size, beam) + torch.Tensor: output of current input xs, + with shape (b, chunk_size, hidden-dim). + torch.Tensor: new attention cache required for next chunk, with + same shape (b, elayers, head, cache_t1, d_k * 2) + as the original att_cache + torch.Tensor: new conformer cnn cache required for next chunk, with + same shape as the original cnn_cache. 
+ torch.Tensor: new cache mask, with same shape as the original + cache mask + """ + offset = offset.squeeze(1) # (b, ) + offset *= self.calculate_downsampling_factor(self.num_blocks + 1) + + T = chunk_xs.size(1) + chunk_mask = ~make_pad_mask(chunk_lens, T).unsqueeze(1) # (b, 1, T) + # B X 1 X T + chunk_mask = chunk_mask.to(chunk_xs.dtype) + # transpose batch & num_layers dim + # Shape(att_cache): (elayers, b, head, cache_t1, d_k * 2) + # Shape(cnn_cache): (elayers, b, outsize, cnn_kernel) + att_cache = torch.transpose(att_cache, 0, 1) + cnn_cache = torch.transpose(cnn_cache, 0, 1) + + # rewrite encoder.forward_chunk + # <---------forward_chunk START---------> + xs = self.global_cmvn(chunk_xs) + # chunk mask is important for batch inferencing since + # different sequence in a batch has different length + xs, pos_emb, chunk_mask = self.embed(xs, chunk_mask, offset) + cache_size = att_cache.size(3) # required cache size + masks = torch.cat((cache_mask, chunk_mask), dim=2) + att_mask = torch.cat((cache_mask, chunk_mask), dim=2) + index = offset - cache_size + + pos_emb = self.embed.position_encoding(index, cache_size + xs.size(1)) + pos_emb = pos_emb.to(dtype=xs.dtype) + + next_cache_start = -self.required_cache_size + r_cache_mask = masks[:, :, next_cache_start:] + + r_att_cache = [] + r_cnn_cache = [] + mask_pad = chunk_mask.to(torch.bool) + max_att_len, max_cnn_len = ( + 0, + 0, + ) # for repeat_interleave of new_att_cache + for i, layer in enumerate(self.encoder.encoders): + factor = self.calculate_downsampling_factor(i) + # NOTE(xcsong): Before layer.forward + # shape(att_cache[i:i + 1]) is (b, head, cache_t1, d_k * 2), + # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) + # shape(new_att_cache) = [ batch, head, time2, outdim//head * 2 ] + att_cache_trunc = 0 + if xs.size(1) + att_cache.size(3) / factor > pos_emb.size(1): + # The time step is not divisible by the downsampling multiple + # We propose to double the chunk_size. + att_cache_trunc = (xs.size(1) + att_cache.size(3) // factor - + pos_emb.size(1) + 1) + xs, _, new_att_cache, new_cnn_cache = layer( + xs, + att_mask, + pos_emb, + mask_pad=mask_pad, + att_cache=att_cache[i][:, :, ::factor, :][:, :, + att_cache_trunc:, :], + cnn_cache=cnn_cache[i, :, :, :] + if cnn_cache.size(0) > 0 else cnn_cache, + ) + + if i in self.stride_layer_idx: + # compute time dimension for next block + efficient_index = self.stride_layer_idx.index(i) + att_mask = att_mask[:, ::self.stride[efficient_index], ::self. + stride[efficient_index], ] + mask_pad = mask_pad[:, ::self.stride[efficient_index], ::self. 
+ stride[efficient_index], ] + pos_emb = pos_emb[:, ::self.stride[efficient_index], :] + + # shape(new_att_cache) = [batch, head, time2, outdim] + new_att_cache = new_att_cache[:, :, next_cache_start // factor:, :] + # shape(new_cnn_cache) = [batch, 1, outdim, cache_t2] + new_cnn_cache = new_cnn_cache.unsqueeze(1) # shape(1):layerID + + # use repeat_interleave to new_att_cache + # new_att_cache = new_att_cache.repeat_interleave(repeats=factor, dim=2) + new_att_cache = (new_att_cache.unsqueeze(3).repeat( + 1, 1, 1, factor, 1).flatten(2, 3)) + # padding new_cnn_cache to cnn.lorder for casual convolution + new_cnn_cache = F.pad( + new_cnn_cache, + (self.cnn_module_kernel - 1 - new_cnn_cache.size(3), 0), + ) + + if i == 0: + # record length for the first block as max length + max_att_len = new_att_cache.size(2) + max_cnn_len = new_cnn_cache.size(3) + + # update real shape of att_cache and cnn_cache + r_att_cache.append(new_att_cache[:, :, + -max_att_len:, :].unsqueeze(1)) + r_cnn_cache.append(new_cnn_cache[:, :, :, -max_cnn_len:]) + + if self.encoder.normalize_before: + chunk_out = self.encoder.after_norm(xs) + else: + chunk_out = xs + + # shape of r_att_cache: (b, elayers, head, time2, outdim) + r_att_cache = torch.cat(r_att_cache, dim=1) # concat on layers idx + # shape of r_cnn_cache: (b, elayers, outdim, cache_t2) + r_cnn_cache = torch.cat(r_cnn_cache, dim=1) # concat on layers + + # <---------forward_chunk END---------> + + log_ctc_probs = self.ctc.log_softmax(chunk_out) + log_probs, log_probs_idx = torch.topk(log_ctc_probs, + self.beam_size, + dim=2) + log_probs = log_probs.to(chunk_xs.dtype) + + r_offset = offset + chunk_out.shape[1] + # the below ops not supported in Tensorrt + # chunk_out_lens = torch.div(chunk_lens, subsampling_rate, + # rounding_mode='floor') + chunk_out_lens = ( + chunk_lens // self.subsampling_rate // + self.calculate_downsampling_factor(self.num_blocks + 1)) + chunk_out_lens += 1 + r_offset = r_offset.unsqueeze(1) + + return ( + log_probs, + log_probs_idx, + chunk_out, + chunk_out_lens, + r_offset, + r_att_cache, + r_cnn_cache, + r_cache_mask, + ) + + +class Decoder(torch.nn.Module): + + def __init__( + self, + decoder: TransformerDecoder, + ctc_weight: float = 0.5, + reverse_weight: float = 0.0, + beam_size: int = 10, + decoder_fastertransformer: bool = False, + ): + super().__init__() + self.decoder = decoder + self.ctc_weight = ctc_weight + self.reverse_weight = reverse_weight + self.beam_size = beam_size + self.decoder_fastertransformer = decoder_fastertransformer + + def forward( + self, + encoder_out: torch.Tensor, + encoder_lens: torch.Tensor, + hyps_pad_sos_eos: torch.Tensor, + hyps_lens_sos: torch.Tensor, + r_hyps_pad_sos_eos: torch.Tensor, + ctc_score: torch.Tensor, + ): + """Encoder + Args: + encoder_out: B x T x F + encoder_lens: B + hyps_pad_sos_eos: B x beam x (T2+1), + hyps with sos & eos and padded by ignore id + hyps_lens_sos: B x beam, length for each hyp with sos + r_hyps_pad_sos_eos: B x beam x (T2+1), + reversed hyps with sos & eos and padded by ignore id + ctc_score: B x beam, ctc score for each hyp + Returns: + decoder_out: B x beam x T2 x V + r_decoder_out: B x beam x T2 x V + best_index: B + """ + B, T, F = encoder_out.shape + bz = self.beam_size + B2 = B * bz + encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F) + encoder_mask = ~make_pad_mask(encoder_lens, T).unsqueeze(1) + encoder_mask = encoder_mask.repeat(1, bz, 1).view(B2, 1, T) + T2 = hyps_pad_sos_eos.shape[2] - 1 + hyps_pad = hyps_pad_sos_eos.view(B2, T2 + 1) + hyps_lens = 
hyps_lens_sos.view(B2, ) + hyps_pad_sos = hyps_pad[:, :-1].contiguous() + hyps_pad_eos = hyps_pad[:, 1:].contiguous() + + r_hyps_pad = r_hyps_pad_sos_eos.view(B2, T2 + 1) + r_hyps_pad_sos = r_hyps_pad[:, :-1].contiguous() + r_hyps_pad_eos = r_hyps_pad[:, 1:].contiguous() + + decoder_out, r_decoder_out, _ = self.decoder( + encoder_out, + encoder_mask, + hyps_pad_sos, + hyps_lens, + r_hyps_pad_sos, + self.reverse_weight, + ) + decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) + V = decoder_out.shape[-1] + decoder_out = decoder_out.view(B2, T2, V) + mask = ~make_pad_mask(hyps_lens, T2) # B2 x T2 + # mask index, remove ignore id + index = torch.unsqueeze(hyps_pad_eos * mask, 2) + score = decoder_out.gather(2, index).squeeze(2) # B2 X T2 + # mask padded part + score = score * mask + decoder_out = decoder_out.view(B, bz, T2, V) + if self.reverse_weight > 0: + r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, + dim=-1) + r_decoder_out = r_decoder_out.view(B2, T2, V) + index = torch.unsqueeze(r_hyps_pad_eos * mask, 2) + r_score = r_decoder_out.gather(2, index).squeeze(2) + r_score = r_score * mask + score = (score * (1 - self.reverse_weight) + + self.reverse_weight * r_score) + r_decoder_out = r_decoder_out.view(B, bz, T2, V) + score = torch.sum(score, axis=1) # B2 + score = torch.reshape(score, (B, bz)) + self.ctc_weight * ctc_score + best_index = torch.argmax(score, dim=1) + if self.decoder_fastertransformer: + return decoder_out, best_index + else: + return best_index + + +def to_numpy(tensors): + out = [] + if type(tensors) == torch.tensor: + tensors = [tensors] + for tensor in tensors: + if tensor.requires_grad: + tensor = tensor.detach().cpu().numpy() + else: + tensor = tensor.cpu().numpy() + out.append(tensor) + return out + + +def test(xlist, blist, rtol=1e-3, atol=1e-5, tolerate_small_mismatch=True): + for a, b in zip(xlist, blist): + try: + torch.testing.assert_allclose(a, b, rtol=rtol, atol=atol) + except AssertionError as error: + if tolerate_small_mismatch: + print(error) + else: + raise + + +def export_offline_encoder(model, configs, args, logger, encoder_onnx_path): + bz = 32 + seq_len = 100 + beam_size = args.beam_size + feature_size = configs["input_dim"] + + speech = torch.randn(bz, seq_len, feature_size, dtype=torch.float32) + speech_lens = torch.randint(low=10, + high=seq_len, + size=(bz, ), + dtype=torch.int32) + encoder = Encoder(model.encoder, model.ctc, beam_size) + encoder.eval() + + torch.onnx.export( + encoder, + (speech, speech_lens), + encoder_onnx_path, + export_params=True, + opset_version=13, + do_constant_folding=True, + input_names=["speech", "speech_lengths"], + output_names=[ + "encoder_out", + "encoder_out_lens", + "ctc_log_probs", + "beam_log_probs", + "beam_log_probs_idx", + ], + dynamic_axes={ + "speech": { + 0: "B", + 1: "T" + }, + "speech_lengths": { + 0: "B" + }, + "encoder_out": { + 0: "B", + 1: "T_OUT" + }, + "encoder_out_lens": { + 0: "B" + }, + "ctc_log_probs": { + 0: "B", + 1: "T_OUT" + }, + "beam_log_probs": { + 0: "B", + 1: "T_OUT" + }, + "beam_log_probs_idx": { + 0: "B", + 1: "T_OUT" + }, + }, + verbose=False, + ) + + with torch.no_grad(): + o0, o1, o2, o3, o4 = encoder(speech, speech_lens) + + providers = ["CUDAExecutionProvider"] + ort_session = onnxruntime.InferenceSession(encoder_onnx_path, + providers=providers) + ort_inputs = { + "speech": to_numpy(speech), + "speech_lengths": to_numpy(speech_lens), + } + ort_outs = ort_session.run(None, ort_inputs) + + # check encoder output + test(to_numpy([o0, o1, o2, o3, 
o4]), ort_outs) + logger.info("export offline onnx encoder succeed!") + onnx_config = { + "beam_size": args.beam_size, + "reverse_weight": args.reverse_weight, + "ctc_weight": args.ctc_weight, + "fp16": args.fp16, + } + return onnx_config + + +def export_online_encoder(model, configs, args, logger, encoder_onnx_path): + decoding_chunk_size = args.decoding_chunk_size + subsampling = model.encoder.embed.subsampling_rate + context = model.encoder.embed.right_context + 1 + decoding_window = (decoding_chunk_size - 1) * subsampling + context + batch_size = 32 + audio_len = decoding_window + feature_size = configs["input_dim"] + output_size = configs["encoder_conf"]["output_size"] + num_layers = configs["encoder_conf"]["num_blocks"] + # in transformer the cnn module will not be available + transformer = False + cnn_module_kernel = configs["encoder_conf"].get("cnn_module_kernel", 1) - 1 + if not cnn_module_kernel: + transformer = True + num_decoding_left_chunks = args.num_decoding_left_chunks + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + if configs["encoder"] == "squeezeformer": + encoder = StreamingSqueezeformerEncoder(model, required_cache_size, + args.beam_size) + elif configs["encoder"] == "efficientConformer": + encoder = StreamingEfficientConformerEncoder(model, + required_cache_size, + args.beam_size) + else: + encoder = StreamingEncoder( + model, + required_cache_size, + args.beam_size, + transformer, + args.return_ctc_logprobs, + ) + encoder.eval() + + # begin to export encoder + chunk_xs = torch.randn(batch_size, + audio_len, + feature_size, + dtype=torch.float32) + chunk_lens = torch.ones(batch_size, dtype=torch.int32) * audio_len + + offset = torch.arange(0, batch_size).unsqueeze(1) + # (elayers, b, head, cache_t1, d_k * 2) + head = configs["encoder_conf"]["attention_heads"] + d_k = configs["encoder_conf"]["output_size"] // head + att_cache = torch.randn( + batch_size, + num_layers, + head, + required_cache_size, + d_k * 2, + dtype=torch.float32, + ) + cnn_cache = torch.randn( + batch_size, + num_layers, + output_size, + cnn_module_kernel, + dtype=torch.float32, + ) + + cache_mask = torch.ones(batch_size, + 1, + required_cache_size, + dtype=torch.float32) + input_names = [ + "chunk_xs", + "chunk_lens", + "offset", + "att_cache", + "cnn_cache", + "cache_mask", + ] + output_names = [ + "log_probs", + "log_probs_idx", + "chunk_out", + "chunk_out_lens", + "r_offset", + "r_att_cache", + "r_cnn_cache", + "r_cache_mask", + ] + if args.return_ctc_logprobs: + output_names = [ + "ctc_log_probs", + "chunk_out", + "chunk_out_lens", + "r_offset", + "r_att_cache", + "r_cnn_cache", + "r_cache_mask", + ] + input_tensors = ( + chunk_xs, + chunk_lens, + offset, + att_cache, + cnn_cache, + cache_mask, + ) + if transformer: + assert (args.return_ctc_logprobs is + False), "return_ctc_logprobs is not supported in transformer" + output_names.pop(6) + + all_names = input_names + output_names + dynamic_axes = {} + for name in all_names: + # only the first dimension is dynamic + # all other dimension is fixed + dynamic_axes[name] = {0: "B"} + + torch.onnx.export( + encoder, + input_tensors, + encoder_onnx_path, + export_params=True, + opset_version=14, + do_constant_folding=True, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + verbose=False, + ) + + with torch.no_grad(): + torch_outs = encoder(chunk_xs, chunk_lens, offset, att_cache, + cnn_cache, cache_mask) + if transformer: + torch_outs = list(torch_outs).pop(6) + ort_session = 
onnxruntime.InferenceSession( + encoder_onnx_path, providers=["CUDAExecutionProvider"]) + ort_inputs = {} + + input_tensors = to_numpy(input_tensors) + for idx, name in enumerate(input_names): + ort_inputs[name] = input_tensors[idx] + if transformer: + del ort_inputs["cnn_cache"] + ort_outs = ort_session.run(None, ort_inputs) + test(to_numpy(torch_outs), ort_outs, rtol=1e-03, atol=1e-05) + logger.info("export to onnx streaming encoder succeed!") + onnx_config = { + "subsampling_rate": subsampling, + "context": context, + "decoding_chunk_size": decoding_chunk_size, + "num_decoding_left_chunks": num_decoding_left_chunks, + "beam_size": args.beam_size, + "fp16": args.fp16, + "feat_size": feature_size, + "decoding_window": decoding_window, + "cnn_module_kernel_cache": cnn_module_kernel, + "return_ctc_logprobs": args.return_ctc_logprobs, + } + return onnx_config + + +def export_rescoring_decoder(model, configs, args, logger, decoder_onnx_path, + decoder_fastertransformer): + bz, seq_len = 32, 100 + beam_size = args.beam_size + decoder = Decoder( + model.decoder, + model.ctc_weight, + model.reverse_weight, + beam_size, + decoder_fastertransformer, + ) + decoder.eval() + + hyps_pad_sos_eos = torch.randint(low=3, + high=1000, + size=(bz, beam_size, seq_len)) + hyps_lens_sos = torch.randint(low=3, + high=seq_len, + size=(bz, beam_size), + dtype=torch.int32) + r_hyps_pad_sos_eos = torch.randint(low=3, + high=1000, + size=(bz, beam_size, seq_len)) + + output_size = configs["encoder_conf"]["output_size"] + encoder_out = torch.randn(bz, seq_len, output_size, dtype=torch.float32) + encoder_out_lens = torch.randint(low=3, + high=seq_len, + size=(bz, ), + dtype=torch.int32) + ctc_score = torch.randn(bz, beam_size, dtype=torch.float32) + + input_names = [ + "encoder_out", + "encoder_out_lens", + "hyps_pad_sos_eos", + "hyps_lens_sos", + "r_hyps_pad_sos_eos", + "ctc_score", + ] + output_names = ["best_index"] + if decoder_fastertransformer: + output_names.insert(0, "decoder_out") + + torch.onnx.export( + decoder, + ( + encoder_out, + encoder_out_lens, + hyps_pad_sos_eos, + hyps_lens_sos, + r_hyps_pad_sos_eos, + ctc_score, + ), + decoder_onnx_path, + export_params=True, + opset_version=13, + do_constant_folding=True, + input_names=input_names, + output_names=output_names, + dynamic_axes={ + "encoder_out": { + 0: "B", + 1: "T" + }, + "encoder_out_lens": { + 0: "B" + }, + "hyps_pad_sos_eos": { + 0: "B", + 2: "T2" + }, + "hyps_lens_sos": { + 0: "B" + }, + "r_hyps_pad_sos_eos": { + 0: "B", + 2: "T2" + }, + "ctc_score": { + 0: "B" + }, + "best_index": { + 0: "B" + }, + }, + verbose=False, + ) + with torch.no_grad(): + o0 = decoder( + encoder_out, + encoder_out_lens, + hyps_pad_sos_eos, + hyps_lens_sos, + r_hyps_pad_sos_eos, + ctc_score, + ) + providers = ["CUDAExecutionProvider"] + ort_session = onnxruntime.InferenceSession(decoder_onnx_path, + providers=providers) + + input_tensors = [ + encoder_out, + encoder_out_lens, + hyps_pad_sos_eos, + hyps_lens_sos, + r_hyps_pad_sos_eos, + ctc_score, + ] + ort_inputs = {} + input_tensors = to_numpy(input_tensors) + for idx, name in enumerate(input_names): + ort_inputs[name] = input_tensors[idx] + + # if model.reverse weight == 0, + # the r_hyps_pad will be removed + # from the onnx decoder since it doen't play any role + if model.reverse_weight == 0: + del ort_inputs["r_hyps_pad_sos_eos"] + ort_outs = ort_session.run(None, ort_inputs) + + # check decoder output + if decoder_fastertransformer: + test(to_numpy(o0), ort_outs, rtol=1e-03, atol=1e-05) + else: + 
test(to_numpy([o0]), ort_outs, rtol=1e-03, atol=1e-05) + logger.info("export to onnx decoder succeed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="export x86_gpu model") + parser.add_argument("--config", required=True, help="config file") + parser.add_argument("--checkpoint", required=True, help="checkpoint model") + parser.add_argument( + "--cmvn_file", + required=False, + default="", + type=str, + help="global_cmvn file, default path is in config file", + ) + parser.add_argument( + "--reverse_weight", + default=-1.0, + type=float, + required=False, + help="reverse weight for bitransformer," + + "default value is in config file", + ) + parser.add_argument( + "--ctc_weight", + default=-1.0, + type=float, + required=False, + help="ctc weight, default value is in config file", + ) + parser.add_argument( + "--beam_size", + default=10, + type=int, + required=False, + help="beam size would be ctc output size", + ) + parser.add_argument( + "--output_onnx_dir", + default="onnx_model", + help="output onnx encoder and decoder directory", + ) + parser.add_argument( + "--fp16", + action="store_true", + help="whether to export fp16 model, default false", + ) + # arguments for streaming encoder + parser.add_argument( + "--streaming", + action="store_true", + help="whether to export streaming encoder, default false", + ) + parser.add_argument( + "--decoding_chunk_size", + default=16, + type=int, + required=False, + help="the decoding chunk size, <=0 is not supported", + ) + parser.add_argument( + "--num_decoding_left_chunks", + default=5, + type=int, + required=False, + help="number of left chunks, <= 0 is not supported", + ) + parser.add_argument( + "--decoder_fastertransformer", + action="store_true", + help="return decoder_out and best_index for ft", + ) + parser.add_argument( + "--return_ctc_logprobs", + action="store_true", + help="return full ctc_log_probs for TLG streaming encoder", + ) + args = parser.parse_args() + + torch.manual_seed(0) + torch.set_printoptions(precision=10) + + with open(args.config, "r") as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + if args.cmvn_file and os.path.exists(args.cmvn_file): + if 'cmvn' not in configs: + configs['cmvn'] = "global_cmvn" + configs['cmvn_conf'] = {} + else: + assert configs['cmvn'] == "global_cmvn" + assert configs['cmvn_conf'] is not None + configs['cmvn_conf']["cmvn_file"] = args.cmvn_file + if (args.reverse_weight != -1.0 + and "reverse_weight" in configs["model_conf"]): + configs["model_conf"]["reverse_weight"] = args.reverse_weight + print("Update reverse weight to", args.reverse_weight) + if args.ctc_weight != -1: + print("Update ctc weight to ", args.ctc_weight) + configs["model_conf"]["ctc_weight"] = args.ctc_weight + configs["encoder_conf"]["use_dynamic_chunk"] = False + + model, configs = init_model(args, configs) + model.eval() + + if not os.path.exists(args.output_onnx_dir): + os.mkdir(args.output_onnx_dir) + encoder_onnx_path = os.path.join(args.output_onnx_dir, "encoder.onnx") + export_enc_func = None + if args.streaming: + assert args.decoding_chunk_size > 0 + assert args.num_decoding_left_chunks > 0 + export_enc_func = export_online_encoder + else: + export_enc_func = export_offline_encoder + + onnx_config = export_enc_func(model, configs, args, logger, + encoder_onnx_path) + + decoder_onnx_path = os.path.join(args.output_onnx_dir, "decoder.onnx") + export_rescoring_decoder( + model, + configs, + args, + logger, + decoder_onnx_path, + args.decoder_fastertransformer, + ) + + if 
args.fp16: + try: + import onnxmltools + from onnxmltools.utils.float16_converter import ( + convert_float_to_float16, ) + except ImportError: + print("Please install onnxmltools!") + sys.exit(1) + encoder_onnx_model = onnxmltools.utils.load_model(encoder_onnx_path) + encoder_onnx_model = convert_float_to_float16(encoder_onnx_model) + encoder_onnx_path = os.path.join(args.output_onnx_dir, + "encoder_fp16.onnx") + onnxmltools.utils.save_model(encoder_onnx_model, encoder_onnx_path) + decoder_onnx_model = onnxmltools.utils.load_model(decoder_onnx_path) + decoder_onnx_model = convert_float_to_float16(decoder_onnx_model) + decoder_onnx_path = os.path.join(args.output_onnx_dir, + "decoder_fp16.onnx") + onnxmltools.utils.save_model(decoder_onnx_model, decoder_onnx_path) + # dump configurations + + config_dir = os.path.join(args.output_onnx_dir, "config.yaml") + with open(config_dir, "w") as out: + yaml.dump(onnx_config, out) diff --git a/wenet/bin/recognize.py b/wenet/bin/recognize.py new file mode 100644 index 0000000000000000000000000000000000000000..1782b8ab125164fe9fd554069577d13d44e255e2 --- /dev/null +++ b/wenet/bin/recognize.py @@ -0,0 +1,336 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
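+"""Batch offline recognition with a trained WeNet model.
+
+The script decodes the test set with one or more decoding modes (see
+``--modes``) and writes one ``text`` file per mode under
+``<result_dir>/<mode>/``.
+
+Illustrative invocation (paths are placeholders, not part of this repo):
+
+    python wenet/bin/recognize.py \
+        --config exp/train.yaml \
+        --test_data data/test/data.list \
+        --checkpoint exp/final.pt \
+        --result_dir exp/decode \
+        --modes ctc_greedy_search attention_rescoring
+"""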
+ +from __future__ import print_function + +import argparse +import copy +import logging +import os + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import Dataset +from wenet.utils.config import override_config +from wenet.utils.init_model import init_model +from wenet.utils.init_tokenizer import init_tokenizer +from wenet.utils.context_graph import ContextGraph +from wenet.utils.ctc_utils import get_blank_id +from wenet.utils.common import TORCH_NPU_AVAILABLE # noqa just ensure to check torch-npu + + +def get_args(): + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--data_type', + default='raw', + # choices=['raw', 'shard'], + help='train and cv data type') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--device', + type=str, + default="cpu", + choices=["cpu", "npu", "cuda"], + help='accelerator to use') + parser.add_argument('--dtype', + type=str, + default='fp32', + choices=['fp16', 'fp32', 'bf16'], + help='model\'s dtype') + parser.add_argument('--num_workers', + default=0, + type=int, + help='num of subprocess workers for reading') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--length_penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--blank_penalty', + type=float, + default=0.0, + help='blank penalty') + parser.add_argument('--result_dir', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--modes', + nargs='+', + help="""decoding mode, support the following: + attention + ctc_greedy_search + ctc_prefix_beam_search + attention_rescoring + rnnt_greedy_search + rnnt_beam_search + rnnt_beam_attn_rescoring + ctc_beam_td_attn_rescoring + hlg_onebest + hlg_rescore + paraformer_greedy_search + paraformer_beam_search""") + parser.add_argument('--search_ctc_weight', + type=float, + default=1.0, + help='ctc weight for nbest generation') + parser.add_argument('--search_transducer_weight', + type=float, + default=0.0, + help='transducer weight for nbest generation') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for rescoring weight in \ + attention rescoring decode mode \ + ctc weight for rescoring weight in \ + transducer attention rescore decode mode') + + parser.add_argument('--transducer_weight', + type=float, + default=0.0, + help='transducer weight for rescoring weight in ' + 'transducer attention rescore mode') + parser.add_argument('--attn_weight', + type=float, + default=0.0, + help='attention weight for rescoring weight in ' + 'transducer attention rescore mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + parser.add_argument('--override_config', + action='append', + default=[], + help="override yaml config") + + parser.add_argument('--word', + default='', + type=str, + help='word file, only used for hlg decode') + parser.add_argument('--hlg', + default='', + type=str, + help='hlg file, only used for hlg decode') + parser.add_argument('--lm_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + parser.add_argument('--decoder_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + parser.add_argument('--r_decoder_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + + parser.add_argument( + '--context_bias_mode', + type=str, + default='', + help='''Context bias mode, selectable from the following + option: decoding-graph, deep-biasing''') + parser.add_argument('--context_list_path', + type=str, + default='', + help='Context list path') + parser.add_argument('--context_graph_score', + type=float, + default=0.0, + help='''The higher the score, the greater the degree of + bias using decoding-graph for biasing''') + + parser.add_argument('--use_lora', + type=bool, + default=False, + help='''Whether to use lora for biasing''') + parser.add_argument("--lora_ckpt_path", + default=None, + type=str, + help="lora checkpoint path.") + + parser.add_argument('--task', + type=str, + default='asr', + help='Context list path') + parser.add_argument('--lang', + type=str, + default='zh', + help='Context list path') + args = parser.parse_args() + print(args) + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + if args.gpu != -1: + # remain the original usage of gpu + args.device = "cuda" + if "cuda" in args.device: + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + if len(args.override_config) > 0: + configs = override_config(configs, args.override_config) + + test_conf = copy.deepcopy(configs['dataset_conf']) + + test_conf['filter_conf']['max_length'] = 102400 + test_conf['filter_conf']['min_length'] = 0 + test_conf['filter_conf']['token_max_length'] = 102400 + test_conf['filter_conf']['token_min_length'] = 0 + test_conf['filter_conf']['max_output_input_ratio'] = 102400 + test_conf['filter_conf']['min_output_input_ratio'] = 0 + test_conf['speed_perturb'] = False + test_conf['spec_aug'] = False + test_conf['spec_sub'] = False + test_conf['spec_trim'] = False + test_conf['shuffle'] = False + test_conf['sort'] = False + test_conf['cycle'] = 1 + test_conf['list_shuffle'] = False + if 'fbank_conf' in test_conf: + test_conf['fbank_conf']['dither'] = 0.0 + elif 'mfcc_conf' in test_conf: + test_conf['mfcc_conf']['dither'] = 0.0 + test_conf['batch_conf']['batch_type'] = "static" + test_conf['batch_conf']['batch_size'] = args.batch_size + + tokenizer = init_tokenizer(configs) + test_dataset = Dataset(args.data_type, + args.test_data, + tokenizer, + test_conf, + partition=False) + + test_data_loader = 
DataLoader(test_dataset, + batch_size=None, + num_workers=args.num_workers) + + # Init asr model from configs + args.jit = False + model, configs = init_model(args, configs) + + device = torch.device(args.device) + model = model.to(device) + model.eval() + dtype = torch.float32 + if args.dtype == 'fp16': + dtype = torch.float16 + elif args.dtype == 'bf16': + dtype = torch.bfloat16 + logging.info("compute dtype is {}".format(dtype)) + + context_graph = None + if 'decoding-graph' in args.context_bias_mode: + context_graph = ContextGraph(args.context_list_path, + tokenizer.symbol_table, + configs['tokenizer_conf']['bpe_path'], + args.context_graph_score) + + _, blank_id = get_blank_id(configs, tokenizer.symbol_table) + logging.info("blank_id is {}".format(blank_id)) + + # TODO(Dinghao Zhou): Support RNN-T related decoding + # TODO(Lv Xiang): Support k2 related decoding + # TODO(Kaixun Huang): Support context graph + files = {} + for mode in args.modes: + dir_name = os.path.join(args.result_dir, mode) + os.makedirs(dir_name, exist_ok=True) + file_name = os.path.join(dir_name, 'text') + files[mode] = open(file_name, 'w', encoding='utf-8') + max_format_len = max([len(mode) for mode in args.modes]) + + with torch.cuda.amp.autocast(enabled=True, + dtype=dtype, + cache_enabled=False): + with torch.no_grad(): + utt_num=0 + # logging.info(f'utt_num: {utt_num}') + for batch_idx, batch in enumerate(test_data_loader): + keys = batch["keys"] + feats = batch["feats"].to(device) + target = batch["target"].to(device) + feats_lengths = batch["feats_lengths"].to(device) + target_lengths = batch["target_lengths"].to(device) + batch_size = feats.size(0) + # task_list = ["transcribe" for i in range(batch_size)] + task_list = [args.task for i in range(batch_size)] + lang_list = [args.lang for i in range(batch_size)] + infos = {"tasks": task_list, "langs":lang_list} + results = model.decode( + args.modes, + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight, + context_graph=context_graph, + blank_id=blank_id, + blank_penalty=args.blank_penalty, + length_penalty=args.length_penalty, + infos=infos) + for i, key in enumerate(keys): + utt_num += 1 + for mode, hyps in results.items(): + tokens = hyps[i].tokens + line = '{} {}'.format(key, + tokenizer.detokenize(tokens)[0]) + logging.info('{} {}'.format(mode.ljust(max_format_len), + line)) + files[mode].write(line + '\n') + # if utt_num % 500 == 0: + # files[mode].flush() + for mode, f in files.items(): + f.flush() # 强制将缓冲区内容刷新到文件 + f.close() + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/recognize4llmasr.py b/wenet/bin/recognize4llmasr.py new file mode 100644 index 0000000000000000000000000000000000000000..725c41bb19a7b685dea2a0def9b02bf01c16cadf --- /dev/null +++ b/wenet/bin/recognize4llmasr.py @@ -0,0 +1,340 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import copy +import logging +import os + +import torch +import yaml +from gxl_ai_utils.utils.utils_model import set_random_seed +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import Dataset +from wenet.llm_asr.llmasr_model import LLMASR_Model +from wenet.utils.config import override_config +from wenet.utils.init_model import init_model +from wenet.utils.init_tokenizer import init_tokenizer +from wenet.utils.context_graph import ContextGraph +from wenet.utils.ctc_utils import get_blank_id +from wenet.utils.common import TORCH_NPU_AVAILABLE # noqa just ensure to check torch-npu + + +def get_args(): + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--data_type', + default='raw', + # choices=['raw', 'shard'], + help='train and cv data type') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--device', + type=str, + default="cpu", + choices=["cpu", "npu", "cuda"], + help='accelerator to use') + parser.add_argument('--dtype', + type=str, + default='fp32', + choices=['fp16', 'fp32', 'bf16'], + help='model\'s dtype') + parser.add_argument('--num_workers', + default=0, + type=int, + help='num of subprocess workers for reading') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--length_penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--blank_penalty', + type=float, + default=0.0, + help='blank penalty') + parser.add_argument('--result_dir', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--modes', + nargs='+', + help="""decoding mode, support the following: + attention + ctc_greedy_search + ctc_prefix_beam_search + attention_rescoring + rnnt_greedy_search + rnnt_beam_search + rnnt_beam_attn_rescoring + ctc_beam_td_attn_rescoring + hlg_onebest + hlg_rescore + paraformer_greedy_search + paraformer_beam_search""") + parser.add_argument('--search_ctc_weight', + type=float, + default=1.0, + help='ctc weight for nbest generation') + parser.add_argument('--search_transducer_weight', + type=float, + default=0.0, + help='transducer weight for nbest generation') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for rescoring weight in \ + attention rescoring decode mode \ + ctc weight for rescoring weight in \ + transducer attention rescore decode mode') + + parser.add_argument('--transducer_weight', + type=float, + default=0.0, + help='transducer weight for rescoring weight in ' + 'transducer attention rescore mode') + parser.add_argument('--attn_weight', + type=float, + default=0.0, + help='attention weight for rescoring weight in ' + 'transducer attention rescore mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + parser.add_argument('--override_config', + action='append', + default=[], + help="override yaml config") + + parser.add_argument('--word', + default='', + type=str, + help='word file, only used for hlg decode') + parser.add_argument('--hlg', + default='', + type=str, + help='hlg file, only used for hlg decode') + parser.add_argument('--lm_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + parser.add_argument('--decoder_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + parser.add_argument('--r_decoder_scale', + type=float, + default=0.0, + help='lm scale for hlg attention rescore decode') + + parser.add_argument( + '--context_bias_mode', + type=str, + default='', + help='''Context bias mode, selectable from the following + option: decoding-graph, deep-biasing''') + parser.add_argument('--context_list_path', + type=str, + default='', + help='Context list path') + parser.add_argument('--context_graph_score', + type=float, + default=0.0, + help='''The higher the score, the greater the degree of + bias using decoding-graph for biasing''') + + parser.add_argument('--use_lora', + type=bool, + default=False, + help='''Whether to use lora for biasing''') + parser.add_argument("--lora_ckpt_path", + default=None, + type=str, + help="lora checkpoint path.") + + parser.add_argument('--task', + type=str, + default='asr', + help='Context list path') + parser.add_argument('--lang', + type=str, + default='zh', + help='Context list path') + args = parser.parse_args() + print(args) + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + set_random_seed(777) + + if args.gpu != -1: + # remain the original usage of gpu + args.device = "cuda" + if "cuda" in args.device: + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + if len(args.override_config) > 0: + configs = override_config(configs, args.override_config) + configs['dataset_conf']['filter_conf']['filter_no_extra_info'] = False + test_conf = copy.deepcopy(configs['dataset_conf']) + + test_conf['filter_conf']['max_length'] = 3000 # whisper最长处理30s 102400 + test_conf['filter_conf']['min_length'] = 0 + test_conf['filter_conf']['token_max_length'] = 102400 + test_conf['filter_conf']['token_min_length'] = 0 + test_conf['filter_conf']['max_output_input_ratio'] = 102400 + test_conf['filter_conf']['min_output_input_ratio'] = 0 + test_conf['speed_perturb'] = False + test_conf['spec_aug'] = False + test_conf['spec_sub'] = False + test_conf['spec_trim'] = False + test_conf['shuffle'] = True + test_conf['sort'] = False + test_conf['cycle'] = 1 + test_conf['list_shuffle'] = True + if 'fbank_conf' in test_conf: + test_conf['fbank_conf']['dither'] = 0.0 + elif 'mfcc_conf' in test_conf: + test_conf['mfcc_conf']['dither'] = 0.0 + test_conf['batch_conf']['batch_type'] = "static" + test_conf['batch_conf']['batch_size'] = 1 + test_conf['split_num'] = 1 + + + tokenizer = init_tokenizer(configs) + 
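+    # NOTE: unlike recognize.py, this script forces batch_size to 1 and
+    # shuffles the test list (see the test_conf overrides above); the decode
+    # loop below only writes keys[0]/res_text[0], so it assumes one
+    # utterance per batch.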
test_dataset = Dataset(args.data_type, + args.test_data, + tokenizer, + test_conf, + partition=False) + + test_data_loader = DataLoader(test_dataset, + batch_size=None, + num_workers=args.num_workers) + + # Init asr model from configs + args.jit = False + model, configs = init_model(args, configs) + + device = torch.device(args.device) + model:LLMASR_Model = model.to(device) + model.eval() + dtype = torch.float32 + if args.dtype == 'fp16': + dtype = torch.float16 + elif args.dtype == 'bf16': + dtype = torch.bfloat16 + logging.info("compute dtype is {}".format(dtype)) + + context_graph = None + if 'decoding-graph' in args.context_bias_mode: + context_graph = ContextGraph(args.context_list_path, + tokenizer.symbol_table, + configs['tokenizer_conf']['bpe_path'], + args.context_graph_score) + + _, blank_id = get_blank_id(configs, tokenizer.symbol_table) + logging.info("blank_id is {}".format(blank_id)) + + # TODO(Dinghao Zhou): Support RNN-T related decoding + # TODO(Lv Xiang): Support k2 related decoding + # TODO(Kaixun Huang): Support context graph + files = {} + modes = ['llmasr_decode'] + for mode in modes: + dir_name = os.path.join(args.result_dir, mode) + os.makedirs(dir_name, exist_ok=True) + file_name = os.path.join(dir_name, 'text') + files[mode] = open(file_name, 'w', encoding='utf-8') + max_format_len = max([len(mode) for mode in args.modes]) + + # Get prompt config + from gxl_ai_utils.utils import utils_file + global_prompt_dict = utils_file.load_dict_from_yaml('conf/prompt_stage4.yaml') + + with torch.cuda.amp.autocast(enabled=True, + dtype=dtype, + cache_enabled=False): + with torch.no_grad(): + # logging.info(f'utt_num: {utt_num}') + for batch_idx, batch in enumerate(test_data_loader): + keys = batch["keys"] + feats = batch["feats"].to(device) + target = batch["target"].to(device) + feats_lengths = batch["feats_lengths"].to(device) + target_lengths = batch["target_lengths"].to(device) + batch_size = feats.size(0) + + import random + if '><' in args.task: + args.task = args.task.replace('><', '> <') + if args.task == "" or args.task == "": + is_truncation = False + else: + is_truncation = True + random_index = random.randint(0, len(global_prompt_dict[args.task])-1) + prompt = global_prompt_dict[args.task][random_index] + # print(args.task, prompt) + + res_text = model.generate(wavs=feats, wavs_len=feats_lengths, prompt=prompt) + for mode in modes: + line = "{}\t{}".format(keys[0], res_text[0]) + files[mode].write(line+'\n') + utils_file.logging_print( '{} {} {}'.format(batch_idx, keys[0], res_text[0])) + if batch_idx % 100 == 0: + for mode, f in files.items(): + f.flush() # 强制将缓冲区内容刷新到文件 + # if batch_idx >= 1000 and is_truncation: + # utils_file.logging_info('采用截断至3000的策略') + # break + for mode, f in files.items(): + f.flush() # 强制将缓冲区内容刷新到文件 + f.close() + + +if __name__ == '__main__': + main() diff --git a/wenet/bin/recognize_onnx_gpu.py b/wenet/bin/recognize_onnx_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..373c3ddbec7fcac4d8c7cbf7c2549e89bdd88617 --- /dev/null +++ b/wenet/bin/recognize_onnx_gpu.py @@ -0,0 +1,297 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script is for testing exported onnx encoder and decoder from +export_onnx_gpu.py. The exported onnx models only support batch offline ASR inference. +It requires a python wrapped c++ ctc decoder. +Please install it by following: +https://github.com/Slyne/ctc_decoder.git +""" +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import Dataset +from wenet.utils.common import IGNORE_ID +from wenet.utils.config import override_config +from wenet.utils.init_tokenizer import init_tokenizer + +import onnxruntime as rt +import multiprocessing +import numpy as np + +try: + from swig_decoders import map_batch, \ + ctc_beam_search_decoder_batch, \ + TrieVector, PathTrie +except ImportError: + print('Please install ctc decoders first by refering to\n' + + 'https://github.com/Slyne/ctc_decoder.git') + sys.exit(1) + +def get_args(): + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--data_type', + default='raw', + choices=['raw', 'shard'], + help='train and cv data type') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--encoder_onnx', + required=True, + help='encoder onnx file') + parser.add_argument('--decoder_onnx', + required=True, + help='decoder onnx file') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=32, + help='asr result file') + parser.add_argument('--mode', + choices=[ + 'ctc_greedy_search', 'ctc_prefix_beam_search', + 'attention_rescoring' + ], + default='attention_rescoring', + help='decoding mode') + parser.add_argument('--bpe_model', + default=None, + type=str, + help='bpe model for english part') + parser.add_argument('--override_config', + action='append', + default=[], + help="override yaml config") + parser.add_argument('--fp16', + action='store_true', + help='whether to export fp16 model, default false') + args = parser.parse_args() + return args + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + 
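+    # Setting CUDA_VISIBLE_DEVICES to "-1" hides all GPUs, so --gpu -1
+    # effectively forces CPU execution; use_cuda below then selects
+    # CPUExecutionProvider for onnxruntime.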
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + if len(args.override_config) > 0: + configs = override_config(configs, args.override_config) + + reverse_weight = configs["model_conf"].get("reverse_weight", 0.0) + special_tokens = configs.get('tokenizer_conf', {}).get('special_tokens', None) + test_conf = copy.deepcopy(configs['dataset_conf']) + test_conf['filter_conf']['max_length'] = 102400 + test_conf['filter_conf']['min_length'] = 0 + test_conf['filter_conf']['token_max_length'] = 102400 + test_conf['filter_conf']['token_min_length'] = 0 + test_conf['filter_conf']['max_output_input_ratio'] = 102400 + test_conf['filter_conf']['min_output_input_ratio'] = 0 + test_conf['speed_perturb'] = False + test_conf['spec_aug'] = False + test_conf['spec_sub'] = False + test_conf['spec_trim'] = False + test_conf['shuffle'] = False + test_conf['sort'] = False + test_conf['fbank_conf']['dither'] = 0.0 + test_conf['batch_conf']['batch_type'] = "static" + test_conf['batch_conf']['batch_size'] = args.batch_size + + tokenizer = init_tokenizer(configs) + test_dataset = Dataset(args.data_type, + args.test_data, + tokenizer, + test_conf, + partition=False) + test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0) + + # Init asr model from configs + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + if use_cuda: + EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] + else: + EP_list = ['CPUExecutionProvider'] + + encoder_ort_session = rt.InferenceSession(args.encoder_onnx, + providers=EP_list) + decoder_ort_session = None + if args.mode == "attention_rescoring": + decoder_ort_session = rt.InferenceSession(args.decoder_onnx, + providers=EP_list) + + # Load dict + vocabulary = [] + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + vocabulary.append(arr[0]) + + vocab_size = len(char_dict) + sos = (vocab_size - 1 if special_tokens is None else + special_tokens.get("", vocab_size - 1)) + eos = (vocab_size - 1 if special_tokens is None else + special_tokens.get("", vocab_size - 1)) + + with torch.no_grad(), open(args.result_file, 'w') as fout: + for _, batch in enumerate(test_data_loader): + keys = batch['keys'] + feats = batch['feats'] + feats_lengths = batch['feats_lengths'] + feats, feats_lengths = feats.numpy(), feats_lengths.numpy() + if args.fp16: + feats = feats.astype(np.float16) + ort_inputs = { + encoder_ort_session.get_inputs()[0].name: feats, + encoder_ort_session.get_inputs()[1].name: feats_lengths + } + ort_outs = encoder_ort_session.run(None, ort_inputs) + encoder_out, encoder_out_lens, ctc_log_probs, \ + beam_log_probs, beam_log_probs_idx = ort_outs + beam_size = beam_log_probs.shape[-1] + batch_size = beam_log_probs.shape[0] + num_processes = min(multiprocessing.cpu_count(), batch_size) + if args.mode == 'ctc_greedy_search': + if beam_size != 1: + log_probs_idx = beam_log_probs_idx[:, :, 0] + batch_sents = [] + for idx, seq in enumerate(log_probs_idx): + batch_sents.append(seq[0:encoder_out_lens[idx]].tolist()) + hyps = map_batch(batch_sents, vocabulary, num_processes, True, + 0) + elif args.mode in ('ctc_prefix_beam_search', + "attention_rescoring"): + batch_log_probs_seq_list = beam_log_probs.tolist() + batch_log_probs_idx_list = beam_log_probs_idx.tolist() + batch_len_list = encoder_out_lens.tolist() + batch_log_probs_seq = [] + batch_log_probs_ids = [] + 
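+                # Build per-utterance inputs for the batched CTC prefix beam
+                # search below: log-prob matrices truncated to their true
+                # lengths, plus one PathTrie root per utterance.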
batch_start = [] # only effective in streaming deployment + batch_root = TrieVector() + root_dict = {} + for i in range(len(batch_len_list)): + num_sent = batch_len_list[i] + batch_log_probs_seq.append( + batch_log_probs_seq_list[i][0:num_sent]) + batch_log_probs_ids.append( + batch_log_probs_idx_list[i][0:num_sent]) + root_dict[i] = PathTrie() + batch_root.append(root_dict[i]) + batch_start.append(True) + score_hyps = ctc_beam_search_decoder_batch( + batch_log_probs_seq, batch_log_probs_ids, batch_root, + batch_start, beam_size, num_processes, 0, -2, 0.99999) + if args.mode == 'ctc_prefix_beam_search': + hyps = [] + for cand_hyps in score_hyps: + hyps.append(cand_hyps[0][1]) + hyps = map_batch(hyps, vocabulary, num_processes, False, 0) + if args.mode == 'attention_rescoring': + ctc_score, all_hyps = [], [] + max_len = 0 + for hyps in score_hyps: + cur_len = len(hyps) + if len(hyps) < beam_size: + hyps += (beam_size - cur_len) * [(-float("INF"), + (0, ))] + cur_ctc_score = [] + for hyp in hyps: + cur_ctc_score.append(hyp[0]) + all_hyps.append(list(hyp[1])) + if len(hyp[1]) > max_len: + max_len = len(hyp[1]) + ctc_score.append(cur_ctc_score) + if args.fp16: + ctc_score = np.array(ctc_score, dtype=np.float16) + else: + ctc_score = np.array(ctc_score, dtype=np.float32) + hyps_pad_sos_eos = np.ones( + (batch_size, beam_size, max_len + 2), + dtype=np.int64) * IGNORE_ID + r_hyps_pad_sos_eos = np.ones( + (batch_size, beam_size, max_len + 2), + dtype=np.int64) * IGNORE_ID + hyps_lens_sos = np.ones((batch_size, beam_size), + dtype=np.int32) + k = 0 + for i in range(batch_size): + for j in range(beam_size): + cand = all_hyps[k] + l = len(cand) + 2 + hyps_pad_sos_eos[i][j][0:l] = [sos] + cand + [eos] + r_hyps_pad_sos_eos[i][j][0:l] = [sos] + cand[::-1] + [ + eos + ] + hyps_lens_sos[i][j] = len(cand) + 1 + k += 1 + decoder_ort_inputs = { + decoder_ort_session.get_inputs()[0].name: encoder_out, + decoder_ort_session.get_inputs()[1].name: encoder_out_lens, + decoder_ort_session.get_inputs()[2].name: hyps_pad_sos_eos, + decoder_ort_session.get_inputs()[3].name: hyps_lens_sos, + decoder_ort_session.get_inputs()[-1].name: ctc_score + } + if reverse_weight > 0: + r_hyps_pad_sos_eos_name = decoder_ort_session.get_inputs( + )[4].name + decoder_ort_inputs[ + r_hyps_pad_sos_eos_name] = r_hyps_pad_sos_eos + best_index = decoder_ort_session.run(None, + decoder_ort_inputs)[0] + best_sents = [] + k = 0 + for idx in best_index: + cur_best_sent = all_hyps[k:k + beam_size][idx] + best_sents.append(cur_best_sent) + k += beam_size + hyps = map_batch(best_sents, vocabulary, num_processes) + + for i, key in enumerate(keys): + content = hyps[i] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + +if __name__ == '__main__': + main() diff --git a/wenet/bin/train.py b/wenet/bin/train.py new file mode 100644 index 0000000000000000000000000000000000000000..9edf684c9c1750fd3e1ec2611d7427f58391e3d3 --- /dev/null +++ b/wenet/bin/train.py @@ -0,0 +1,232 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import datetime +import logging +import os +import random + +import numpy as np +import yaml +import torch + +import torch.distributed as dist + +from torch.distributed.elastic.multiprocessing.errors import record +from wenet.utils.common import lrs_to_str, TORCH_NPU_AVAILABLE # noqa just ensure to check torch-npu + +from wenet.utils.executor import Executor +from wenet.utils.config import override_config +from wenet.utils.init_model import init_model +from wenet.utils.init_tokenizer import init_tokenizer +from wenet.utils.train_utils import ( + add_fsdp_args, add_model_args, add_dataset_args, add_ddp_args, + add_deepspeed_args, add_trace_args, init_distributed, + init_dataset_and_dataloader, check_modify_and_save_config, + init_optimizer_and_scheduler, init_scaler, trace_and_print_model, + wrap_cuda_model, init_summarywriter, save_model, log_per_epoch, + add_lora_args, reinit_lora) +from gxl_ai_utils.utils import utils_file + +try: + import torch_npu + + torch_npu.npu.conv.allow_hf32 = False + # import deepspeed_npu + from torch_npu.npu import amp + from torch_npu.contrib import transfer_to_npu +except ImportError: + utils_file.logging_warning( + "torch_npu is not installed, please install torch_npu first if you want to use torch_npu") +torch.backends.cudnn.allow_tf32 = False +torch.backends.cuda.matmul.allow_tf32 = False + +from msprobe.pytorch import seed_all +import gc + +gc.set_threshold(700, 10, 10000) # python gc阈值设置 + + +# import deepspeed_npu +def get_args(): + parser = argparse.ArgumentParser(description='training your network') + parser.add_argument('--train_engine', + default='torch_ddp', + choices=['torch_ddp', 'torch_fsdp', 'deepspeed'], + help='Engine for paralleled training') + # set default value of device to "cuda", avoiding the modify of original scripts + parser.add_argument('--device', + type=str, + default='cuda', + choices=["cpu", "npu", "cuda"], + help='accelerator for training') + # load deepspeed checkpoint + parser.add_argument('--load_dir', + type=str, + default=None) + parser.add_argument('--ckpt_id', + type=str, + default=None) + parser = add_model_args(parser) + parser = add_dataset_args(parser) + parser = add_ddp_args(parser) + parser = add_lora_args(parser) + parser = add_deepspeed_args(parser) + parser = add_fsdp_args(parser) + parser = add_trace_args(parser) + args = parser.parse_args() + if args.train_engine == "deepspeed": + args.deepspeed = True + assert args.deepspeed_config is not None + return args + + +# NOTE(xcsong): On worker errors, this recod tool will summarize the +# details of the error (e.g. time, rank, host, pid, traceback, etc). 
+@record +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + + # Set random seed + torch.manual_seed(777) + random.seed(777) + np.random.seed(777) + utils_file.logging_info('开始严格seed') + seed_all(777) + utils_file.logging_info('结束严格seed') + logging.info('Random seed set to {}'.format(777)) + + # Read config + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + if len(args.override_config) > 0: + configs = override_config(configs, args.override_config) + + # init tokenizer + tokenizer = init_tokenizer(configs) + + # Init env for ddp OR deepspeed + _, _, rank = init_distributed(args) + + # Init asr model from configs + model, configs = init_model(args, configs) + + # Get dataset & dataloader + train_dataset, cv_dataset, train_data_loader, cv_data_loader = \ + init_dataset_and_dataloader(args, configs, tokenizer) + + # Do some sanity checks and save config to arsg.model_dir + configs = check_modify_and_save_config(args, configs, + tokenizer.symbol_table) + + if hasattr(args, 'lora_reinit') and args.lora_reinit: + reinit_lora(model, args, configs, tokenizer) + + # Check model is jitable & print model archtectures + trace_and_print_model(args, model) + + # Tensorboard summary + writer = init_summarywriter(args) + + # Dispatch model from cpu to gpu + model, device = wrap_cuda_model(args, model, configs) + + # Get optimizer & scheduler + model, optimizer, scheduler = init_optimizer_and_scheduler( + args, configs, model) + + # Load deepspeed checkpoint + if args.load_dir is not None and \ + args.ckpt_id is not None: + _, client_sd = model.load_checkpoint(args.load_dir, args.ckpt_id) + + # Save checkpoints + # save_model(model, + # info_dict={ + # "save_time": + # datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'), + # "tag": + # "init", + # **configs + # }) + + # Get executor + tag = configs["init_infos"].get("tag", "init") + executor = Executor(global_step=configs["init_infos"].get('step', -1), + device=device) + + # Init scaler, used for pytorch amp mixed precision training + scaler = init_scaler(args) + + # Start training loop + start_epoch = configs["init_infos"].get('epoch', 0) + int("epoch_" in tag) + # if save_interval in configs, steps mode else epoch mode + end_epoch = configs.get('max_epoch', 100) + assert start_epoch <= end_epoch + configs.pop("init_infos", None) + final_epoch = None + for epoch in range(start_epoch, end_epoch): + configs['epoch'] = epoch + + lrs = [group['lr'] for group in optimizer.param_groups] + logging.info('Epoch {} Step {} TRAIN info lr {} rank {}'.format( + epoch, executor.step, lrs_to_str(lrs), rank)) + + dist.barrier( + ) # NOTE(xcsong): Ensure all ranks start Train at the same time. + # NOTE(xcsong): Why we need a new group? see `train_utils.py::wenet_join` + group_join = dist.new_group( # fix by zhaoyi for 多机训练 + backend="gloo", timeout=datetime.timedelta(seconds=args.timeout)) + # group_join = None + executor.train(model, optimizer, scheduler, train_data_loader, + cv_data_loader, writer, configs, scaler, group_join) + # dist.destroy_process_group(group_join) + + dist.barrier( + ) # NOTE(xcsong): Ensure all ranks start CV at the same time. 
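+        # Per-epoch cross validation; loss_dict is stored in the checkpoint
+        # metadata and logged to tensorboard via log_per_epoch() below.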
+ loss_dict = executor.cv(model, cv_data_loader, configs) + info_dict = { + 'epoch': epoch, + 'lrs': [group['lr'] for group in optimizer.param_groups], + 'step': executor.step, + "loss_dict": loss_dict, + 'save_time': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'), + 'tag': "epoch_{}".format(epoch), + 'loss_dict': loss_dict, + **configs + } + # epoch cv: tensorboard && log + log_per_epoch(writer, info_dict=info_dict) + save_model(model, info_dict=info_dict) + + final_epoch = epoch + + if final_epoch is not None and rank == 0: + final_model_path = os.path.join(args.model_dir, 'final.pt') + os.remove(final_model_path) if os.path.exists( + final_model_path) else None + os.symlink('{}.pt'.format(final_epoch), final_model_path) + writer.close() + dist.barrier( + ) # NOTE(yktian): Ensure all ranks end Train before destroy process group. + dist.destroy_process_group() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/wenet/branchformer/__init__.py b/wenet/branchformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wenet/branchformer/cgmlp.py b/wenet/branchformer/cgmlp.py new file mode 100644 index 0000000000000000000000000000000000000000..b56a2505e2512689503e70b77edfc84f08dafc99 --- /dev/null +++ b/wenet/branchformer/cgmlp.py @@ -0,0 +1,194 @@ +# Copyright (c) 2022 Yifan Peng (Carnegie Mellon University) +# 2023 Voicecomm Inc (Kai Li) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""MLP with convolutional gating (cgMLP) definition. + +References: + https://openreview.net/forum?id=RA-zVvZLYIy + https://arxiv.org/abs/2105.08050 + +""" + +from typing import Tuple +import torch +import torch.nn as nn +from wenet.utils.class_utils import WENET_ACTIVATION_CLASSES + + +class ConvolutionalSpatialGatingUnit(torch.nn.Module): + """Convolutional Spatial Gating Unit (CSGU).""" + + def __init__( + self, + size: int, + kernel_size: int, + dropout_rate: float, + use_linear_after_conv: bool, + gate_activation: str, + causal: bool = True, + ): + super().__init__() + + # split input channels + n_channels = size // 2 + self.norm = nn.LayerNorm(n_channels) + # self.lorder is used to distinguish if it's a causal convolution, + # if self.lorder > 0: it's a causal convolution, the input will be + # padded with self.lorder frames on the left in forward. 
+ # else: it's a symmetrical convolution + if causal: + padding = 0 + self.lorder = kernel_size - 1 + else: + # kernel_size should be an odd number for none causal convolution + assert (kernel_size - 1) % 2 == 0 + padding = (kernel_size - 1) // 2 + self.lorder = 0 + self.conv = torch.nn.Conv1d( + n_channels, + n_channels, + kernel_size, + 1, + padding, + groups=n_channels, + ) + if use_linear_after_conv: + self.linear = torch.nn.Linear(n_channels, n_channels) + else: + self.linear = None + + if gate_activation == "identity": + self.act = torch.nn.Identity() + else: + self.act = WENET_ACTIVATION_CLASSES[gate_activation]() + + self.dropout = torch.nn.Dropout(dropout_rate) + + def espnet_initialization_fn(self): + torch.nn.init.normal_(self.conv.weight, std=1e-6) + torch.nn.init.ones_(self.conv.bias) + if self.linear is not None: + torch.nn.init.normal_(self.linear.weight, std=1e-6) + torch.nn.init.ones_(self.linear.bias) + + def forward( + self, x: torch.Tensor, cache: torch.Tensor = torch.zeros((0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Forward method + + Args: + x (torch.Tensor): (batch, time, channels) + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, cache_t), + (0, 0, 0) meas fake cache. + + Returns: + out (torch.Tensor): (batch, time, channels/2) + """ + + x_r, x_g = x.chunk(2, dim=-1) + # exchange the temporal dimension and the feature dimension + x_g = x_g.transpose(1, 2) # (#batch, channels, time) + + if self.lorder > 0: + if cache.size(2) == 0: # cache_t == 0 + x_g = nn.functional.pad(x_g, (self.lorder, 0), 'constant', 0.0) + else: + assert cache.size(0) == x_g.size(0) # equal batch + assert cache.size(1) == x_g.size(1) # equal channel + x_g = torch.cat((cache, x_g), dim=2) + assert (x_g.size(2) > self.lorder) + new_cache = x_g[:, :, -self.lorder:] + else: + # It's better we just return None if no cache is required, + # However, for JIT export, here we just fake one tensor instead of + # None. + new_cache = torch.zeros((0, 0, 0), + dtype=x_g.dtype, + device=x_g.device) + + x_g = x_g.transpose(1, 2) + x_g = self.norm(x_g) # (N, T, D/2) + x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2) # (N, T, D/2) + if self.linear is not None: + x_g = self.linear(x_g) + + x_g = self.act(x_g) + out = x_r * x_g # (N, T, D/2) + out = self.dropout(out) + return out, new_cache + + +class ConvolutionalGatingMLP(torch.nn.Module): + """Convolutional Gating MLP (cgMLP).""" + + def __init__( + self, + size: int, + linear_units: int, + kernel_size: int, + dropout_rate: float, + use_linear_after_conv: bool, + gate_activation: str, + causal: bool = True, + ): + super().__init__() + + self.channel_proj1 = torch.nn.Sequential( + torch.nn.Linear(size, linear_units), torch.nn.GELU()) + self.csgu = ConvolutionalSpatialGatingUnit( + size=linear_units, + kernel_size=kernel_size, + dropout_rate=dropout_rate, + use_linear_after_conv=use_linear_after_conv, + gate_activation=gate_activation, + causal=causal, + ) + self.channel_proj2 = torch.nn.Linear(linear_units // 2, size) + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + cache: torch.Tensor = torch.zeros((0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Forward method + + Args: + x (torch.Tensor): (batch, time, channels) + mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), + (0, 0, 0) means fake mask. Not used yet + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, cache_t), + (0, 0, 0) meas fake cache. 
+ + Returns: + out (torch.Tensor): (batch, time, channels/2) + """ + + xs_pad = x + + # size -> linear_units + xs_pad = self.channel_proj1(xs_pad) + + # linear_units -> linear_units/2 + xs_pad, new_cnn_cache = self.csgu(xs_pad, cache) + + # linear_units/2 -> size + xs_pad = self.channel_proj2(xs_pad) + + out = xs_pad + + return out, new_cnn_cache diff --git a/wenet/branchformer/encoder.py b/wenet/branchformer/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2feda978e19ebbc1c0b4c1b6e94913e7c582e4c3 --- /dev/null +++ b/wenet/branchformer/encoder.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 Yifan Peng (Carnegie Mellon University) +# 2023 Voicecomm Inc (Kai Li) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" + +import torch + +from typing import List, Optional, Union + +from wenet.branchformer.encoder_layer import BranchformerEncoderLayer +from wenet.branchformer.cgmlp import ConvolutionalGatingMLP +from wenet.transformer.encoder import BaseEncoder +from wenet.utils.class_utils import ( + WENET_ATTENTION_CLASSES, ) + + +class BranchformerEncoder(BaseEncoder): + """Branchformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + use_attn: bool = True, + attention_heads: int = 4, + selfattention_layer_type: str = "rel_selfattn", + pos_enc_layer_type: str = "rel_pos", + use_cgmlp: bool = True, + cgmlp_linear_units: int = 2048, + cgmlp_conv_kernel: int = 31, + use_linear_after_conv: bool = False, + gate_activation: str = "identity", + merge_method: str = "concat", + cgmlp_weight: Union[float, List[float]] = 0.5, + attn_branch_drop_rate: Union[float, List[float]] = 0.0, + num_blocks: int = 12, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + stochastic_depth_rate: Union[float, List[float]] = 0.0, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + causal: bool = False, + query_bias: bool = True, + key_bias: bool = True, + value_bias: bool = True, + gradient_checkpointing: bool = False, + use_sdpa: bool = False, + layer_norm_type: str = 'layer_norm', + norm_eps: float = 1e-5, + n_kv_head: Optional[int] = None, + head_dim: Optional[int] = None, + ): + super().__init__(input_size, output_size, attention_heads, + cgmlp_linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, True, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, gradient_checkpointing, + use_sdpa, layer_norm_type, norm_eps) + + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + query_bias, + key_bias, + value_bias, + use_sdpa, + n_kv_head, + head_dim, + ) + + cgmlp_layer = ConvolutionalGatingMLP + cgmlp_layer_args = ( + output_size, + cgmlp_linear_units, + 
cgmlp_conv_kernel, + dropout_rate, + use_linear_after_conv, + gate_activation, + causal, + ) + + if isinstance(stochastic_depth_rate, float): + stochastic_depth_rate = [stochastic_depth_rate] * num_blocks + if len(stochastic_depth_rate) != num_blocks: + raise ValueError( + f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) " + f"should be equal to num_blocks ({num_blocks})") + + if isinstance(cgmlp_weight, float): + cgmlp_weight = [cgmlp_weight] * num_blocks + if len(cgmlp_weight) != num_blocks: + raise ValueError( + f"Length of cgmlp_weight ({len(cgmlp_weight)}) should be equal to " + f"num_blocks ({num_blocks})") + + if isinstance(attn_branch_drop_rate, float): + attn_branch_drop_rate = [attn_branch_drop_rate] * num_blocks + if len(attn_branch_drop_rate) != num_blocks: + raise ValueError( + f"Length of attn_branch_drop_rate ({len(attn_branch_drop_rate)}) " + f"should be equal to num_blocks ({num_blocks})") + + self.encoders = LayerDropModuleList( + p=stochastic_depth_rate, + modules=[ + BranchformerEncoderLayer( + output_size, + WENET_ATTENTION_CLASSES[selfattention_layer_type]( + *encoder_selfattn_layer_args) if use_attn else None, + cgmlp_layer(*cgmlp_layer_args) if use_cgmlp else None, + dropout_rate, + merge_method, + cgmlp_weight[lnum], + attn_branch_drop_rate[lnum], + stochastic_depth_rate[lnum], + ) for lnum in range(num_blocks) + ]) + + +# modify from : https://github.com/facebookresearch/fairseq/blob/main/fairseq/modules/layer_drop.py # noqa +class LayerDropModuleList(torch.nn.ModuleList): + """ + A LayerDrop implementation based on :class:`torch.nn.ModuleList`. + + We refresh the choice of which layers to drop every time we iterate + over the LayerDropModuleList instance. During evaluation we always + iterate over all layers. + + Usage:: + + layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3]) + for layer in layers: # this might iterate over layers 1 and 3 + x = layer(x) + for layer in layers: # this might iterate over all layers + x = layer(x) + for layer in layers: # this might not iterate over any layers + x = layer(x) + + Args: + p (float): probability of dropping out each layer + modules (iterable, optional): an iterable of modules to add + + Limitations: + 1 can work with ddp when layer's gradient checkpoint disabled + 2 can't work with ddp when layer's gradient checkpoint enables + 3 can work with fsdp + 4 can work with deepspeed + """ + + def __init__(self, p: List[float], modules=None): + super().__init__(modules) + assert len(p) == len(self) + self.p = p + + def __iter__(self): + dropout_probs = torch.empty(len(self)).uniform_() + for i, m in enumerate(super().__iter__()): + if not self.training or (dropout_probs[i] > self.p[i]): + yield m diff --git a/wenet/branchformer/encoder_layer.py b/wenet/branchformer/encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..a48feefbd1d2e9d54394e32cd16a90c089e0ceae --- /dev/null +++ b/wenet/branchformer/encoder_layer.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 Yifan Peng (Carnegie Mellon University) +# 2023 Voicecomm Inc (Kai Li) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from ESPnet(https://github.com/espnet/espnet) +"""BranchformerEncoderLayer definition.""" + +import torch +import torch.nn as nn +from typing import Optional, Tuple + +from wenet.transformer.attention import T_CACHE + + +class BranchformerEncoderLayer(torch.nn.Module): + """Branchformer encoder layer module. + + Args: + size (int): model dimension + attn: standard self-attention or efficient attention, optional + cgmlp: ConvolutionalGatingMLP, optional + dropout_rate (float): dropout probability + merge_method (str): concat, learned_ave, fixed_ave + cgmlp_weight (float): weight of the cgmlp branch, between 0 and 1, + used if merge_method is fixed_ave + attn_branch_drop_rate (float): probability of dropping the attn branch, + used if merge_method is learned_ave + stochastic_depth_rate (float): stochastic depth probability + """ + + def __init__( + self, + size: int, + attn: Optional[torch.nn.Module], + cgmlp: Optional[torch.nn.Module], + dropout_rate: float, + merge_method: str, + cgmlp_weight: float = 0.5, + attn_branch_drop_rate: float = 0.0, + stochastic_depth_rate: float = 0.0, + ): + super().__init__() + assert (attn is not None) or ( + cgmlp is not None), "At least one branch should be valid" + + self.size = size + self.attn = attn + self.cgmlp = cgmlp + self.merge_method = merge_method + self.cgmlp_weight = cgmlp_weight + self.attn_branch_drop_rate = attn_branch_drop_rate + self.stochastic_depth_rate = stochastic_depth_rate + self.use_two_branches = (attn is not None) and (cgmlp is not None) + + if attn is not None: + self.norm_mha = nn.LayerNorm(size) # for the MHA module + if cgmlp is not None: + self.norm_mlp = nn.LayerNorm(size) # for the MLP module + self.norm_final = nn.LayerNorm( + size) # for the final output of the block + + self.dropout = torch.nn.Dropout(dropout_rate) + + # # attention-based pooling for two branches + self.pooling_proj1 = torch.nn.Linear(size, 1) + self.pooling_proj2 = torch.nn.Linear(size, 1) + + # # linear projections for calculating merging weights + self.weight_proj1 = torch.nn.Linear(size, 1) + self.weight_proj2 = torch.nn.Linear(size, 1) + + if self.use_two_branches: + if self.merge_method == "concat": + self.merge_proj = torch.nn.Linear(size + size, size) + + elif self.merge_method == "learned_ave": + # linear projection after weighted average + self.merge_proj = torch.nn.Linear(size, size) + + elif self.merge_method == "fixed_ave": + assert (0.0 <= cgmlp_weight <= + 1.0), "cgmlp weight should be between 0.0 and 1.0" + + # remove the other branch if only one branch is used + if cgmlp_weight == 0.0: + self.use_two_branches = False + self.cgmlp = None + self.norm_mlp = None + elif cgmlp_weight == 1.0: + self.use_two_branches = False + self.attn = None + self.norm_mha = None + + # linear projection after weighted average + self.merge_proj = torch.nn.Linear(size, size) + else: + raise ValueError(f"unknown merge method: {merge_method}") + else: + self.merge_proj = torch.nn.Identity() + + def _forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: T_CACHE = (torch.zeros( + (0, 0, 0, 0)), torch.zeros(0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + stoch_layer_coeff: float = 1.0 + ) -> Tuple[torch.Tensor, torch.Tensor, T_CACHE, torch.Tensor]: + # Two branches + x1 = x + x2 = x + + # Branch 1: multi-headed 
attention module + if self.attn is not None: + x1 = self.norm_mha(x1) + x_att, new_att_cache = self.attn(x1, x1, x1, mask, pos_emb, + att_cache) + x1 = self.dropout(x_att) + + # Branch 2: convolutional gating mlp + # Fake new cnn cache here, and then change it in conv_module + new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + if self.cgmlp is not None: + x2 = self.norm_mlp(x2) + x2, new_cnn_cache = self.cgmlp(x2, mask_pad, cnn_cache) + x2 = self.dropout(x2) + + # Merge two branches + if self.use_two_branches: + if self.merge_method == "concat": + x = x + stoch_layer_coeff * self.dropout( + self.merge_proj(torch.cat([x1, x2], dim=-1))) + elif self.merge_method == "learned_ave": + if (self.training and self.attn_branch_drop_rate > 0 + and torch.rand(1).item() < self.attn_branch_drop_rate): + # Drop the attn branch + w1, w2 = torch.tensor(0.0), torch.tensor(1.0) + else: + # branch1 + score1 = (self.pooling_proj1(x1).transpose(1, 2) / + self.size**0.5) + score1 = score1.masked_fill(mask_pad.eq(0), -float('inf')) + score1 = torch.softmax(score1, dim=-1).masked_fill( + mask_pad.eq(0), 0.0) + + pooled1 = torch.matmul(score1, + x1).squeeze(1) # (batch, size) + weight1 = self.weight_proj1(pooled1) # (batch, 1) + + # branch2 + score2 = (self.pooling_proj2(x2).transpose(1, 2) / + self.size**0.5) + score2 = score2.masked_fill(mask_pad.eq(0), -float('inf')) + score2 = torch.softmax(score2, dim=-1).masked_fill( + mask_pad.eq(0), 0.0) + + pooled2 = torch.matmul(score2, + x2).squeeze(1) # (batch, size) + weight2 = self.weight_proj2(pooled2) # (batch, 1) + + # normalize weights of two branches + merge_weights = torch.softmax(torch.cat([weight1, weight2], + dim=-1), + dim=-1) # (batch, 2) + merge_weights = merge_weights.unsqueeze(-1).unsqueeze( + -1) # (batch, 2, 1, 1) + w1, w2 = merge_weights[:, + 0], merge_weights[:, + 1] # (batch, 1, 1) + + x = x + stoch_layer_coeff * self.dropout( + self.merge_proj(w1 * x1 + w2 * x2)) + elif self.merge_method == "fixed_ave": + x = x + stoch_layer_coeff * self.dropout( + self.merge_proj((1.0 - self.cgmlp_weight) * x1 + + self.cgmlp_weight * x2)) + else: + raise RuntimeError( + f"unknown merge method: {self.merge_method}") + else: + if self.attn is None: + x = x + stoch_layer_coeff * self.dropout(self.merge_proj(x2)) + elif self.cgmlp is None: + x = x + stoch_layer_coeff * self.dropout(self.merge_proj(x1)) + else: + # This should not happen + raise RuntimeError( + "Both branches are not None, which is unexpected.") + + x = self.norm_final(x) + + return x, mask, new_att_cache, new_cnn_cache + + def forward( + self, + x: torch.Tensor, + mask: torch.Tensor, + pos_emb: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + att_cache: T_CACHE = (torch.zeros( + (0, 0, 0, 0)), torch.zeros(0, 0, 0, 0)), + cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor, T_CACHE, torch.Tensor]: + """Compute encoded features. + + Args: + x (Union[Tuple, torch.Tensor]): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time, time). + pos_emb (torch.Tensor): positional encoding, must not be None + for BranchformerEncoderLayer. + mask_pad (torch.Tensor): batch padding mask used for conv module. + (#batch, 1,time), (0, 0, 0) means fake mask. + att_cache (torch.Tensor): Cache tensor of the KEY & VALUE + (#batch=1, head, cache_t1, d_k * 2), head * d_k == size. 
+ cnn_cache (torch.Tensor): Convolution cache in cgmlp layer + (#batch=1, size, cache_t2) + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time, time. + torch.Tensor: att_cache tensor, + (#batch=1, head, cache_t1 + time, d_k * 2). + torch.Tensor: cnn_cahce tensor (#batch, size, cache_t2). + """ + + stoch_layer_coeff = 1.0 + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. + if self.training: + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + return self._forward(x, mask, pos_emb, mask_pad, att_cache, cnn_cache, + stoch_layer_coeff) diff --git a/wenet/cli/__init__.py b/wenet/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wenet/cli/hub.py b/wenet/cli/hub.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ca91ad5affa08f256d33d607dc28ff08601374 --- /dev/null +++ b/wenet/cli/hub.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 Mddct(hamddct@gmail.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import requests +import sys +import tarfile +from pathlib import Path +from urllib.request import urlretrieve + +import tqdm + + +def download(url: str, dest: str, only_child=True): + """ download from url to dest + """ + assert os.path.exists(dest) + print('Downloading {} to {}'.format(url, dest)) + + def progress_hook(t): + last_b = [0] + + def update_to(b=1, bsize=1, tsize=None): + if tsize not in (None, -1): + t.total = tsize + displayed = t.update((b - last_b[0]) * bsize) + last_b[0] = b + return displayed + + return update_to + + # *.tar.gz + name = url.split('?')[0].split('/')[-1] + tar_path = os.path.join(dest, name) + with tqdm.tqdm(unit='B', + unit_scale=True, + unit_divisor=1024, + miniters=1, + desc=(name)) as t: + urlretrieve(url, + filename=tar_path, + reporthook=progress_hook(t), + data=None) + t.total = t.n + + with tarfile.open(tar_path) as f: + if not only_child: + f.extractall(dest) + else: + for tarinfo in f: + if "/" not in tarinfo.name: + continue + name = os.path.basename(tarinfo.name) + fileobj = f.extractfile(tarinfo) + with open(os.path.join(dest, name), "wb") as writer: + writer.write(fileobj.read()) + + +class Hub(object): + """Hub for wenet pretrain runtime model + """ + # TODO(Mddct): make assets class to support other language + Assets = { + # wenetspeech + "chinese": "wenetspeech_u2pp_conformer_libtorch.tar.gz", + # gigaspeech + "english": "gigaspeech_u2pp_conformer_libtorch.tar.gz", + # paraformer + "paraformer": "paraformer.tar.gz" + } + + def __init__(self) -> None: + pass + + @staticmethod + def get_model_by_lang(lang: str) -> str: + if lang not in Hub.Assets.keys(): + print('ERROR: Unsupported language {} !!!'.format(lang)) + sys.exit(1) + + # NOTE(Mddct): model_dir structure + # Path.Home()/.wenet + # - chs + # - units.txt + # - final.zip + # - en + # - units.txt + # - final.zip + model = Hub.Assets[lang] + model_dir 
= os.path.join(Path.home(), ".wenet", lang) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + # TODO(Mddct): model metadata + if set(["final.zip", + "units.txt"]).issubset(set(os.listdir(model_dir))): + return model_dir + # If not exist, download + response = requests.get( + "https://modelscope.cn/api/v1/datasets/wenet/wenet_pretrained_models/oss/tree" # noqa + ) + model_info = next(data for data in response.json()["Data"] + if data["Key"] == model) + model_url = model_info['Url'] + download(model_url, model_dir, only_child=True) + return model_dir diff --git a/wenet/cli/model.py b/wenet/cli/model.py new file mode 100644 index 0000000000000000000000000000000000000000..bb24bdb3379f58fbb9c6507fedc8da3f2e8fb68f --- /dev/null +++ b/wenet/cli/model.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 Binbin Zhang (binbzha@qq.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +import torchaudio +import torchaudio.compliance.kaldi as kaldi + +from wenet.cli.hub import Hub +from wenet.utils.ctc_utils import (force_align, gen_ctc_peak_time, + gen_timestamps_from_peak) +from wenet.utils.file_utils import read_symbol_table +from wenet.transformer.search import (attention_rescoring, + ctc_prefix_beam_search, DecodeResult) +from wenet.utils.context_graph import ContextGraph +from wenet.utils.common import TORCH_NPU_AVAILABLE # noqa just ensure to check torch-npu + + +class Model: + + def __init__(self, + model_dir: str, + gpu: int = -1, + beam: int = 5, + context_path: str = None, + context_score: float = 6.0, + resample_rate: int = 16000): + model_path = os.path.join(model_dir, 'final.zip') + units_path = os.path.join(model_dir, 'units.txt') + self.model = torch.jit.load(model_path) + self.resample_rate = resample_rate + self.model.eval() + if gpu >= 0: + device = 'cuda:{}'.format(gpu) + else: + device = 'cpu' + self.device = torch.device(device) + self.model.to(device) + self.symbol_table = read_symbol_table(units_path) + self.char_dict = {v: k for k, v in self.symbol_table.items()} + self.beam = beam + if context_path is not None: + self.context_graph = ContextGraph(context_path, + self.symbol_table, + context_score=context_score) + else: + self.context_graph = None + + def compute_feats(self, audio_file: str) -> torch.Tensor: + waveform, sample_rate = torchaudio.load(audio_file, normalize=False) + waveform = waveform.to(torch.float) + if sample_rate != self.resample_rate: + waveform = torchaudio.transforms.Resample( + orig_freq=sample_rate, new_freq=self.resample_rate)(waveform) + # NOTE (MengqingCao): complex dtype not supported in torch_npu.abs() now, + # thus, delay placing data on NPU after the calculation of fbank. + # revert me after complex dtype is supported. 
+ if "npu" not in self.device.__str__(): + waveform = waveform.to(self.device) + feats = kaldi.fbank(waveform, + num_mel_bins=80, + frame_length=25, + frame_shift=10, + energy_floor=0.0, + sample_frequency=self.resample_rate) + if "npu" in self.device.__str__(): + feats = feats.to(self.device) + feats = feats.unsqueeze(0) + return feats + + @torch.no_grad() + def _decode(self, + audio_file: str, + tokens_info: bool = False, + label: str = None) -> dict: + feats = self.compute_feats(audio_file) + encoder_out, _, _ = self.model.forward_encoder_chunk(feats, 0, -1) + encoder_lens = torch.tensor([encoder_out.size(1)], + dtype=torch.long, + device=encoder_out.device) + ctc_probs = self.model.ctc_activation(encoder_out) + if label is None: + ctc_prefix_results = ctc_prefix_beam_search( + ctc_probs, + encoder_lens, + self.beam, + context_graph=self.context_graph) + else: # force align mode, construct ctc prefix result from alignment + label_t = self.tokenize(label) + alignment = force_align(ctc_probs.squeeze(0), + torch.tensor(label_t, dtype=torch.long)) + peaks = gen_ctc_peak_time(alignment) + ctc_prefix_results = [ + DecodeResult(tokens=label_t, + score=0.0, + times=peaks, + nbest=[label_t], + nbest_scores=[0.0], + nbest_times=[peaks]) + ] + rescoring_results = attention_rescoring(self.model, ctc_prefix_results, + encoder_out, encoder_lens, 0.3, + 0.5) + res = rescoring_results[0] + result = {} + result['text'] = ''.join([self.char_dict[x] for x in res.tokens]) + result['confidence'] = res.confidence + + if tokens_info: + frame_rate = self.model.subsampling_rate( + ) * 0.01 # 0.01 seconds per frame + max_duration = encoder_out.size(1) * frame_rate + times = gen_timestamps_from_peak(res.times, max_duration, + frame_rate, 1.0) + tokens_info = [] + for i, x in enumerate(res.tokens): + tokens_info.append({ + 'token': self.char_dict[x], + 'start': round(times[i][0], 3), + 'end': round(times[i][1], 3), + 'confidence': round(res.tokens_confidence[i], 2) + }) + result['tokens'] = tokens_info + return result + + def transcribe(self, audio_file: str, tokens_info: bool = False) -> dict: + return self._decode(audio_file, tokens_info) + + def tokenize(self, label: str): + # TODO(Binbin Zhang): Support BPE + tokens = [] + for c in label: + if c == ' ': + c = "▁" + tokens.append(c) + token_list = [] + for c in tokens: + if c in self.symbol_table: + token_list.append(self.symbol_table[c]) + elif '' in self.symbol_table: + token_list.append(self.symbol_table['']) + return token_list + + def align(self, audio_file: str, label: str) -> dict: + return self._decode(audio_file, True, label) + + +def load_model(language: str = None, + model_dir: str = None, + gpu: int = -1, + beam: int = 5, + context_path: str = None, + context_score: float = 6.0, + device: str = "cpu") -> Model: + if model_dir is None: + model_dir = Hub.get_model_by_lang(language) + + if gpu != -1: + # remain the original usage of gpu + device = "cuda" + model = Model(model_dir, gpu, beam, context_path, context_score) + model.device = torch.device(device) + model.model.to(device) + return model diff --git a/wenet/cli/paraformer_model.py b/wenet/cli/paraformer_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f834ab25446b898c777ee821bbe5c03b5f3fbe --- /dev/null +++ b/wenet/cli/paraformer_model.py @@ -0,0 +1,82 @@ +import os + +import torch +import torchaudio +import torchaudio.compliance.kaldi as kaldi + +from wenet.cli.hub import Hub +from wenet.paraformer.search import (gen_timestamps_from_peak, + 
paraformer_greedy_search) +from wenet.text.paraformer_tokenizer import ParaformerTokenizer +from wenet.utils.common import TORCH_NPU_AVAILABLE # noqa just ensure to check torch-npu + + +class Paraformer: + + def __init__(self, model_dir: str, resample_rate: int = 16000) -> None: + + model_path = os.path.join(model_dir, 'final.zip') + units_path = os.path.join(model_dir, 'units.txt') + self.model = torch.jit.load(model_path) + self.resample_rate = resample_rate + self.device = torch.device("cpu") + self.tokenizer = ParaformerTokenizer(symbol_table=units_path) + + def transcribe(self, audio_file: str, tokens_info: bool = False) -> dict: + waveform, sample_rate = torchaudio.load(audio_file, normalize=False) + waveform = waveform.to(torch.float).to(self.device) + if sample_rate != self.resample_rate: + waveform = torchaudio.transforms.Resample( + orig_freq=sample_rate, new_freq=self.resample_rate)(waveform) + feats = kaldi.fbank(waveform, + num_mel_bins=80, + frame_length=25, + frame_shift=10, + energy_floor=0.0, + sample_frequency=self.resample_rate, + window_type="hamming") + feats = feats.unsqueeze(0) + feats_lens = torch.tensor([feats.size(1)], + dtype=torch.int64, + device=feats.device) + + decoder_out, token_num, tp_alphas = self.model.forward_paraformer( + feats, feats_lens) + cif_peaks = self.model.forward_cif_peaks(tp_alphas, token_num) + res = paraformer_greedy_search(decoder_out, token_num, cif_peaks)[0] + result = {} + result['confidence'] = res.confidence + result['text'] = self.tokenizer.detokenize(res.tokens)[0] + if tokens_info: + tokens_info = [] + times = gen_timestamps_from_peak(res.times, + num_frames=tp_alphas.size(1), + frame_rate=0.02) + + for i, x in enumerate(res.tokens): + tokens_info.append({ + 'token': self.tokenizer.char_dict[x], + 'start': round(times[i][0], 3), + 'end': round(times[i][1], 3), + 'confidence': round(res.tokens_confidence[i], 2) + }) + result['tokens'] = tokens_info + + return result + + def align(self, audio_file: str, label: str) -> dict: + raise NotImplementedError("Align is currently not supported") + + +def load_model(model_dir: str = None, + gpu: int = -1, + device: str = "cpu") -> Paraformer: + if model_dir is None: + model_dir = Hub.get_model_by_lang('paraformer') + if gpu != -1: + # remain the original usage of gpu + device = "cuda" + paraformer = Paraformer(model_dir) + paraformer.device = torch.device(device) + paraformer.model.to(device) + return paraformer diff --git a/wenet/cli/transcribe.py b/wenet/cli/transcribe.py new file mode 100644 index 0000000000000000000000000000000000000000..28bf27919273f06a343f3faa824dc710339a6077 --- /dev/null +++ b/wenet/cli/transcribe.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023 Binbin Zhang (binbzha@qq.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
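For reference, a minimal sketch of calling the Paraformer wrapper above directly, rather than through the command-line entry point defined below (the wav path is illustrative; with no model_dir, load_model fetches the pretrained bundle via Hub):

    from wenet.cli.paraformer_model import load_model

    model = load_model()  # downloads final.zip and units.txt on first use
    result = model.transcribe('test_16k.wav', tokens_info=True)
    print(result['text'], result['confidence'])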
+ +import argparse + +from wenet.cli.paraformer_model import load_model as load_paraformer +from wenet.cli.model import load_model + + +def get_args(): + parser = argparse.ArgumentParser(description='') + parser.add_argument('audio_file', help='audio file to transcribe') + parser.add_argument('-l', + '--language', + choices=[ + 'chinese', + 'english', + ], + default='chinese', + help='language type') + parser.add_argument('-m', + '--model_dir', + default=None, + help='specify your own model dir') + parser.add_argument('-g', + '--gpu', + type=int, + default='-1', + help='gpu id to decode, default is cpu.') + parser.add_argument('--device', + type=str, + default='cpu', + choices=["cpu", "npu", "cuda"], + help='accelerator to use') + parser.add_argument('-t', + '--show_tokens_info', + action='store_true', + help='whether to output token(word) level information' + ', such times/confidence') + parser.add_argument('--align', + action='store_true', + help='force align the input audio and transcript') + parser.add_argument('--label', type=str, help='the input label to align') + parser.add_argument('--paraformer', + action='store_true', + help='whether to use the best chinese model') + parser.add_argument('--beam', type=int, default=5, help="beam size") + parser.add_argument('--context_path', + type=str, + default=None, + help='context list file') + parser.add_argument('--context_score', + type=float, + default=6.0, + help='context score') + args = parser.parse_args() + return args + + +def main(): + args = get_args() + + if args.paraformer: + model = load_paraformer(args.model_dir, args.gpu, args.device) + else: + model = load_model(args.language, args.model_dir, args.gpu, args.beam, + args.context_path, args.context_score, args.device) + if args.align: + result = model.align(args.audio_file, args.label) + else: + result = model.transcribe(args.audio_file, args.show_tokens_info) + print(result) + + +if __name__ == "__main__": + main() diff --git a/wenet/ctl_model/asr_model_ctl.py b/wenet/ctl_model/asr_model_ctl.py new file mode 100644 index 0000000000000000000000000000000000000000..6e9bc810a7c010432d0f70c423d14c59c8bbcd79 --- /dev/null +++ b/wenet/ctl_model/asr_model_ctl.py @@ -0,0 +1,277 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# 2023 NetEase Inc. (authors: Yuting Yang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from ESPnet(https://github.com/espnet/espnet) and +# fairseq(https://github.com/facebookresearch/fairseq) + +from typing import Dict, Optional + +import torch +import torch.nn.functional as F +from wenet.transformer.ctc import CTC +from wenet.transformer.decoder import TransformerDecoder +from wenet.ctl_model.encoder import TransformerEncoder +from wenet.transformer.asr_model import ASRModel +from wenet.utils.common import IGNORE_ID + + +class CTLModel(ASRModel): + """ + Implementation of Interspeecch 2023 paper: + 'Enhancing the Unified Streaming and Non-streaming Model + with Contrastive Learning' + https://arxiv.org/abs/2306.00755 + """ + + def __init__( + self, + vocab_size: int, + encoder: TransformerEncoder, + decoder: TransformerDecoder, + ctc: CTC, + ctc_weight: float = 0.5, + ignore_id: int = IGNORE_ID, + reverse_weight: float = 0.0, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + logit_temp: float = 0.1, + n_negatives: int = 0, + ctl_weight: float = 1, + special_tokens: dict = None, + ): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + super().__init__(vocab_size, + encoder, + decoder, + ctc, + ctc_weight, + ignore_id, + reverse_weight, + lsm_weight, + length_normalized_loss, + special_tokens=special_tokens) + + # For CTL Loss + self.n_negatives = n_negatives + self.ctl_weight = ctl_weight + self.logit_temp = logit_temp + + @torch.jit.unused + def forward( + self, + batch: dict, + device: torch.device, + ) -> Dict[str, Optional[torch.Tensor]]: + + speech = batch['feats'].to(device) + speech_lengths = batch['feats_lengths'].to(device) + text = batch['target'].to(device) + text_lengths = batch['target_lengths'].to(device) + loss_full, encoder_out_full, _, _ = self.forward_full( + speech, speech_lengths, text, text_lengths) + loss_chunk, encoder_out, lens_chunk, encoder_mask = self.forward_chunk( + speech, speech_lengths, text, text_lengths) + + ctl_loss = 0.0 + if self.ctl_weight > 0 and self.n_negatives > 0: + num = encoder_out_full.size(1) + targets = encoder_out_full + src = encoder_out + negs, negs_idxs = self.sample_negatives(targets, + targets.size(1), + speech_lengths=lens_chunk) + ctl_loss = self.CTL(src, targets, negs, encoder_mask) + + loss = loss_full + loss_chunk + self.ctl_weight * ctl_loss + return { + "loss": loss, + "loss_full": loss_full, + "loss_chunk": loss_chunk, + "loss_ctl": ctl_loss + } + + def forward_full( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + ): + """Full context mode + Frontend + Encoder + Decoder + Calc loss + + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + encoder_out, encoder_mask = self.encoder.forward_full( + speech, speech_lengths) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + + # 2a. Attention-decoder branch + if self.ctc_weight != 1.0: + loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, + text, text_lengths) + else: + loss_att = None + + # 2b. 
CTC branch + if self.ctc_weight != 0.0: + loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, + text_lengths) + else: + loss_ctc = None + + if loss_ctc is None: + loss = loss_att + elif loss_att is None: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc[0] + (1 - + self.ctc_weight) * loss_att + return loss, encoder_out, encoder_out_lens, encoder_mask + + def forward_chunk( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + ): + """Chunk-based context mode + Frontend + Encoder + Decoder + Calc loss + + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + + # 2a. Attention-decoder branch + if self.ctc_weight != 1.0: + loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, + text, text_lengths) + else: + loss_att = None + + # 2b. CTC branch + if self.ctc_weight != 0.0: + loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, + text_lengths) + else: + loss_ctc = None + + if loss_ctc is None: + loss = loss_att + elif loss_att is None: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc[0] + (1 - + self.ctc_weight) * loss_att + return loss, encoder_out, encoder_out_lens, encoder_mask + + def sample_negatives(self, y, num, padding_count=0, speech_lengths=None): + if self.n_negatives == 0: + return y.new(0) + bsz, tsz, fsz = y.shape + y = y.reshape(-1, fsz) # BTC => (BxT)C + + # FIXME: what happens if padding_count is specified? 
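+        # `high` is the per-utterance stride (tsz minus any padding_count) used
+        # below to offset row-local frame indices into the flattened (B*T, C)
+        # tensor, so negatives are drawn from other frames of the same
+        # utterance; the `+= 1` shift (taken when speech_lengths is not given)
+        # keeps the positive frame itself from being sampled.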
+ high = tsz - (padding_count or 0) + with torch.no_grad(): + assert high > 1, f"{bsz,tsz,fsz}" + + if self.n_negatives > 0: + tszs = (torch.arange(num).unsqueeze(-1).expand( + -1, self.n_negatives).flatten()) + if speech_lengths is not None: + neg_idxs = [ + torch.randint(low=0, + high=speech_lengths[i].item() - 1, + size=(1, self.n_negatives * tsz)) + for i in range(len(speech_lengths)) + ] + neg_idxs = torch.cat(neg_idxs).reshape( + bsz, self.n_negatives * tsz) + else: + neg_idxs = torch.randint(low=0, + high=num - 1, + size=(bsz, + self.n_negatives * tsz)) + neg_idxs[neg_idxs >= tszs] += 1 + + if self.n_negatives > 0: + neg_idxs = neg_idxs + (torch.arange(bsz).unsqueeze(1) * high) + + negs = y[neg_idxs.view(-1)] + negs = negs.contiguous().view(bsz, num, self.n_negatives, + fsz).permute(2, 0, 1, 3) # to NxBxTxC + return negs, neg_idxs + + def compute_preds(self, x, y, negatives): + neg_is_pos = (y == negatives).all(-1) + y = y.unsqueeze(0) + targets = torch.cat([y, negatives], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1) + logits = logits / self.logit_temp + logits = logits.type_as(x) + + if neg_is_pos.any(): + if not hasattr(self, "_inftensor"): + self._inftensor = float("-inf") + # logits[1:] = index_put(logits[1:], neg_is_pos, self._inftensor) + logits[1:][neg_is_pos] = self._inftensor + logits = logits.transpose(0, 2) + logits = logits.transpose(0, 1) + logits = logits.reshape(-1, logits.size(-1)) + return logits + + def CTL(self, x, y, negs, mask=None): + # Step1: compute cosine similarity, shape [B*T, n_negatives+1] + logits = self.compute_preds(x, y, negs) + + # Step2: target shape [B*T] + target = x.new_zeros(x.size(0) * x.size(1), dtype=torch.long) + + # Step3: compute CTL loss + if mask is not None: + normalize_length = mask.sum() + bz, sz = mask.size(0), mask.size(-1) + mask = mask.squeeze(1).reshape(bz * sz).eq(0) + ce = F.cross_entropy(logits, target, reduction='none') + loss = ce.masked_fill(mask, 0).sum() / normalize_length + else: + loss = F.cross_entropy(logits, target) + + return loss diff --git a/wenet/ctl_model/encoder.py b/wenet/ctl_model/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..9aa18b7048a922e0bd9725cfe867083a5bf5b26e --- /dev/null +++ b/wenet/ctl_model/encoder.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu) +# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn) +# 2023 NetEase Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from ESPnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +from typing import Optional, Tuple + +import torch + +from wenet.utils.mask import make_pad_mask +from wenet.transformer.encoder import TransformerEncoder, ConformerEncoder + + +class DualTransformerEncoder(TransformerEncoder): + """Transformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + query_bias: bool = True, + key_bias: bool = True, + value_bias: bool = True, + activation_type: str = "relu", + gradient_checkpointing: bool = False, + use_sdpa: bool = False, + layer_norm_type: str = 'layer_norm', + norm_eps: float = 1e-5, + n_kv_head: Optional[int] = None, + head_dim: Optional[int] = None, + selfattention_layer_type: str = "selfattn", + mlp_type: str = 'position_wise_feed_forward', + mlp_bias: bool = True, + n_expert: int = 8, + n_expert_activated: int = 2, + ): + """ Construct DualTransformerEncoder + Support both the full context mode and the streaming mode separately + """ + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, query_bias, key_bias, + value_bias, activation_type, gradient_checkpointing, + use_sdpa, layer_norm_type, norm_eps, n_kv_head, + head_dim, selfattention_layer_type, mlp_type, + mlp_bias, n_expert, n_expert_activated) + + def forward_full( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + for layer in self.encoders: + xs, masks, _, _ = layer(xs, masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + +class DualConformerEncoder(ConformerEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + query_bias: bool = True, + key_bias: bool = True, + value_bias: bool = True, + conv_bias: bool = True, + gradient_checkpointing: bool = 
False, + use_sdpa: bool = False, + layer_norm_type: str = 'layer_norm', + norm_eps: float = 1e-5, + n_kv_head: Optional[int] = None, + head_dim: Optional[int] = None, + mlp_type: str = 'position_wise_feed_forward', + mlp_bias: bool = True, + n_expert: int = 8, + n_expert_activated: int = 2, + ): + """ Construct DualConformerEncoder + Support both the full context mode and the streaming mode separately + """ + super().__init__( + input_size, output_size, attention_heads, linear_units, num_blocks, + dropout_rate, positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk, positionwise_conv_kernel_size, + macaron_style, selfattention_layer_type, activation_type, + use_cnn_module, cnn_module_kernel, causal, cnn_module_norm, + query_bias, key_bias, value_bias, conv_bias, + gradient_checkpointing, use_sdpa, layer_norm_type, norm_eps, + n_kv_head, head_dim, mlp_type, mlp_bias, n_expert, + n_expert_activated) + + def forward_full( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + T = xs.size(1) + masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + for layer in self.encoders: + xs, masks, _, _ = layer(xs, masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks diff --git a/wenet/dataset/__init__.py b/wenet/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wenet/dataset/datapipes.py b/wenet/dataset/datapipes.py new file mode 100644 index 0000000000000000000000000000000000000000..54127a8214f0325b8821356d7ec3c73b3a4e5741 --- /dev/null +++ b/wenet/dataset/datapipes.py @@ -0,0 +1,470 @@ +# Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +from collections.abc import Callable +import copy +import sys +import tarfile +import logging +from typing import List, Optional +import numpy as np +import torch +from torch.utils.data import IterDataPipe, functional_datapipe +from torch.utils.data import datapipes +from torch.utils.data.datapipes.iter import Mapper +from torch.utils.data.datapipes.iter.sharding import ( + SHARDING_PRIORITIES, ShardingFilterIterDataPipe) +from torch.utils.data.datapipes.utils.common import _check_unpickable_fn + +from wenet.dataset.processor import parse_url + + +@functional_datapipe("map_ignore_error") +class MapperIgnoreErrorDataPipe(Mapper): + + def __init__(self, + dataset: IterDataPipe, + fn: Callable, + input_col=None, + output_col=None, + log_error: bool = True) -> None: + super().__init__(dataset, fn, input_col, output_col) + self._iter = None + self.log_error = log_error + + def __iter__(self): + if self._iter is None: + self._iter = iter(self.datapipe) + + while True: + try: + elem = next(self._iter) + yield self._apply_fn(elem) + except StopIteration: + self._iter = None + return + except Exception as ex: + if self.log_error: + logging.warning(str(ex)) + + +@functional_datapipe('bucket_by_sequence_length') +class BucketBySequenceLengthDataPipe(IterDataPipe): + + def __init__( + self, + dataset: IterDataPipe, + elem_length_func, + bucket_boundaries: List[int], + bucket_batch_sizes: List[int], + wrapper_class=None, + ) -> None: + super().__init__() + _check_unpickable_fn(elem_length_func) + assert len(bucket_batch_sizes) == len(bucket_boundaries) + 1 + self.bucket_batch_sizes = bucket_batch_sizes + self.bucket_boundaries = bucket_boundaries + [sys.maxsize] + self.elem_length_func = elem_length_func + + self._group_dp = GroupByWindowDataPipe(dataset, + self._element_to_bucket_id, + self._window_size_func, + wrapper_class=wrapper_class) + + def __iter__(self): + yield from self._group_dp + + def _element_to_bucket_id(self, elem): + seq_len = self.elem_length_func(elem) + bucket_id = 0 + for (i, b) in enumerate(self.bucket_boundaries): + if seq_len < b: + bucket_id = i + break + return bucket_id + + def _window_size_func(self, bucket_id): + return self.bucket_batch_sizes[bucket_id] + + +@functional_datapipe("group_by_window") +class GroupByWindowDataPipe(datapipes.iter.Grouper): + + def __init__( + self, + dataset: IterDataPipe, + key_func, + window_size_func, + wrapper_class=None, + ): + super().__init__(dataset, + key_func, + keep_key=False, + group_size=None, + drop_remaining=False) + _check_unpickable_fn(window_size_func) + self.dp = dataset + self.window_size_func = window_size_func + if wrapper_class is not None: + _check_unpickable_fn(wrapper_class) + del self.wrapper_class + self.wrapper_class = wrapper_class + + def __iter__(self): + for x in self.datapipe: + key = self.group_key_fn(x) + + self.buffer_elements[key].append(x) + self.curr_buffer_size += 1 + + group_size = self.window_size_func(key) + if group_size == len(self.buffer_elements[key]): + result = self.wrapper_class(self.buffer_elements[key]) + yield result + self.curr_buffer_size -= len(self.buffer_elements[key]) + del self.buffer_elements[key] + + if self.curr_buffer_size == self.max_buffer_size: + result_to_yield = self._remove_biggest_key() + if result_to_yield is not None: + result = self.wrapper_class(result_to_yield) + yield result + + for key in tuple(self.buffer_elements.keys()): + result = self.wrapper_class(self.buffer_elements.pop(key)) + self.curr_buffer_size -= len(result) + yield result + + 
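A quick sketch of how the two pipes above compose (the 'feat' key, the concrete numbers, and padding_fn are illustrative, not part of the API): with bucket_boundaries=[500, 1000] and bucket_batch_sizes=[32, 16, 8], an element of length 700 falls into bucket 1, and the grouper emits a batch as soon as 16 elements of that bucket have been buffered:

    dp = source.bucket_by_sequence_length(
        elem_length_func=lambda sample: sample['feat'].size(0),
        bucket_boundaries=[500, 1000],
        bucket_batch_sizes=[32, 16, 8],
        wrapper_class=padding_fn)  # padding_fn: any callable that collates a list of samples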
+@functional_datapipe("sort") +class SortDataPipe(IterDataPipe): + + def __init__(self, + dataset: IterDataPipe, + buffer_size: int = 500, + key_func=None, + reverse=False) -> None: + if key_func is not None: + _check_unpickable_fn(key_func) + self.buffer_size = buffer_size + super().__init__() + self.dp = dataset + self._buffer = [] + self.key_func = key_func + self.reverse = reverse + + def __iter__(self): + for elem in self.dp: + self._buffer.append(elem) + if len(self._buffer) >= self.buffer_size: + self._buffer.sort(key=self.key_func, reverse=self.reverse) + for x in self._buffer: + yield x + del self._buffer + self._buffer = [] + # The sample left over + self._buffer.sort(key=self.key_func, reverse=self.reverse) + for x in self._buffer: + yield x + del self._buffer + self._buffer = [] + + +@functional_datapipe("dynamic_batch") +class DynamicBatchDataPipe(IterDataPipe): + + def __init__(self, dataset: IterDataPipe, window_class, + wrapper_class) -> None: + _check_unpickable_fn(window_class) + _check_unpickable_fn(wrapper_class) + super().__init__() + self.dp = dataset + assert window_class is not None + assert wrapper_class is not None + self.window_class = window_class + self._buffer = [] + self._wrappr_class = wrapper_class + + def __iter__(self): + for elem in self.dp: + if not self.window_class(elem, len(self._buffer)): + self._buffer.append(elem) + else: + if len(self._buffer) > 0: + yield self._wrappr_class(self._buffer) + del self._buffer + self._buffer = [elem] + if len(self._buffer) > 0: + yield self._wrappr_class(self._buffer) + del self._buffer + self._buffer = [] + + +@functional_datapipe("prefetch") +class PrefetchDataPipe(IterDataPipe): + """Performs prefetching""" + + def __init__( + self, + dataset: IterDataPipe, + buffer_size: int = 500, + ): + # TODO(Mddct): support multiprocessing pool with shared-memory to + # prefetch + super().__init__() + self.dp = dataset + self._iter = None + self._prefetch_buffer_size = buffer_size + self._buffer = None + if self._prefetch_buffer_size > 0: + self._buffer = collections.deque(maxlen=self._prefetch_buffer_size) + + def __iter__(self): + if self._prefetch_buffer_size > 0: + if self._iter is None: + self._iter = iter(self.dp) + assert self._buffer is not None + + while True: + if len(self._buffer) <= self._prefetch_buffer_size // 2: + while len(self._buffer) < self._prefetch_buffer_size: + try: + self._buffer.append(next(self._iter)) + except StopIteration: + if len(self._buffer) != 0: + while len(self._buffer) > 0: + yield self._buffer.popleft() + self._iter = None + return + while len(self._buffer) > self._prefetch_buffer_size // 2: + elem = self._buffer.popleft() + yield elem + + else: + yield from self.dp + + +@functional_datapipe("repeat") +class RepeatDatapipe(IterDataPipe): + + def __init__(self, dataset: IterDataPipe, count: int = -1): + super().__init__() + self.dp = dataset + self.count = count + + def __iter__(self): + if self.count == 1: + yield from self.dp + return + i = 0 + while self.count < 0 or i < self.count: + for elem in self.dp: + new_elem = copy.copy(elem) + yield new_elem + i += 1 + + +@functional_datapipe("shard") +class ShardDataPipe(ShardingFilterIterDataPipe): + + def __init__(self, dataset: IterDataPipe, partition: bool = False): + super().__init__(dataset, None) + self.partition = partition + self.dp = dataset + + def apply_sharding(self, num_of_instances: int, instance_id: int, + sharding_group: SHARDING_PRIORITIES): + if self.partition: + return super().apply_sharding(num_of_instances, instance_id, 
+ sharding_group) + else: + # We can not handle uneven data for CV on DDP, so we don't + # sample data by rank, that means every GPU gets the same + # and all the CV data + info = torch.utils.data.get_worker_info() + if info is None: + self.num_of_instances = 1 + self.instance_id = 0 + else: + n_workers_per_device = info.num_workers + self.num_of_instances = n_workers_per_device + self.instance_id = info.id + + +@functional_datapipe("interleave") +class InterlaveDataPipe(IterDataPipe): + + def __init__( + self, + source_datapipes: List[IterDataPipe], + weights: Optional[List[float]] = None, + seed=2027, + ): + super().__init__() + self.rng = np.random.default_rng(seed) + self.source_datapipes = source_datapipes + self.weights = weights + if weights is None: + self.weights = [1 / len(self.source_datapipes)] * len( + self.source_datapipes) + else: + self.weights = [weight / sum(weights) for weight in weights] + self.iters = None + + def __iter__(self): + weights = copy.deepcopy(self.weights) + exhausted = len(self.source_datapipes) * [False] + if self.iters is None: + self.iters = [(i, iter(d)) + for i, d in enumerate(self.source_datapipes)] + while True: + # TODO(Mddct): rng + index_iter = self.rng.choice(self.iters, p=weights) + i, ite = index_iter + try: + elem = next(ite) + yield elem + except StopIteration: + weights[i] = 0. + exhausted[i] = True + if all(exhausted): + return + weights = [weight / sum(weights) for weight in weights] + + +class TextLineDataPipe(IterDataPipe): + """ Streamming Text line + """ + + def __init__(self, filenames, mode='r'): + super().__init__() + _dp = datapipes.iter.FileLister(filenames) + _dp = datapipes.iter.FileOpener(_dp, mode=mode) + self.dp = _dp + + def __iter__(self): + for fname, stream in self.dp: + for line in stream: + line = line.strip('\n') + yield {"file_name": fname, "line": line} + stream.close() + + +@functional_datapipe("tar_file_and_group") +class TarsDataPipe(IterDataPipe): + """ Decode wenet's tar , yield {'txt': "...", "raw": "..."} + """ + + def __init__(self, dataset: IterDataPipe) -> None: + super().__init__() + self.dp = dataset + + def __iter__(self): + from wenet.dataset.processor import AUDIO_FORMAT_SETS + for sample in self.dp: + assert 'file_name' in sample + assert 'line' in sample + assert 'stream' in sample + try: + with tarfile.open(fileobj=sample['stream'], + mode="r:*") as stream: + prev_prefix = None + example = { + 'file_name': sample['file_name'], + 'tar_file_name': sample['line'] + } + valid = True + for tarinfo in stream: + name = tarinfo.name + pos = name.rfind('.') + assert pos > 0 + prefix, postfix = name[:pos], name[pos + 1:] + if prev_prefix is not None and prefix != prev_prefix: + example['key'] = prev_prefix + if valid: + yield example + example = { + 'file_name': sample['file_name'], + 'tar_file_name': sample['line'] + } + valid = True + with stream.extractfile(tarinfo) as file_obj: + try: + if postfix == 'txt': + example['txt'] = file_obj.read().decode( + 'utf8').strip() + elif postfix in AUDIO_FORMAT_SETS: + example['wav'] = file_obj.read() + else: + example[postfix] = file_obj.read() + except Exception as ex: + valid = False + logging.warning( + 'error to parse {}'.format(name)) + prev_prefix = prefix + if prev_prefix is not None: + example['key'] = prev_prefix + yield example + except Exception as ex: + msg = 'In tar_file_and_group: {} when processing {}'.format( + ex, sample['line']) + logging.warning(msg) + finally: + if 'process' in sample: + sample['process'].communicate() + sample['stream'].close() 
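The grouping above assumes every entry in a shard tar is named <key>.<ext>, so a transcript and its audio share the same prefix; a sketch of the expected layout (file and key names are illustrative):

    shard_000000.tar
        BAC009S0002W0122.txt   -> example['txt']  (utf-8 transcript)
        BAC009S0002W0122.wav   -> example['wav']  (raw audio bytes)
        BAC009S0002W0123.txt
        BAC009S0002W0123.wav

Each prefix is then yielded as one dict carrying 'key', 'txt' and 'wav', plus the file_name/tar_file_name bookkeeping fields.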
+ + +class WenetRawDatasetSource(IterDataPipe): + + def __init__(self, + filenames: str, + prefetch: int = 500, + partition: bool = True, + shuffle: bool = False, + shuffle_size: int = 10000, + cycle: int = 1) -> None: + super().__init__() + self.dp = TextLineDataPipe(filenames) + if shuffle: + self.dp = self.dp.shuffle(buffer_size=shuffle_size) + self.dp = self.dp.repeat(cycle).prefetch(prefetch) + self.dp = self.dp.shard(partition) + + def __iter__(self): + for d in self.dp: + yield d + + +class WenetTarShardDatasetSource(IterDataPipe): + + def __init__(self, + filenames: str, + prefetch: int = 500, + partition: bool = True, + shuffle: bool = False, + shuffle_size: int = 10000, + cycle: int = 1) -> None: + super().__init__() + self.dp = TextLineDataPipe(filenames) + if shuffle: + self.dp = self.dp.shuffle(buffer_size=shuffle_size) + self.dp = self.dp.repeat(cycle) + self.dp = self.dp.shard(partition).map_ignore_error( + parse_url).tar_file_and_group().prefetch(prefetch) + + def __iter__(self): + for d in self.dp: + yield d diff --git a/wenet/dataset/dataset.py b/wenet/dataset/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..fed31991e62232a0da741614129f66cf2d369f0f --- /dev/null +++ b/wenet/dataset/dataset.py @@ -0,0 +1,234 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import torch +import torch.distributed as dist +from torch.utils.data import IterableDataset + +import wenet.dataset.deprecated.processor as processor +from wenet.text.base_tokenizer import BaseTokenizer +from wenet.utils.file_utils import read_lists + + +class Processor(IterableDataset): + + def __init__(self, source, f, *args, **kw): + assert callable(f) + self.source = source + self.f = f + self.args = args + self.kw = kw + + def set_epoch(self, epoch): + self.source.set_epoch(epoch) + + def __iter__(self): + """ Return an iterator over the source dataset processed by the + given processor. 
+ """ + assert self.source is not None + assert callable(self.f) + return self.f(iter(self.source), *self.args, **self.kw) + + def apply(self, f): + assert callable(f) + return Processor(self, f, *self.args, **self.kw) + + +class DistributedSampler: + + def __init__(self, shuffle=True, partition=True, split_num=1): + self.epoch = -1 + self.update() + self.shuffle = shuffle + self.partition = partition + self.split_num = split_num + + def update(self): + assert dist.is_available() + if dist.is_initialized(): + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.rank = 0 + self.world_size = 1 + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: + self.worker_id = 0 + self.num_workers = 1 + else: + self.worker_id = worker_info.id + self.num_workers = worker_info.num_workers + return dict(rank=self.rank, + world_size=self.world_size, + worker_id=self.worker_id, + num_workers=self.num_workers) + + def set_epoch(self, epoch): + self.epoch = epoch + + def split_data(self, total_num): + data = list(range(total_num)) + sub_epoch = self.epoch + 1 + full_epoch = sub_epoch // self.split_num + num_per_sub_epochs = total_num // self.split_num + random.Random(full_epoch).shuffle(data) + + split_index = sub_epoch - full_epoch * self.split_num + begin = split_index * num_per_sub_epochs + end = (begin + num_per_sub_epochs + if (split_index + 1) < self.split_num else + total_num) + + # print(f'begin: {begin}, end: {end}, world_size: {self.world_size}') + return data[begin:end] + + def sample(self, data, split_num=1): + """ Sample data according to rank/world_size/num_workers + + Args: + data(List): input data list + + Returns: + List: data list after sample + """ + if self.split_num == 1: + data = list(range(len(data))) + else: + data = self.split_data(len(data)) + # TODO(Binbin Zhang): fix this + # We can not handle uneven data for CV on DDP, so we don't + # sample data by rank, that means every GPU gets the same + # and all the CV data + if self.partition: + if self.shuffle: + random.Random(self.epoch).shuffle(data) + data = data[self.rank::self.world_size] + # print(f'num dataset: {len(data)}') + data = data[self.worker_id::self.num_workers] + self.epoch += 1 + return data + + +class DataList(IterableDataset): + + def __init__(self, lists, shuffle=True, partition=True, split_num=1): + self.lists = lists + self.sampler = DistributedSampler(shuffle, partition, split_num) + + def set_epoch(self, epoch): + self.sampler.set_epoch(epoch) + + def __iter__(self): + sampler_info = self.sampler.update() + indexes = self.sampler.sample(self.lists) + for index in indexes: + # yield dict(src=src) + data = dict(src=self.lists[index]) + data.update(sampler_info) + yield data + + +def Dataset(data_type, + data_list_file, + tokenizer: BaseTokenizer, + conf, + partition=True): + """ Construct dataset from arguments + + We have two shuffle stage in the Dataset. The first is global + shuffle at shards tar/raw file level. The second is global shuffle + at training samples level. 
+
+    Args:
+        data_type(str): raw/shard/shard_full_data
+        bpe_model(str): model for english bpe part
+        partition(bool): whether to do data partition in terms of rank
+    """
+    assert data_type in ['raw', 'shard', 'shard_full_data']
+    lists = read_lists(data_list_file)
+    shuffle = conf.get('shuffle', True)
+    split_num = conf.get('split_num', 1)
+    dataset = DataList(lists, shuffle=shuffle, partition=partition, split_num=split_num)
+    if data_type == 'shard':
+        dataset = Processor(dataset, processor.url_opener)
+        dataset = Processor(dataset, processor.tar_file_and_group)
+    elif data_type == 'shard_full_data':
+        dataset = Processor(dataset, processor.url_opener)
+        dataset = Processor(dataset, processor.tar_file_and_group_full_data)
+    else:
+        dataset = Processor(dataset, processor.parse_raw)
+
+    speaker_conf = conf.get('speaker_conf', None)
+    if speaker_conf is not None:
+        dataset = Processor(dataset, processor.parse_speaker, **speaker_conf)
+
+    if conf.get('eod_id', None) is not None:
+        tokenizer.eod_id = conf['eod_id']
+    # prompt dict: task prompt templates consumed by processor.tokenize
+    import yaml
+    with open('conf/prompt_stage4.yaml', 'rt', encoding='utf-8') as fin:
+        global_prompt_dict = yaml.load(fin, Loader=yaml.FullLoader)
+    dataset = Processor(dataset, processor.tokenize, tokenizer,
+                        global_prompt_dict=global_prompt_dict)
+    filter_conf = conf.get('filter_conf', {})
+    dataset = Processor(dataset, processor.filter, **filter_conf)
+
+    resample_conf = conf.get('resample_conf', {})
+    dataset = Processor(dataset, processor.resample, **resample_conf)
+
+    speed_perturb = conf.get('speed_perturb', False)
+    if speed_perturb:
+        dataset = Processor(dataset, processor.speed_perturb)
+
+    feats_type = conf.get('feats_type', 'fbank')
+    assert feats_type in ['fbank', 'mfcc', 'log_mel_spectrogram']
+    if feats_type == 'fbank':
+        fbank_conf = conf.get('fbank_conf', {})
+        dataset = Processor(dataset, processor.compute_fbank, **fbank_conf)
+    elif feats_type == 'mfcc':
+        mfcc_conf = conf.get('mfcc_conf', {})
+        dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf)
+    elif feats_type == 'log_mel_spectrogram':
+        log_mel_spectrogram_conf = conf.get('log_mel_spectrogram_conf', {})
+        dataset = Processor(dataset, processor.compute_log_mel_spectrogram,
+                            **log_mel_spectrogram_conf)
+
+    spec_aug = conf.get('spec_aug', True)
+    spec_sub = conf.get('spec_sub', False)
+    spec_trim = conf.get('spec_trim', False)
+    if spec_aug:
+        spec_aug_conf = conf.get('spec_aug_conf', {})
+        dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf)
+    if spec_sub:
+        spec_sub_conf = conf.get('spec_sub_conf', {})
+        dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf)
+    if spec_trim:
+        spec_trim_conf = conf.get('spec_trim_conf', {})
+        dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf)
+
+    if shuffle:
+        shuffle_conf = conf.get('shuffle_conf', {})
+        dataset = Processor(dataset, processor.shuffle, **shuffle_conf)
+
+    sort = conf.get('sort', True)
+    if sort:
+        sort_conf = conf.get('sort_conf', {})
+        dataset = Processor(dataset, processor.sort, **sort_conf)
+
+    batch_conf = conf.get('batch_conf', {})
+    dataset = Processor(dataset, processor.batch, **batch_conf)
+    dataset = Processor(dataset, processor.padding)
+    return dataset
diff --git a/wenet/dataset/deprecated/dataset.py b/wenet/dataset/deprecated/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ce51612cbe6968cb5879be13404cc1115e04710
--- /dev/null
+++ b/wenet/dataset/deprecated/dataset.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2021 Mobvoi Inc.
(authors: Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import torch +import torch.distributed as dist +from torch.utils.data import IterableDataset + +import wenet.dataset.deprecated.processor as processor +from wenet.text.base_tokenizer import BaseTokenizer +from wenet.utils.file_utils import read_lists + + +class Processor(IterableDataset): + + def __init__(self, source, f, *args, **kw): + assert callable(f) + self.source = source + self.f = f + self.args = args + self.kw = kw + + def set_epoch(self, epoch): + self.source.set_epoch(epoch) + + def __iter__(self): + """ Return an iterator over the source dataset processed by the + given processor. + """ + assert self.source is not None + assert callable(self.f) + return self.f(iter(self.source), *self.args, **self.kw) + + def apply(self, f): + assert callable(f) + return Processor(self, f, *self.args, **self.kw) + + +class DistributedSampler: + + def __init__(self, shuffle=True, partition=True): + self.epoch = -1 + self.update() + self.shuffle = shuffle + self.partition = partition + + def update(self): + assert dist.is_available() + if dist.is_initialized(): + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.rank = 0 + self.world_size = 1 + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: + self.worker_id = 0 + self.num_workers = 1 + else: + self.worker_id = worker_info.id + self.num_workers = worker_info.num_workers + return dict(rank=self.rank, + world_size=self.world_size, + worker_id=self.worker_id, + num_workers=self.num_workers) + + def set_epoch(self, epoch): + self.epoch = epoch + + def sample(self, data): + """ Sample data according to rank/world_size/num_workers + + Args: + data(List): input data list + + Returns: + List: data list after sample + """ + data = list(range(len(data))) + # TODO(Binbin Zhang): fix this + # We can not handle uneven data for CV on DDP, so we don't + # sample data by rank, that means every GPU gets the same + # and all the CV data + if self.partition: + if self.shuffle: + random.Random(self.epoch).shuffle(data) + data = data[self.rank::self.world_size] + data = data[self.worker_id::self.num_workers] + return data + + +class DataList(IterableDataset): + + def __init__(self, lists, shuffle=True, partition=True): + self.lists = lists + self.sampler = DistributedSampler(shuffle, partition) + + def set_epoch(self, epoch): + self.sampler.set_epoch(epoch) + + def __iter__(self): + sampler_info = self.sampler.update() + indexes = self.sampler.sample(self.lists) + for index in indexes: + # yield dict(src=src) + data = dict(src=self.lists[index]) + data.update(sampler_info) + yield data + + +def Dataset(data_type, + data_list_file, + tokenizer: BaseTokenizer, + conf, + partition=True): + """ Construct dataset from arguments + + We have two shuffle stage in the Dataset. The first is global + shuffle at shards tar/raw file level. The second is global shuffle + at training samples level. 
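+
+    When partition is True, each DDP rank keeps an interleaved slice of the
+    (optionally shuffled) shard list, and each dataloader worker then takes
+    its own slice of that (see DistributedSampler.sample above); roughly:
+
+        data = data[rank::world_size][worker_id::num_workers]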
+ + Args: + data_type(str): raw/shard + bpe_model(str): model for english bpe part + partition(bool): whether to do data partition in terms of rank + """ + assert data_type in ['raw', 'shard'] + lists = read_lists(data_list_file) + shuffle = conf.get('shuffle', True) + dataset = DataList(lists, shuffle=shuffle, partition=partition) + if data_type == 'shard': + dataset = Processor(dataset, processor.url_opener) + dataset = Processor(dataset, processor.tar_file_and_group) + else: + dataset = Processor(dataset, processor.parse_raw) + + speaker_conf = conf.get('speaker_conf', None) + if speaker_conf is not None: + dataset = Processor(dataset, processor.parse_speaker, **speaker_conf) + + dataset = Processor(dataset, processor.tokenize, tokenizer) + filter_conf = conf.get('filter_conf', {}) + dataset = Processor(dataset, processor.filter, **filter_conf) + + resample_conf = conf.get('resample_conf', {}) + dataset = Processor(dataset, processor.resample, **resample_conf) + + speed_perturb = conf.get('speed_perturb', False) + if speed_perturb: + dataset = Processor(dataset, processor.speed_perturb) + + feats_type = conf.get('feats_type', 'fbank') + assert feats_type in ['fbank', 'mfcc', 'log_mel_spectrogram'] + if feats_type == 'fbank': + fbank_conf = conf.get('fbank_conf', {}) + dataset = Processor(dataset, processor.compute_fbank, **fbank_conf) + elif feats_type == 'mfcc': + mfcc_conf = conf.get('mfcc_conf', {}) + dataset = Processor(dataset, processor.compute_mfcc, **mfcc_conf) + elif feats_type == 'log_mel_spectrogram': + log_mel_spectrogram_conf = conf.get('log_mel_spectrogram_conf', {}) + dataset = Processor(dataset, processor.compute_log_mel_spectrogram, + **log_mel_spectrogram_conf) + + spec_aug = conf.get('spec_aug', True) + spec_sub = conf.get('spec_sub', False) + spec_trim = conf.get('spec_trim', False) + if spec_aug: + spec_aug_conf = conf.get('spec_aug_conf', {}) + dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) + if spec_sub: + spec_sub_conf = conf.get('spec_sub_conf', {}) + dataset = Processor(dataset, processor.spec_sub, **spec_sub_conf) + if spec_trim: + spec_trim_conf = conf.get('spec_trim_conf', {}) + dataset = Processor(dataset, processor.spec_trim, **spec_trim_conf) + + if shuffle: + shuffle_conf = conf.get('shuffle_conf', {}) + dataset = Processor(dataset, processor.shuffle, **shuffle_conf) + + sort = conf.get('sort', True) + if sort: + sort_conf = conf.get('sort_conf', {}) + dataset = Processor(dataset, processor.sort, **sort_conf) + + batch_conf = conf.get('batch_conf', {}) + dataset = Processor(dataset, processor.batch, **batch_conf) + dataset = Processor(dataset, processor.padding) + return dataset diff --git a/wenet/dataset/deprecated/processor.py b/wenet/dataset/deprecated/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..d5a0c671db14abe35cdae4e590c024c46fc203b2 --- /dev/null +++ b/wenet/dataset/deprecated/processor.py @@ -0,0 +1,1023 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import librosa +import logging +import json +import random +import tarfile +from subprocess import PIPE, Popen +from urllib.parse import urlparse + +import torch +import torchaudio +import torchaudio.compliance.kaldi as kaldi +import torch.nn.functional as F +from gxl_ai_utils.utils import utils_file +from torch.nn.utils.rnn import pad_sequence +from wenet.text.base_tokenizer import BaseTokenizer + +# torchaudio.utils.sox_utils.set_buffer_size(16500) +torchaudio.set_audio_backend("soundfile") + +AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) + + +def url_opener(data): + """ Give url or local file, return file descriptor + Inplace operation. + + Args: + data(Iterable[str]): url or local file list + + Returns: + Iterable[{src, stream}] + """ + for sample in data: + assert 'src' in sample + # TODO(Binbin Zhang): support HTTP + url = sample['src'] + try: + pr = urlparse(url) + # local file + if pr.scheme == '' or pr.scheme == 'file': + stream = open(url, 'rb') + # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP + else: + cmd = f'wget -q -O - {url}' + process = Popen(cmd, shell=True, stdout=PIPE) + sample.update(process=process) + stream = process.stdout + sample.update(stream=stream) + yield sample + except Exception as ex: + logging.warning('Failed to open {}'.format(url)) + + +def tar_file_and_group(data): + """ Expand a stream of open tar files into a stream of tar file contents. + And groups the file with same prefix + + Args: + data: Iterable[{src, stream}] + + Returns: + Iterable[{key, wav, txt, sample_rate}] + """ + for sample in data: + assert 'stream' in sample + stream = None + try: + stream = tarfile.open(fileobj=sample['stream'], mode="r:*") + prev_prefix = None + example = {} + valid = True + for tarinfo in stream: + name = tarinfo.name + pos = name.rfind('.') + assert pos > 0 + prefix, postfix = name[:pos], name[pos + 1:] + if prev_prefix is not None and prefix != prev_prefix: + example['key'] = prev_prefix + if valid: + yield example + example = {} + valid = True + with stream.extractfile(tarinfo) as file_obj: + try: + if postfix == 'txt': + example['txt'] = file_obj.read().decode( + 'utf8').strip() + elif postfix in AUDIO_FORMAT_SETS: + waveform, sample_rate = torchaudio.load(file_obj) + example['wav'] = waveform + example['sample_rate'] = sample_rate + else: + example[postfix] = file_obj.read() + except Exception as ex: + valid = False + logging.warning('error to parse {}'.format(name)) + prev_prefix = prefix + if prev_prefix is not None: + example['key'] = prev_prefix + yield example + except Exception as ex: + logging.warning( + 'In tar_file_and_group: {} when processing {}'.format( + ex, sample['src'])) + finally: + if stream is not None: + stream.close() + if 'process' in sample: + sample['process'].communicate() + sample['stream'].close() + + +def tar_file_and_group_full_data(data): + """ Expand a stream of open tar files into a stream of tar file contents. 
+        and groups files with the same prefix
+
+    Args:
+        data: Iterable[{src, stream}]
+
+    Returns:
+        Iterable[{key, wav, txt, sample_rate}]
+    """
+    for sample in data:
+        assert 'stream' in sample
+        stream = None
+        try:
+            stream = tarfile.open(fileobj=sample['stream'], mode="r:*")
+            prev_prefix = None
+            example = {}
+            valid = True
+            for tarinfo in stream:
+                name = tarinfo.name
+                pos = name.rfind('.')
+                assert pos > 0
+                prefix, postfix = name[:pos], name[pos + 1:]
+                if prev_prefix is not None and prefix != prev_prefix:
+                    example['key'] = prev_prefix
+                    if valid:
+                        # assert 'txt' in example
+                        if 'txt' not in example:
+                            example['txt'] = ''
+                        yield example
+                    example = {}
+                    valid = True
+                with stream.extractfile(tarinfo) as file_obj:
+                    try:
+                        if postfix == 'txt':
+                            example['txt'] = file_obj.read().decode(
+                                'utf8').strip()
+                        elif postfix == 'lang':
+                            example['lang'] = file_obj.read().decode(
+                                'utf8').strip()
+                        elif postfix == 'speaker':
+                            try:
+                                example['speaker'] = file_obj.read().decode(
+                                    'utf8').strip()
+                            except Exception as ex:
+                                example['speaker'] = "none"
+                        elif postfix == 'emotion':
+                            example['emotion'] = file_obj.read().decode(
+                                'utf8').strip()
+                        elif postfix == 'gender':
+                            example['gender'] = file_obj.read().decode(
+                                'utf8').strip()
+                        elif postfix == 'task':
+                            example['task'] = file_obj.read().decode(
+                                'utf8').strip()
+                        elif postfix == 'speech_token':
+                            example['speech_token'] = file_obj.read()
+                        elif postfix == 'duration':
+                            duration_str = file_obj.read().decode(
+                                'utf8').strip()
+                            try:
+                                duration_float = float(duration_str)
+                                example['duration'] = duration_float
+                            except Exception as ex:
+                                logging.warning(
+                                    f'failed to parse duration {duration_str}')
+                                example['duration'] = 0
+
+                        elif postfix in AUDIO_FORMAT_SETS:
+                            waveform, sample_rate = torchaudio.load(file_obj)
+                            # Check the number of audio channels
+                            num_channels = waveform.shape[0]
+                            # If the audio is multi-channel, average it down to mono
+                            if num_channels > 1:
+                                waveform = torch.mean(waveform, dim=0, keepdim=True)
+                            example['wav'] = waveform
+                            example['sample_rate'] = sample_rate
+                        else:
+                            example[postfix] = file_obj.read()
+                    except Exception as ex:
+                        valid = False
+                        # logging.warning('failed to parse {}'.format(name))
+                prev_prefix = prefix
+            if prev_prefix is not None:
+                example['key'] = prev_prefix
+                if 'txt' in example:
+                    yield example
+
+        except Exception as ex:
+            logging.warning(
+                'In tar_file_and_group_full_data: {} when processing {}'.format(
+                    ex, sample['src']))
+        finally:
+            if stream is not None:
+                stream.close()
+            if 'process' in sample:
+                sample['process'].communicate()
+            sample['stream'].close()
+
+
+def parse_raw(data):
+    """ Parse key/wav/txt from json line
+
+    Args:
+        data: Iterable[str], each str is a json line with key/wav/txt
+
+    Returns:
+        Iterable[{key, wav, txt, sample_rate}]
+    """
+    for sample in data:
+        assert 'src' in sample
+        json_line = sample['src']
+        obj = json.loads(json_line)
+        assert 'key' in obj
+        assert 'wav' in obj
+        assert 'txt' in obj
+        key = obj['key']
+        wav_file = obj['wav']
+        txt = obj['txt']
+        try:
+            if 'start' in obj:
+                assert 'end' in obj
+                sample_rate = torchaudio.info(wav_file).sample_rate
+                start_frame = int(obj['start'] * sample_rate)
+                end_frame = int(obj['end'] * sample_rate)
+                waveform, _ = torchaudio.load(filepath=wav_file,
+                                              num_frames=end_frame -
+                                              start_frame,
+                                              frame_offset=start_frame)
+            else:
+                waveform, sample_rate = torchaudio.load(wav_file)
+            # Check the number of audio channels
+            num_channels = waveform.shape[0]
+            # If the audio is multi-channel, average it down to mono
+            if num_channels > 1:
+                waveform = torch.mean(waveform, dim=0, keepdim=True)
+            example = copy.deepcopy(obj)  # copy and keep all the fields
+            example['wav'] = waveform  # overwrite wav
+            example['sample_rate'] = sample_rate
+            yield example
+        except Exception as ex:
+            logging.warning('Failed to read {}'.format(wav_file))
+
+
+def parse_speaker(data, speaker_table_path):
+    speaker_dict = {}
+    with open(speaker_table_path, 'r', encoding='utf8') as fin:
+        for line in fin:
+            arr = line.strip().split()
+            speaker_dict[arr[0]] = int(arr[1])
+    for sample in data:
+        assert 'speaker' in sample
+        speaker = sample['speaker']
+        sample['speaker'] = speaker_dict.get(speaker, 0)
+        yield sample
+
+
+def filter(data,
+           max_length=1200,
+           min_length=10,
+           token_max_length=250,
+           token_min_length=1,
+           min_output_input_ratio=0.00005,
+           max_output_input_ratio=1,
+           filter_no_extra_info: bool = False,
+           max_seq_len=1000):
+    """ Filter sample according to feature and label length
+    Inplace operation.
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+        max_length: drop utterance which is greater than max_length(10ms)
+        min_length: drop utterance which is less than min_length(10ms)
+        token_max_length: drop utterance which is greater than
+            token_max_length, especially when use char unit for
+            english modeling
+        token_min_length: drop utterance which is
+            less than token_max_length
+        min_output_input_ratio: minimal ratio of
+            token_length / feats_length(10ms)
+        max_output_input_ratio: maximum ratio of
+            token_length / feats_length(10ms)
+
+    Returns:
+        Iterable[{key, wav, label, sample_rate}]
+    """
+    for sample in data:
+        try:
+            assert 'sample_rate' in sample
+            assert 'wav' in sample
+            assert 'label' in sample
+        except Exception:
+            continue
+        # sample['wav'] is torch.Tensor, we have 100 frames every second
+        num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100
+
+        # filter for shard_in_common
+        if filter_no_extra_info:
+            if 'lang' not in sample:
+                continue
+            if 'task' not in sample:
+                continue
+
+        if num_frames < min_length:
+            continue
+
+        # if "output_type" in sample and sample["output_type"] == "speech2text_token":
+        #     max_length = int(max_length / 2)
+        # if "output_type" in sample and sample["output_type"] == "text2token":
+        #     max_length = int(max_length / 1.5)
+        if num_frames > max_length:
+            # continue
+            if 'task' in sample and sample['task'] == '':
+                # utils_file.logging_limit_print('random crop applied')
+                # Randomly choose a start point and crop to max_length
+                start_frame = random.randint(0, int(num_frames - max_length))
+                end_frame = start_frame + max_length
+                sample['wav'] = sample['wav'][:, int(
+                    start_frame / 100 * sample['sample_rate']):int(
+                        end_frame / 100 * sample['sample_rate'])]
+                # print('sample[', sample['wav'].shape)
+            else:
+                continue
+        if len(sample['label']) < token_min_length:
+            continue
+        if len(sample['label']) > token_max_length:
+            continue
+        # if num_frames != 0:
+        #     if len(sample['label']) / num_frames < min_output_input_ratio:
+        #         continue
+        #     if len(sample['label']) / num_frames > max_output_input_ratio:
+        #         continue
+
+        if sample["output_type"] == "speech2text_token":
+            seq_len = len(sample['prompt']) + num_frames / 8 + len(sample['label']) + len(sample['speech_token'])
+        elif sample["output_type"] == "text2token":
+            seq_len = len(sample['prompt']) + len(sample['label']) + len(sample['speech_token'])
+        else:
+            seq_len = len(sample['prompt']) + num_frames / 8 + len(sample['label'])
+        utils_file.logging_limit_print(f'seqlen: {seq_len}, output_type:{sample["output_type"]},len(sample["prompt"]):{len(sample["prompt"])},num_frames / 8:{num_frames / 8},len(sample["label"]):{len(sample["label"])},len(sample["speech_token"]):{len(sample["speech_token"])} ')
+        if max_seq_len > 0 and max_seq_len < seq_len:
+            utils_file.logging_limit_print(
+                f'seqlen: {seq_len} exceeds max_seq_len: {max_seq_len}, skip this sample')
+            continue
+        yield sample
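+
+# NOTE: `filter` is normally driven from the yaml via `filter_conf`
+# (see Dataset in wenet/dataset/dataset.py). An illustrative, not
+# prescriptive, configuration mirroring the defaults above:
+#
+#     filter_conf:
+#         max_length: 1200
+#         min_length: 10
+#         token_max_length: 250
+#         token_min_length: 1
+#         filter_no_extra_info: true
+#         max_seq_len: 1000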
+
+
+def resample(data, resample_rate=16000):
+    """ Resample data.
+        Inplace operation.
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+        resample_rate: target resample rate
+
+    Returns:
+        Iterable[{key, wav, label, sample_rate}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'wav' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['wav']
+        if sample_rate != resample_rate:
+            sample['sample_rate'] = resample_rate
+            sample['wav'] = torchaudio.transforms.Resample(
+                orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+        yield sample
+
+
+def speed_perturb(data, speeds=None):
+    """ Apply speed perturb to the data.
+        Inplace operation.
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+        speeds(List[float]): optional speed
+
+    Returns:
+        Iterable[{key, wav, label, sample_rate}]
+    """
+    if speeds is None:
+        speeds = [0.9, 1.0, 1.1]
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'wav' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['wav']
+        speed = random.choice(speeds)
+        if speed != 1.0:
+            wav, _ = torchaudio.sox_effects.apply_effects_tensor(
+                waveform, sample_rate,
+                [['speed', str(speed)], ['rate', str(sample_rate)]])
+            sample['wav'] = wav
+
+        yield sample
+
+
+def compute_fbank(data,
+                  num_mel_bins=23,
+                  frame_length=25,
+                  frame_shift=10,
+                  dither=0.0):
+    """ Extract fbank
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+
+    Returns:
+        Iterable[{key, feat, label}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'wav' in sample
+        assert 'key' in sample
+        assert 'label' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['wav']
+        waveform = waveform * (1 << 15)
+        # Only keep key, feat, label
+        mat = kaldi.fbank(waveform,
+                          num_mel_bins=num_mel_bins,
+                          frame_length=frame_length,
+                          frame_shift=frame_shift,
+                          dither=dither,
+                          energy_floor=0.0,
+                          sample_frequency=sample_rate)
+        sample['feat'] = mat
+        yield sample
+
+
+def compute_mfcc(data,
+                 num_mel_bins=23,
+                 frame_length=25,
+                 frame_shift=10,
+                 dither=0.0,
+                 num_ceps=40,
+                 high_freq=0.0,
+                 low_freq=20.0):
+    """ Extract mfcc
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+
+    Returns:
+        Iterable[{key, feat, label}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'wav' in sample
+        assert 'key' in sample
+        assert 'label' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['wav']
+        waveform = waveform * (1 << 15)
+        # Only keep key, feat, label
+        mat = kaldi.mfcc(waveform,
+                         num_mel_bins=num_mel_bins,
+                         frame_length=frame_length,
+                         frame_shift=frame_shift,
+                         dither=dither,
+                         num_ceps=num_ceps,
+                         high_freq=high_freq,
+                         low_freq=low_freq,
+                         sample_frequency=sample_rate)
+        sample['feat'] = mat
+        yield sample
+
+
+def compute_log_mel_spectrogram(data,
+                                n_fft=400,
+                                hop_length=160,
+                                num_mel_bins=80,
+                                padding=0):
+    """ Extract log mel spectrogram, modified from openai-whisper, see:
+        - https://github.com/openai/whisper/blob/main/whisper/audio.py
+        - https://github.com/wenet-e2e/wenet/pull/2141#issuecomment-1811765040
+
+    Args:
+        data: Iterable[{key, wav, label, sample_rate}]
+
+    Returns:
+        Iterable[{key, feat, label}]
+    """
+    for sample in data:
+        assert 'sample_rate' in sample
+        assert 'wav' in sample
+        assert 'key' in sample
+        assert 'label' in sample
+        sample_rate = sample['sample_rate']
+        waveform = sample['wav'].squeeze(0)  # (channel=1, sample) -> (sample,)
+        # print(f'waveform shape: {waveform.shape}')
+        if padding > 0:
+            waveform = F.pad(waveform, (0, padding))
+        window = torch.hann_window(n_fft)
+        stft = torch.stft(waveform,
+                          n_fft,
+                          hop_length,
+                          window=window,
+                          return_complex=True)
+        magnitudes = stft[..., :-1].abs() ** 2
+
+        filters = torch.from_numpy(
+            librosa.filters.mel(sr=sample_rate,
+                                n_fft=n_fft,
+                                n_mels=num_mel_bins))
+        mel_spec = filters @ magnitudes
+
+        # NOTE(xcsong): https://github.com/openai/whisper/discussions/269
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        sample['feat'] = log_spec.transpose(0, 1)
+        yield sample
+
+
+import re
+
+
+def process_text(text):
+    # 1. Remove spaces around Chinese characters
+    text = re.sub(r'\s*([\u4e00-\u9fff])\s*', r'\1', text)
+    # 2. Lower-case the English text
+    text = text.lower()
+    # 3. Remove spaces around the < and > symbols
+    text = re.sub(r'\s*<\s*', '<', text)
+    text = re.sub(r'\s*>\s*', '>', text)
+    return text
+
+
+global_style_dict = {
+    "朗读": "新闻科普",
+    "科普百科": "新闻科普",
+    "悬疑恐怖": "恐怖故事",
+    "童话故事": "童话故事",
+    "客服": "客服",
+    "诗歌": "诗歌散文",
+    "散文": "诗歌散文",
+    "武侠评书": "有声书",
+    "小说": "有声书",
+    "历史": "有声书",
+    "科幻": "有声书",
+    "对话": "日常口语",
+    "口语": "日常口语",
+    "幽默": "其他",
+    "其他": "其他",
+}
+
+
+def replace_keys_in_brackets(input_str, key_value_dict):
+    for key, value in key_value_dict.items():
+        # Build a regex pattern that matches the key wrapped in angle brackets
+        pattern = re.compile(r'<{}>'.format(key))
+        input_str = pattern.sub(f"<{value}>", input_str)
+    return input_str
+
+
+def tokenize(data, tokenizer: BaseTokenizer, global_prompt_dict=None):
+    """ Decode text to chars or BPE
+        Inplace operation
+
+    Args:
+        data: Iterable[{key, wav, txt, sample_rate}]
+
+    Returns:
+        Iterable[{key, wav, txt, tokens, label, sample_rate}]
+    """
+    for sample in data:
+        try:
+            assert 'txt' in sample
+        except Exception:
+            print(f'tokenize: {sample}')
+            exit()
+        if 'task' in sample:
+            task_name = sample['task']
+            # if "" in task_name:
+            #     txt = sample['txt'].replace("", "").replace("", "").replace("", "")
+            if "