diff --git a/README.md b/README.md
index 2c13254cadd2424eca8efeb8a3aca455d60b85c4..bbfe7a7020020b8faf07527e0f1bf24746902dd4 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
 ---
 title: Audio Flamingo 2
-emoji: 🐢
-colorFrom: purple
+emoji: 🏃
+colorFrom: yellow
 colorTo: red
 sdk: gradio
 sdk_version: 5.15.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: NVIDIA Audio Flamingo 2 Demo
+short_description: Audio Flamingo 2 Demo
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3418ef74c7b6c4ad0f9f02bc839cd78c2e9395
--- /dev/null
+++ b/app.py
@@ -0,0 +1,232 @@
+import os
+import yaml
+import json
+import torch
+import spaces
+import librosa
+import argparse
+import numpy as np
+import gradio as gr
+from tqdm import tqdm
+import soundfile as sf
+from pydub import AudioSegment
+from safetensors.torch import load_file
+from huggingface_hub import snapshot_download
+
+from data.data import get_audiotext_dataloader
+from src.factory import create_model_and_transforms
+from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
+
+def int16_to_float32(x):
+    return (x / 32767.0).astype(np.float32)
+
+def float32_to_int16(x):
+    x = np.clip(x, a_min=-1., a_max=1.)
+    return (x * 32767.).astype(np.int16)
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+snapshot_download(repo_id="SreyanG-NVIDIA/audio-flamingo-2", local_dir="./")
+
+config = yaml.load(open("configs/inference.yaml"), Loader=yaml.FullLoader)
+
+data_config = config['data_config']
+model_config = config['model_config']
+clap_config = config['clap_config']
+args = Dict2Class(config['train_config'])
+
+model, tokenizer = create_model_and_transforms(
+    **model_config,
+    clap_config=clap_config,
+    use_local_files=args.offline,
+    gradient_checkpointing=args.gradient_checkpointing,
+    freeze_lm_embeddings=args.freeze_lm_embeddings,
+)
+
+device_id = 0
+model = model.to(device_id)
+model.eval()
+
+# Load metadata
+with open("safe_ckpt/metadata.json", "r") as f:
+    metadata = json.load(f)
+
+# Reconstruct the full state_dict
+state_dict = {}
+
+# Load each SafeTensors chunk
+for chunk_name in metadata:
+    chunk_path = f"safe_ckpt/{chunk_name}.safetensors"
+    chunk_tensors = load_file(chunk_path)
+
+    # Merge tensors into state_dict
+    state_dict.update(chunk_tensors)
+
+x,y = model.load_state_dict(state_dict, False)
+
+autocast = get_autocast(
+    args.precision, cache_enabled=(not args.fsdp)
+)
+
+cast_dtype = get_cast_dtype(args.precision)
+
+def get_num_windows(T, sr):
+
+    window_length = int(float(clap_config["window_length"]) * sr)
+    window_overlap = int(float(clap_config["window_overlap"]) * sr)
+    max_num_window = int(clap_config["max_num_window"])
+
+    num_windows = 1
+    if T <= window_length:
+        num_windows = 1
+        full_length = window_length
+    elif T >= (max_num_window * window_length - (max_num_window - 1) * window_overlap):
+        num_windows = max_num_window
+        full_length = (max_num_window * window_length - (max_num_window - 1) * window_overlap)
+    else:
+        num_windows = 1 + int(np.ceil((T - window_length) / float(window_length - window_overlap)))
+        full_length = num_windows * window_length - (num_windows - 1) * window_overlap
+
+    return num_windows, full_length
+
+
+def read_audio(file_path, target_sr=16000, duration=30.0, start=0.0):
+
+    if file_path.endswith('.mp3'):
+        audio = AudioSegment.from_file(file_path)
+        if len(audio) > (start + duration) * 1000:
+            audio = audio[start * 1000:(start + duration) * 1000]
+
+        if audio.frame_rate != target_sr:
+            audio = audio.set_frame_rate(target_sr)
+
+        if audio.channels > 1:
+            audio = audio.set_channels(1)
+
+        data = np.array(audio.get_array_of_samples())
+        if audio.sample_width == 2:
+            data = data.astype(np.float32) / np.iinfo(np.int16).max
+        elif audio.sample_width == 4:
+            data = data.astype(np.float32) / np.iinfo(np.int32).max
+        else:
+            raise ValueError("Unsupported bit depth: {}".format(audio.sample_width))
+
+    else:
+        with sf.SoundFile(file_path) as audio:
+            original_sr = audio.samplerate
+            channels = audio.channels
+
+            max_frames = int((start + duration) * original_sr)
+
+            audio.seek(int(start * original_sr))
+            frames_to_read = min(max_frames, len(audio))
+            data = audio.read(frames_to_read)
+
+            if data.max() > 1 or data.min() < -1:
+                data = data / max(abs(data.max()), abs(data.min()))
+
+        if original_sr != target_sr:
+            if channels == 1:
+                data = librosa.resample(data.flatten(), orig_sr=original_sr, target_sr=target_sr)
+            else:
+                data = librosa.resample(data.T, orig_sr=original_sr, target_sr=target_sr)[0]
+        else:
+            if channels != 1:
+                data = data.T[0]
+
+    if data.min() >= 0:
+        data = 2 * data / abs(data.max()) - 1.0
+    else:
+        data = data / max(abs(data.max()), abs(data.min()))
+
+    assert len(data.shape) == 1, data.shape
+    return data
+
+def load_audio(audio_path):
+
+    sr = 16000
+    window_length = int(float(clap_config["window_length"]) * sr)
+    window_overlap = int(float(clap_config["window_overlap"]) * sr)
+    max_num_window = int(clap_config["max_num_window"])
+    duration = max_num_window * (clap_config["window_length"] - clap_config["window_overlap"]) + clap_config["window_overlap"]
+
+    audio_data = read_audio(audio_path, sr, duration, 0.0) # hard code audio start to 0.0
+    T = len(audio_data)
+    num_windows, full_length = get_num_windows(T, sr)
+
+    # pads to the nearest multiple of window_length
+    if full_length > T:
+        audio_data = np.append(audio_data, np.zeros(full_length - T))
+
+    audio_data = audio_data.reshape(1, -1)
+    audio_data_tensor = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float()
+
+    audio_clips = []
+    audio_embed_mask = torch.ones(num_windows)
+    for i in range(num_windows):
+        start = i * (window_length - window_overlap)
+        audio_data_tensor_this = audio_data_tensor[:, start:start+window_length]
+        audio_clips.append(audio_data_tensor_this)
+
+    if len(audio_clips) < max_num_window:
+        audio_clips = audio_clips[:max_num_window]
+        audio_embed_mask = audio_embed_mask[:max_num_window]
+
+    audio_clips = torch.cat(audio_clips)
+
+    return audio_clips, audio_embed_mask
+
+@spaces.GPU
+def predict(filepath, question):
+
+    audio_clips, audio_embed_mask = load_audio(filepath)
+    audio_clips = audio_clips.to(device_id, dtype=cast_dtype, non_blocking=True)
+    audio_embed_mask = audio_embed_mask.to(device_id, dtype=cast_dtype, non_blocking=True)
+
+    text_prompt = str(question).lower()
+    text_output = str(question).lower()
+
+    sample = f"