Wangpeng An
		
	committed on
		
		
					Upload folder using huggingface_hub
Browse files- config.json +52 -0
- mm_projector.bin +3 -0
- pretrain_all.sh +91 -0
- trainer_state.json +0 -0
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,52 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "X": [
         | 
| 3 | 
            +
                "Audio_asr",
         | 
| 4 | 
            +
                "Audio_caption",
         | 
| 5 | 
            +
                "Video",
         | 
| 6 | 
            +
                "Image"
         | 
| 7 | 
            +
              ],
         | 
| 8 | 
            +
              "_name_or_path": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/Meta-Llama-3-8B-Instruct",
         | 
| 9 | 
            +
              "architectures": [
         | 
| 10 | 
            +
                "LlamaForCausalLM"
         | 
| 11 | 
            +
              ],
         | 
| 12 | 
            +
              "attention_bias": false,
         | 
| 13 | 
            +
              "attention_dropout": 0.0,
         | 
| 14 | 
            +
              "bos_token_id": 128000,
         | 
| 15 | 
            +
              "eos_token_id": 128009,
         | 
| 16 | 
            +
              "freeze_mm_mlp_adapter": false,
         | 
| 17 | 
            +
              "hidden_act": "silu",
         | 
| 18 | 
            +
              "hidden_size": 4096,
         | 
| 19 | 
            +
              "image_aspect_ratio": "square",
         | 
| 20 | 
            +
              "image_grid_pinpoints": null,
         | 
| 21 | 
            +
              "initializer_range": 0.02,
         | 
| 22 | 
            +
              "intermediate_size": 14336,
         | 
| 23 | 
            +
              "is_fusion": false,
         | 
| 24 | 
            +
              "max_position_embeddings": 8192,
         | 
| 25 | 
            +
              "mlp_bias": false,
         | 
| 26 | 
            +
              "mm_audio_caption_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
         | 
| 27 | 
            +
              "mm_audio_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr",
         | 
| 28 | 
            +
              "mm_hidden_size": 1024,
         | 
| 29 | 
            +
              "mm_image_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image",
         | 
| 30 | 
            +
              "mm_projector_type": "mlp2x_gelu",
         | 
| 31 | 
            +
              "mm_use_x_patch_token": false,
         | 
| 32 | 
            +
              "mm_use_x_start_end": false,
         | 
| 33 | 
            +
              "mm_video_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge",
         | 
| 34 | 
            +
              "mm_vision_select_feature": "patch",
         | 
| 35 | 
            +
              "mm_vision_select_layer": -2,
         | 
| 36 | 
            +
              "model_type": "llava_llama",
         | 
| 37 | 
            +
              "num_attention_heads": 32,
         | 
| 38 | 
            +
              "num_hidden_layers": 32,
         | 
| 39 | 
            +
              "num_key_value_heads": 8,
         | 
| 40 | 
            +
              "pretraining_tp": 1,
         | 
| 41 | 
            +
              "rms_norm_eps": 1e-05,
         | 
| 42 | 
            +
              "rope_scaling": null,
         | 
| 43 | 
            +
              "rope_theta": 500000.0,
         | 
| 44 | 
            +
              "tie_word_embeddings": false,
         | 
| 45 | 
            +
              "tokenizer_model_max_length": 3072,
         | 
| 46 | 
            +
              "torch_dtype": "bfloat16",
         | 
| 47 | 
            +
              "transformers_version": "4.43.1",
         | 
| 48 | 
            +
              "tune_mm_mlp_adapter": true,
         | 
| 49 | 
            +
              "use_cache": true,
         | 
| 50 | 
            +
              "use_mm_proj": true,
         | 
| 51 | 
            +
              "vocab_size": 128257
         | 
| 52 | 
            +
            }
         | 
    	
        mm_projector.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:e57532ad31d36df8c8add40d8f2f7de3e27a009a19a266f71e1c44550341e2a6
         | 
| 3 | 
            +
            size 41961528
         | 
    	
        pretrain_all.sh
    ADDED
    
    | @@ -0,0 +1,91 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            #!/bin/bash
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Pretrain a multimodal model.
         | 
| 4 | 
            +
            export OMP_NUM_THREADS=8
         | 
| 5 | 
            +
            export NCCL_IB_DISABLE=0
         | 
| 6 | 
            +
            export NCCL_IB_GID_INDEX=3
         | 
| 7 | 
            +
            export NCCL_SOCKET_IFNAME=eth0
         | 
| 8 | 
            +
            export NCCL_DEBUG=INFO
         | 
| 9 | 
            +
            export NCCL_IB_SL=1
         | 
| 10 | 
            +
            export CUDA_DEVICE_MAX_CONNECTIONS=1
         | 
| 11 | 
            +
            export NCCL_BLOCKING_WAIT=1
         | 
| 12 | 
            +
            export NCCL_ASYNC_ERROR_HANDLING=1
         | 
| 13 | 
            +
            export NCCL_TIMEOUT=500
         | 
| 14 | 
            +
            export TORCH_DISTRIBUTED_DEBUG=DETAIL
         | 
| 15 | 
            +
            export NCCL_SOCKET_FAMILY=AF_INET6
         | 
| 16 | 
            +
            export NCCL_NET_PLUGIN=libnccl-net-gcp-fastrak.so
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            # Launch timestamp formatted as yy-mm-dd-HH-MM-SS.
            # NOTE(review): DATETIME is not referenced elsewhere in the visible script.
            DATETIME=$(date +'%y-%m-%d-%H-%M-%S')
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            # Setting for multi nodes training.
         | 
| 21 | 
            +
            # Split the comma-separated list in METIS_WORKER_0_PORT into the `ports` array.
            # `read -a` with IFS=',' avoids the unquoted word-splitting and glob expansion
            # that a backtick `echo | tr` substitution would be subject to.
            IFS=',' read -r -a ports <<< "${METIS_WORKER_0_PORT:-}"
         | 
| 22 | 
            +
            port=${ports[0]}
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            echo "total workers: ${ARNOLD_WORKER_NUM}"
         | 
| 25 | 
            +
            echo "cur worker id: ${ARNOLD_ID}"
         | 
| 26 | 
            +
            echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
         | 
| 27 | 
            +
            echo "master ip: ${METIS_WORKER_0_HOST}"
         | 
| 28 | 
            +
            echo "master port: ${port}"
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            source /mnt/bn/tns-algo-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            # Abort if the project directory is unavailable: the torchrun invocation later in
            # this script uses paths relative to it (llava/train/train_mem.py, ./scripts/zero2.json).
            cd /mnt/bn/tns-algo-public-my2/wangpeng.an/train/OmniFusion-main || exit 1
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            # Install necessary packages
         | 
| 35 | 
            +
            pip3 install requests
         | 
| 36 | 
            +
            pip3 install attrs
         | 
| 37 | 
            +
            pip3 install aiofiles
         | 
| 38 | 
            +
            pip3 install pynvml
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             | 
| 41 | 
            +
            # Print Python executable path, torchrun, deepspeed and PYTHONPATH
         | 
| 42 | 
            +
            # `command -v` is a shell builtin, so the lookup works even where `which` is not installed.
            echo "Python executable: $(command -v python)"
         | 
| 43 | 
            +
            # `command -v` is a shell builtin, so the lookup works even where `which` is not installed.
            echo "torchrun executable: $(command -v torchrun)"
         | 
| 44 | 
            +
            # `command -v` is a shell builtin, so the lookup works even where `which` is not installed.
            echo "deepspeed executable: $(command -v deepspeed)"
         | 
| 45 | 
            +
            echo "PYTHONPATH before torchrun: $PYTHONPATH"
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            sudo chmod 777 /var/lib/fastrak -R
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            # Launch one torchrun agent on this node; topology comes from the ARNOLD_*/METIS_* environment
            # variables and the first entry of `ports`. Expansions are quoted so a value containing
            # whitespace or glob characters cannot be split into extra arguments.
            ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="$ARNOLD_WORKER_GPU" --nnodes="$ARNOLD_WORKER_NUM" --node_rank="$ARNOLD_ID" --master_addr="$METIS_WORKER_0_HOST" --master_port="$port" \
         | 
| 50 | 
            +
                llava/train/train_mem.py \
         | 
| 51 | 
            +
                --deepspeed ./scripts/zero2.json \
         | 
| 52 | 
            +
                --model_name_or_path /mnt/bn/tns-algo-public-my2/wangpeng.an/model/Meta-Llama-3-8B-Instruct \
         | 
| 53 | 
            +
                --version plain \
         | 
| 54 | 
            +
                --data_path /mnt/bn/tns-algo-public-my2/wangpeng.an/data/annotations/video_image_caption_asr_stage1.json \
         | 
| 55 | 
            +
                --audio_asr_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data \
         | 
| 56 | 
            +
                --audio_caption_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/caption_data/0818 \
         | 
| 57 | 
            +
                --video_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/video/Video-LLaVA \
         | 
| 58 | 
            +
                --image_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/video/Video-LLaVA \
         | 
| 59 | 
            +
                --X "Audio_asr" "Audio_caption" "Video" "Image" \
         | 
| 60 | 
            +
                --audio_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
         | 
| 61 | 
            +
                --audio_caption_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
         | 
| 62 | 
            +
                --video_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
         | 
| 63 | 
            +
                --image_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image \
         | 
| 64 | 
            +
                --mm_projector_type mlp2x_gelu \
         | 
| 65 | 
            +
                --tune_mm_mlp_adapter True \
         | 
| 66 | 
            +
                --mm_vision_select_layer -2 \
         | 
| 67 | 
            +
                --mm_use_x_start_end False \
         | 
| 68 | 
            +
                --mm_use_x_patch_token False \
         | 
| 69 | 
            +
                --bf16 True \
         | 
| 70 | 
            +
                --output_dir ./checkpoints/Omni-Pretrain-8B-llama-0209 \
         | 
| 71 | 
            +
                --num_train_epochs 1 \
         | 
| 72 | 
            +
                --per_device_train_batch_size 4 \
         | 
| 73 | 
            +
                --per_device_eval_batch_size 16 \
         | 
| 74 | 
            +
                --gradient_accumulation_steps 2 \
         | 
| 75 | 
            +
                --evaluation_strategy "no" \
         | 
| 76 | 
            +
                --save_strategy "steps" \
         | 
| 77 | 
            +
                --save_steps 2000 \
         | 
| 78 | 
            +
                --save_total_limit 20 \
         | 
| 79 | 
            +
                --learning_rate 1e-3 \
         | 
| 80 | 
            +
                --weight_decay 0. \
         | 
| 81 | 
            +
                --warmup_ratio 0.03 \
         | 
| 82 | 
            +
                --lr_scheduler_type "cosine" \
         | 
| 83 | 
            +
                --logging_steps 1 \
         | 
| 84 | 
            +
                --tf32 True \
         | 
| 85 | 
            +
                --model_max_length 2048 \
         | 
| 86 | 
            +
                --tokenizer_model_max_length 3072 \
         | 
| 87 | 
            +
                --gradient_checkpointing True \
         | 
| 88 | 
            +
                --dataloader_num_workers 8 \
         | 
| 89 | 
            +
                --lazy_preprocess True \
         | 
| 90 | 
            +
                --report_to tensorboard \
         | 
| 91 | 
            +
                --cache_dir "./cache_dir"
         | 
    	
        trainer_state.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  |