Wangpeng An committed on
Commit d10a61f · verified · 1 Parent(s): 124ce99

Delete finetune_all_multinode_stage2.sh

Files changed (1)
  1. finetune_all_multinode_stage2.sh +0 -93
finetune_all_multinode_stage2.sh DELETED
@@ -1,93 +0,0 @@
- #!/bin/bash
-
- # Pretrain a multimodal model.
- export OMP_NUM_THREADS=8
- export NCCL_IB_DISABLE=0
- export NCCL_IB_GID_INDEX=3
- export NCCL_SOCKET_IFNAME=eth0
- export NCCL_DEBUG=INFO
- export NCCL_IB_SL=1
- export CUDA_DEVICE_MAX_CONNECTIONS=1
- export NCCL_BLOCKING_WAIT=1
- export NCCL_ASYNC_ERROR_HANDLING=1
- export NCCL_TIMEOUT=500
- export TORCH_DISTRIBUTED_DEBUG=DETAIL
-
- DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
-
- # Setting for multi nodes training.
- ports=(`echo $METIS_WORKER_0_PORT | tr ',' ' '`)
- port=${ports[0]}
-
- echo "total workers: ${ARNOLD_WORKER_NUM}"
- echo "cur worker id: ${ARNOLD_ID}"
- echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
- echo "master ip: ${METIS_WORKER_0_HOST}"
- echo "master port: ${port}"
-
- source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
-
- cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main
-
- # Install necessary packages
- pip3 install requests
- pip3 install attrs
- pip3 install aiofiles
- pip3 install pynvml
-
-
- # Print Python executable path, torchrun, deepspeed and PYTHONPATH
- echo "Python executable: $(which python)"
- echo "torchrun executable: $(which torchrun)"
- echo "deepspeed executable: $(which deepspeed)"
- echo "PYTHONPATH before torchrun: $PYTHONPATH"
-
- sudo chmod 777 /var/lib/fastrak -R
-
- # Launch training with DeepSpeed and torchrun
- ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
- llava/train/train_mem.py \
- --deepspeed ./scripts/zero2.json \
- --model_name_or_path ./checkpoints/OmniFusion-8B \
- --version llama_3_1 \
- --data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/stage2_new_1016.json \
- --audio_asr_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data \
- --audio_caption_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/audio_caption_data_tune/audio_caption_tune/audio_caption \
- --video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/videos_images_tune/video_images_tune/videos_images_tune \
- --image_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/videos_images_tune/video_images_tune/videos_images_tune \
- --X "Audio_asr" "Audio_caption" "Video" "Image" \
- --audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
- --audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
- --video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
- --image_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image \
- --pretrain_mm_mlp_adapter checkpoints/Video-LLaVA-Pretrain-7B/checkpoint-4000/mm_projector.bin \
- --mm_projector_type mlp2x_gelu \
- --mm_vision_select_layer -2 \
- --mm_use_x_start_end False \
- --mm_use_x_patch_token False \
- --image_aspect_ratio pad \
- --group_by_modality_length True \
- --tune_mm_mlp_adapter False \
- --bf16 True \
- --output_dir ./checkpoints/OmniFusion-8B-stage3-1017 \
- --num_train_epochs 1 \
- --per_device_train_batch_size 8 \
- --per_device_eval_batch_size 4 \
- --gradient_accumulation_steps 1 \
- --evaluation_strategy "no" \
- --save_strategy "steps" \
- --save_steps 3000 \
- --save_total_limit 4 \
- --learning_rate 2e-5 \
- --weight_decay 0. \
- --warmup_ratio 0.03 \
- --lr_scheduler_type "cosine" \
- --logging_steps 1 \
- --tf32 True \
- --model_max_length 2048 \
- --tokenizer_model_max_length 3072 \
- --gradient_checkpointing True \
- --dataloader_num_workers 8 \
- --lazy_preprocess True \
- --report_to none \
- --cache_dir "./cache_dir"
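
For reference, the deleted script follows the standard multi-node torchrun launch pattern: every node runs the same command, differing only in its node rank, and all processes rendezvous at the rank-0 host and port. In the script above those values come from the cluster scheduler (ARNOLD_WORKER_GPU, ARNOLD_WORKER_NUM, ARNOLD_ID, METIS_WORKER_0_HOST, and the first entry of the comma-separated METIS_WORKER_0_PORT). A minimal sketch of the same pattern with generic placeholders (NNODES, NODE_RANK, GPUS_PER_NODE, MASTER_ADDR, MASTER_PORT, and train.py are assumptions, not values from the original script):

#!/bin/bash
# Minimal multi-node torchrun launch sketch (hypothetical values;
# the deleted script derives these from ARNOLD_*/METIS_* variables).
NNODES=2                # total number of nodes (assumed)
NODE_RANK=0             # this node's rank, 0..NNODES-1 (assumed)
GPUS_PER_NODE=8         # GPUs on each node (assumed)
MASTER_ADDR=10.0.0.1    # hypothetical rank-0 host
MASTER_PORT=29500       # hypothetical rendezvous port

torchrun \
    --nproc_per_node=$GPUS_PER_NODE \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=$MASTER_PORT \
    train.py --deepspeed ./scripts/zero2.json  # train.py stands in for llava/train/train_mem.py

Launching one torchrun per node with --nproc_per_node equal to the local GPU count spawns one training process per GPU, which is how the deleted script combined torchrun process management with DeepSpeed ZeRO-2 (./scripts/zero2.json) for optimizer-state sharding.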