multitensor committed · Commit 5d37bc2 · verified · 1 Parent(s): 9555439

Upload folder using huggingface_hub

Files changed (4)
  1. config.json +52 -0
  2. mm_projector.bin +3 -0
  3. pretrain_all.sh +91 -0
  4. trainer_state.json +0 -0
config.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "X": [
+     "Audio_asr",
+     "Audio_caption",
+     "Video",
+     "Image"
+   ],
+   "_name_or_path": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/Meta-Llama-3-8B-Instruct",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 128000,
+   "eos_token_id": 128009,
+   "freeze_mm_mlp_adapter": false,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "image_aspect_ratio": "square",
+   "image_grid_pinpoints": null,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "is_fusion": false,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "mm_audio_caption_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
+   "mm_audio_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr",
+   "mm_hidden_size": 1024,
+   "mm_image_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image",
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_use_x_patch_token": false,
+   "mm_use_x_start_end": false,
+   "mm_video_tower": "/mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge",
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "model_type": "llava_llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 500000.0,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 3072,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.43.1",
+   "tune_mm_mlp_adapter": true,
+   "use_cache": true,
+   "use_mm_proj": true,
+   "vocab_size": 128257
+ }
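
For quick inspection, a minimal Python sketch of reading this config. Note that "llava_llama" is a custom model_type registered by this repo's training code, so transformers' AutoConfig would not resolve it on its own; plain JSON parsing is enough to look at the fields (the local filename is an assumption):

import json

# Read the multimodal config as plain JSON; no custom model class needed.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])         # llava_llama
print(cfg["X"])                  # enabled modalities: Audio_asr, Audio_caption, Video, Image
print(cfg["mm_projector_type"])  # mlp2x_gelu
print(cfg["mm_hidden_size"], "->", cfg["hidden_size"])  # projector in/out dims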
mm_projector.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e57532ad31d36df8c8add40d8f2f7de3e27a009a19a266f71e1c44550341e2a6
+ size 41961528
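
This is a Git LFS pointer, so the actual weights live in LFS storage. A minimal sketch, assuming the real file has been downloaded to the working directory, that verifies it against the pointer and lists the projector tensors:

import hashlib
import os

import torch

path = "mm_projector.bin"  # assumed local download of the LFS object
expected_oid = "e57532ad31d36df8c8add40d8f2f7de3e27a009a19a266f71e1c44550341e2a6"

# Check size and sha256 against the LFS pointer above.
assert os.path.getsize(path) == 41961528
with open(path, "rb") as f:
    assert hashlib.sha256(f.read()).hexdigest() == expected_oid

# List the projector weights (a state dict of tensors).
state = torch.load(path, map_location="cpu")
for name, tensor in state.items():
    print(name, tuple(tensor.shape), tensor.dtype)

The ~42 MB size is consistent with the config above: an mlp2x_gelu projector mapping mm_hidden_size 1024 to hidden_size 4096 has roughly 21M parameters, or about 42 MB in bfloat16.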
pretrain_all.sh ADDED
@@ -0,0 +1,91 @@
+ #!/bin/bash
+
+ # Pretrain a multimodal model.
+ export OMP_NUM_THREADS=8
+ export NCCL_IB_DISABLE=0
+ export NCCL_IB_GID_INDEX=3
+ export NCCL_SOCKET_IFNAME=eth0
+ export NCCL_DEBUG=INFO
+ export NCCL_IB_SL=1
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+ export NCCL_BLOCKING_WAIT=1
+ export NCCL_ASYNC_ERROR_HANDLING=1
+ export NCCL_TIMEOUT=500
+ export TORCH_DISTRIBUTED_DEBUG=DETAIL
+ export NCCL_SOCKET_FAMILY=AF_INET6
+ export NCCL_NET_PLUGIN=libnccl-net-gcp-fastrak.so
+
+ DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
+
+ # Settings for multi-node training.
+ ports=(`echo $METIS_WORKER_0_PORT | tr ',' ' '`)
+ port=${ports[0]}
+
+ echo "total workers: ${ARNOLD_WORKER_NUM}"
+ echo "cur worker id: ${ARNOLD_ID}"
+ echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
+ echo "master ip: ${METIS_WORKER_0_HOST}"
+ echo "master port: ${port}"
+
+ source /mnt/bn/tns-algo-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
+
+ cd /mnt/bn/tns-algo-public-my2/wangpeng.an/train/OmniFusion-main
+
+ # Install necessary packages.
+ pip3 install requests
+ pip3 install attrs
+ pip3 install aiofiles
+ pip3 install pynvml
+
+ # Print the Python, torchrun, and deepspeed executable paths and PYTHONPATH.
+ echo "Python executable: $(which python)"
+ echo "torchrun executable: $(which torchrun)"
+ echo "deepspeed executable: $(which deepspeed)"
+ echo "PYTHONPATH before torchrun: $PYTHONPATH"
+
+ sudo chmod 777 /var/lib/fastrak -R
+
+ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
+     llava/train/train_mem.py \
+     --deepspeed ./scripts/zero2.json \
+     --model_name_or_path /mnt/bn/tns-algo-public-my2/wangpeng.an/model/Meta-Llama-3-8B-Instruct \
+     --version plain \
+     --data_path /mnt/bn/tns-algo-public-my2/wangpeng.an/data/annotations/video_image_caption_asr_stage1.json \
+     --audio_asr_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data \
+     --audio_caption_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/caption_data/0818 \
+     --video_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/video/Video-LLaVA \
+     --image_folder /mnt/bn/tns-algo-public-my2/wangpeng.an/data/video/Video-LLaVA \
+     --X "Audio_asr" "Audio_caption" "Video" "Image" \
+     --audio_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
+     --audio_caption_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
+     --video_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
+     --image_tower /mnt/bn/tns-algo-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Image \
+     --mm_projector_type mlp2x_gelu \
+     --tune_mm_mlp_adapter True \
+     --mm_vision_select_layer -2 \
+     --mm_use_x_start_end False \
+     --mm_use_x_patch_token False \
+     --bf16 True \
+     --output_dir ./checkpoints/Omni-Pretrain-8B-llama-0209 \
+     --num_train_epochs 1 \
+     --per_device_train_batch_size 4 \
+     --per_device_eval_batch_size 16 \
+     --gradient_accumulation_steps 2 \
+     --evaluation_strategy "no" \
+     --save_strategy "steps" \
+     --save_steps 2000 \
+     --save_total_limit 20 \
+     --learning_rate 1e-3 \
+     --weight_decay 0. \
+     --warmup_ratio 0.03 \
+     --lr_scheduler_type "cosine" \
+     --logging_steps 1 \
+     --tf32 True \
+     --model_max_length 2048 \
+     --tokenizer_model_max_length 3072 \
+     --gradient_checkpointing True \
+     --dataloader_num_workers 8 \
+     --lazy_preprocess True \
+     --report_to tensorboard \
+     --cache_dir "./cache_dir"
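
One derived number worth noting: the effective global batch size depends on the Arnold worker topology, which the script reads from the environment at launch time. A minimal sketch of the arithmetic, assuming an 8-GPU worker and 2 nodes (both values are deployment-specific, not fixed by the script):

# Effective global batch size = per-device batch * grad accumulation * total GPUs.
per_device = 4      # --per_device_train_batch_size
grad_accum = 2      # --gradient_accumulation_steps
gpus_per_node = 8   # assumed ARNOLD_WORKER_GPU
nodes = 2           # assumed ARNOLD_WORKER_NUM

print(per_device * grad_accum * gpus_per_node * nodes)  # 128 under these assumptions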
trainer_state.json ADDED
The diff for this file is too large to render.