THUdyh committed on
Commit
f996a4a
·
verified ·
1 Parent(s): 365bbd5

Add files using upload-large-folder tool

Browse files
Files changed (1) hide show
  1. config.json +3 -49
config.json CHANGED
@@ -12,65 +12,19 @@
12
  "freeze_speech_adapter": false,
13
  "hidden_act": "silu",
14
  "hidden_size": 3584,
15
- "image_aspect_ratio": "anyres_genli",
16
- "image_crop_resolution": 224,
17
- "image_grid_pinpoints": [
18
- [
19
- 336,
20
- 672
21
- ],
22
- [
23
- 672,
24
- 336
25
- ],
26
- [
27
- 672,
28
- 672
29
- ],
30
- [
31
- 1008,
32
- 336
33
- ],
34
- [
35
- 336,
36
- 1008
37
- ],
38
- [
39
- 1008,
40
- 672
41
- ],
42
- [
43
- 672,
44
- 1008
45
- ],
46
- [
47
- 1008,
48
- 1008
49
- ]
50
- ],
51
- "image_split_resolution": 224,
52
  "initializer_range": 0.02,
53
  "intermediate_size": 18944,
54
  "max_position_embeddings": 32768,
55
  "max_window_layers": 28,
56
  "mm_hidden_size": 1152,
57
- "mm_newline_position": "no_token",
58
- "mm_patch_merge_type": "genli",
59
- "mm_pooling_position": "before",
60
- "mm_projector_lr": null,
61
  "mm_projector_type": "ola_mlp",
62
- "mm_resampler_type": null,
63
- "mm_spatial_pool_mode": "average",
64
- "mm_spatial_pool_out_channels": null,
65
- "mm_spatial_pool_stride": 2,
66
  "mm_use_im_patch_token": false,
67
  "mm_use_im_start_end": false,
68
  "mm_vision_select_feature": "patch",
69
  "mm_vision_select_layer": -1,
70
- "music_encoder": "/path/to/beats",
71
- "mm_vision_tower": "/path/to/oryx_vit",
72
  "mm_vision_tower_lr": null,
73
- "mm_vision_tower_two": null,
74
  "modality_max_length": "None",
75
  "model_type": "ola_qwen",
76
  "num_attention_heads": 28,
@@ -80,7 +34,7 @@
80
  "rope_scaling": null,
81
  "rope_theta": 1000000.0,
82
  "sliding_window": null,
83
- "speech_encoder": "/path/to/whisper",
84
  "speech_encoder_ds_rate": 10,
85
  "speech_encoder_hidden_size": 2048,
86
  "speech_encoder_type": "dual",
 
12
  "freeze_speech_adapter": false,
13
  "hidden_act": "silu",
14
  "hidden_size": 3584,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "initializer_range": 0.02,
16
  "intermediate_size": 18944,
17
  "max_position_embeddings": 32768,
18
  "max_window_layers": 28,
19
  "mm_hidden_size": 1152,
 
 
 
 
20
  "mm_projector_type": "ola_mlp",
 
 
 
 
21
  "mm_use_im_patch_token": false,
22
  "mm_use_im_start_end": false,
23
  "mm_vision_select_feature": "patch",
24
  "mm_vision_select_layer": -1,
25
+ "music_encoder": "./pretrained/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt",
26
+ "mm_vision_tower": "./pretrained/oryx_vit.pth",
27
  "mm_vision_tower_lr": null,
 
28
  "modality_max_length": "None",
29
  "model_type": "ola_qwen",
30
  "num_attention_heads": 28,
 
34
  "rope_scaling": null,
35
  "rope_theta": 1000000.0,
36
  "sliding_window": null,
37
+ "speech_encoder": "./pretrained/large-v3.pt",
38
  "speech_encoder_ds_rate": 10,
39
  "speech_encoder_hidden_size": 2048,
40
  "speech_encoder_type": "dual",