epfl-dlab
/

Phi3_French_Hypertokenizer_64HT

+{
+  "added_tokens_decoder": {},
+  "clean_up_tokenization_spaces": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}

trainer_config.yaml ADDED Viewed

+cls: HF
+base_tokenizer_path: microsoft/Phi-3-mini-128k-instruct
+dataset:
+  path: allenai/c4
+  data_dir: fr
+  name: c4_fr
+  split: train
+  column: text
+target_num_hyper_token: 64
+batch_size: 1000
+total_training_size: 100000