mtasic85 committed on
Commit c5afc4e · 1 Parent(s): 2b6c108

cpt core 4

Files changed (2)
  1. README.md +3 -1
  2. scripts/cpt_core_model_4.py +32 -0
README.md CHANGED
@@ -402,10 +402,12 @@ litgpt convert_pretrained_checkpoint ../out/pretrain-core-3/final ../out/pretrai
 ```bash
 litgpt convert_from_litgpt ../out/pretrain-core-3/final ../out/pretrain-core-3/hf
 cp ../config-3.json ../out/pretrain-core-3/hf/config.json
+cp -rv ../tokenizer/* ../out/pretrain-core-3/hf
+python -B convert_pth_to_safetensors.py
 ```
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 CUDA_LAUNCH_BLOCKING=0
+python -B cpt_core_model_4.py
 ```
 
 ```
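Note: `convert_pth_to_safetensors.py` itself is not part of this commit, so its contents are an assumption here. A minimal sketch of such a conversion, assuming `litgpt convert_from_litgpt` left a `model.pth` state dict in the `hf` directory and using the real `safetensors.torch.save_file` API:

```python
# Hypothetical sketch of convert_pth_to_safetensors.py; the actual script
# is not included in this commit, and the file names are assumptions.
import torch
from safetensors.torch import save_file

# Load the state dict written by `litgpt convert_from_litgpt` (assumed name).
state_dict = torch.load('../out/pretrain-core-3/hf/model.pth', map_location='cpu')

# safetensors rejects shared or non-contiguous tensor storage,
# so clone and make each tensor contiguous before saving.
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}

save_file(state_dict, '../out/pretrain-core-3/hf/model.safetensors')
```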
scripts/cpt_core_model_4.py CHANGED
@@ -8,6 +8,12 @@ load_in_4bit = True
 model_name = '../out/pretrain-core-3/hf'
 output_dir = '../out/cpt-core-4'
 
+dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
+dataset_block_size = 16385
+
+#
+# model
+#
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
     max_seq_length=max_seq_length,
@@ -63,6 +69,32 @@ final_dataset = concatenate_datasets(core_datasets)
 print(f'{final_dataset=}')
 '''
 
+from litdata import TokensLoader, StreamingDataset
+
+dataset = StreamingDataset(
+    input_dir=dataset_input_dir,
+    item_loader=TokensLoader(block_size=dataset_block_size),
+)
+
+
+def unlsoth_generator(dataset):
+    for batch in dataset:
+        print(batch)
+
+        yield {
+            'input_ids': batch['input_ids'].tolist()  # Convert tensor to list
+        }
+
+        break
+        # # Assuming TokensLoader returns tensors with 'input_ids'
+        # yield {
+        #     'input_ids': batch['input_ids'].tolist()  # Convert tensor to list
+        # }
+
+for n in unlsoth_generator(dataset):
+    print(n)
+    break
+
 '''
 from trl import SFTTrainer
 from transformers import TrainingArguments
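The added loader code is clearly still exploratory: the generator prints each batch, stops after one item, and the commented-out block shows the author is unsure whether `TokensLoader` items carry an `'input_ids'` key. In litdata, `TokensLoader` normally yields a flat 1-D tensor of `block_size` token ids per item, so `batch['input_ids']` would likely fail; the block size of 16385 is presumably `max_seq_length + 1`, leaving room for the usual one-token shift between inputs and targets. A minimal sketch, under those assumptions, of adapting the stream into a Hugging Face dataset that TRL's `SFTTrainer` could consume:

```python
# Sketch only: wrap the litdata token stream for use as a HF dataset.
# Assumes each StreamingDataset item is a flat 1-D tensor of
# dataset_block_size token ids (not a dict); verify for your litdata version.
from datasets import Dataset
from litdata import StreamingDataset, TokensLoader

dataset_input_dir = '../core-data-4-8193-16385-16385-1000/'
dataset_block_size = 16385

streaming_dataset = StreamingDataset(
    input_dir=dataset_input_dir,
    item_loader=TokensLoader(block_size=dataset_block_size),
)


def token_block_generator():
    # Re-shape each token block into the dict form expected downstream.
    for tokens in streaming_dataset:
        yield {'input_ids': tokens.tolist()}


train_dataset = Dataset.from_generator(token_block_generator)
```

Whether `SFTTrainer` accepts pre-tokenized `input_ids` without a formatting function depends on the TRL version, so treat this as a starting point rather than the commit's finished wiring.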