mtasic85 committed on
Commit
3bd4051
·
1 Parent(s): c5afc4e

cpt core 4

Browse files
Files changed (1) hide show
  1. scripts/cpt_core_model_4.py +13 -21
scripts/cpt_core_model_4.py CHANGED
@@ -1,6 +1,6 @@
1
  from unsloth import FastLanguageModel
2
  import torch
3
- # from transformers import AutoTokenizer
4
 
5
  max_seq_length = 16384
6
  dtype = torch.bfloat16
@@ -20,12 +20,10 @@ model, tokenizer = FastLanguageModel.from_pretrained(
20
  dtype=dtype,
21
  load_in_4bit=load_in_4bit,
22
  )
23
-
24
  print(f'{model=}')
25
 
26
  # print('Ignore loaded tokenizer by FastLanguageModel.from_pretrained and using AutoTokenizer.from_pretrained')
27
  # tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
28
-
29
  # print(f'{tokenizer=}')
30
 
31
  model = FastLanguageModel.get_peft_model(
@@ -69,33 +67,28 @@ final_dataset = concatenate_datasets(core_datasets)
69
  print(f'{final_dataset=}')
70
  '''
71
 
 
72
  from litdata import TokensLoader, StreamingDataset
73
 
74
- dataset = StreamingDataset(
 
75
  input_dir=dataset_input_dir,
76
  item_loader=TokensLoader(block_size=dataset_block_size),
77
  )
78
 
79
 
80
- def unlsoth_generator(dataset):
81
- for batch in dataset:
82
- print(batch)
83
-
84
- yield {
85
- 'input_ids': batch['input_ids'].tolist() # Convert tensor to list
86
- }
87
 
 
 
 
88
  break
89
- # # Assuming TokensLoader returns tensors with 'input_ids'
90
- # yield {
91
- # 'input_ids': batch['input_ids'].tolist() # Convert tensor to list
92
- # }
93
 
94
- for n in unlsoth_generator(dataset):
95
- print(n)
96
- break
97
 
98
- '''
 
 
99
  from trl import SFTTrainer
100
  from transformers import TrainingArguments
101
  from unsloth import is_bfloat16_supported
@@ -105,7 +98,7 @@ from unsloth import UnslothTrainer, UnslothTrainingArguments
105
  trainer = UnslothTrainer(
106
  model=model,
107
  tokenizer=tokenizer,
108
- train_dataset=final_dataset,
109
  dataset_text_field='text',
110
  max_seq_length=max_seq_length,
111
  dataset_num_proc=32,
@@ -133,4 +126,3 @@ trainer = UnslothTrainer(
133
  )
134
 
135
  trainer_stats = trainer.train()
136
- '''
 
1
  from unsloth import FastLanguageModel
2
  import torch
3
+ from transformers import AutoTokenizer
4
 
5
  max_seq_length = 16384
6
  dtype = torch.bfloat16
 
20
  dtype=dtype,
21
  load_in_4bit=load_in_4bit,
22
  )
 
23
  print(f'{model=}')
24
 
25
  # print('Ignore loaded tokenizer by FastLanguageModel.from_pretrained and using AutoTokenizer.from_pretrained')
26
  # tokenizer = AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True)
 
27
  # print(f'{tokenizer=}')
28
 
29
  model = FastLanguageModel.get_peft_model(
 
67
  print(f'{final_dataset=}')
68
  '''
69
 
70
+ from datasets import Dataset
71
  from litdata import TokensLoader, StreamingDataset
72
 
73
+
74
+ litgpt_streaming_dataset = StreamingDataset(
75
  input_dir=dataset_input_dir,
76
  item_loader=TokensLoader(block_size=dataset_block_size),
77
  )
78
 
79
 
80
def unlsoth_generator():
    """Yield one example dict per item of the streaming dataset.

    Iterates the module-level ``litgpt_streaming_dataset`` and wraps every
    item under the ``'input_ids'`` key, so the generator can be handed to
    ``Dataset.from_generator``.

    Yields:
        dict: ``{'input_ids': item}`` for each item of the dataset, in
        iteration order. Nothing is yielded when the dataset is empty.
    """
    # Reading a module-level name does not need a `global` declaration,
    # so the previous `global litgpt_streaming_dataset` line is dropped.
    for batch in litgpt_streaming_dataset:
        # NOTE(review): each item is presumably a block of token ids produced
        # by TokensLoader - confirm the trainer accepts it as-is.
        #
        # A debugging `break` used to follow this yield; it ended the
        # generator after the first item, leaving a one-example dataset.
        yield {'input_ids': batch}
 
 
 
 
87
 
 
 
 
88
 
89
+ train_dataset = Dataset.from_generator(unlsoth_generator, streaming=True)
90
+
91
+
92
  from trl import SFTTrainer
93
  from transformers import TrainingArguments
94
  from unsloth import is_bfloat16_supported
 
98
  trainer = UnslothTrainer(
99
  model=model,
100
  tokenizer=tokenizer,
101
+ train_dataset=train_dataset,
102
  dataset_text_field='text',
103
  max_seq_length=max_seq_length,
104
  dataset_num_proc=32,
 
126
  )
127
 
128
  trainer_stats = trainer.train()