dad1909 committed on
Commit
d1c2177
·
verified ·
1 Parent(s): c2ab1f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -3
app.py CHANGED
@@ -1,17 +1,26 @@
1
  import os
2
  import torch
 
3
  from unsloth import FastLanguageModel, is_bfloat16_supported
4
  from trl import SFTTrainer
5
  from transformers import TrainingArguments
6
  from datasets import load_dataset
7
  import gradio as gr
8
 
9
-
10
  max_seq_length = 4096
11
  dtype = None
12
  load_in_4bit = True
13
  hf_token = os.getenv("Token")
14
 
 
 
 
 
 
 
 
 
 
15
  print("Starting model and tokenizer loading...")
16
 
17
  # Load the model and tokenizer
@@ -121,8 +130,10 @@ trainer = SFTTrainer(
121
  weight_decay=0.01,
122
  lr_scheduler_type="linear",
123
  seed=3407,
124
- local_rank=4,
125
  output_dir="outputs",
 
 
 
126
  ),
127
  )
128
  print("Trainer initialized.")
@@ -142,4 +153,6 @@ model.push_to_hub_merged(
142
  save_method="merged_16bit",
143
  token=True
144
  )
145
- print("Model pushed to hub successfully.")
 
 
 
1
  import os
2
  import torch
3
+ import torch.distributed as dist
4
  from unsloth import FastLanguageModel, is_bfloat16_supported
5
  from trl import SFTTrainer
6
  from transformers import TrainingArguments
7
  from datasets import load_dataset
8
  import gradio as gr
9
 
 
10
  max_seq_length = 4096
11
  dtype = None
12
  load_in_4bit = True
13
  hf_token = os.getenv("Token")
14
 
15
def setup_distributed_training():
    """Initialize the torch.distributed process group for multi-GPU training.

    Uses the NCCL backend (GPU collectives) and binds this process to its
    node-local GPU. The original code passed the *global* rank to
    ``torch.cuda.set_device``, which breaks on multi-node runs where the
    global rank can exceed the number of GPUs on a node; launchers such as
    ``torchrun`` export ``LOCAL_RANK`` for exactly this purpose.

    Side effects:
        - Initializes the default process group (no-op if already initialized).
        - Sets the current CUDA device for this process.
    """
    # Guard against double initialization (e.g. if called twice in a notebook).
    if not dist.is_initialized():
        dist.init_process_group(backend='nccl')
    # Prefer LOCAL_RANK (set by torchrun/torch.distributed.launch); fall back
    # to the global rank for single-node launches where they coincide.
    local_rank = int(os.environ.get("LOCAL_RANK", dist.get_rank()))
    torch.cuda.set_device(local_rank)
18
+
19
def cleanup_distributed_training():
    """Tear down the torch.distributed process group if one is active.

    The original called ``dist.destroy_process_group()`` unconditionally,
    which raises when no process group was ever initialized (e.g. setup
    failed or was skipped). Guarding on availability + initialization makes
    cleanup safe to call from any exit path, and idempotent.
    """
    # destroy_process_group raises if the default group does not exist,
    # so only tear down when distributed is both available and initialized.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
21
+
22
+ setup_distributed_training()
23
+
24
  print("Starting model and tokenizer loading...")
25
 
26
  # Load the model and tokenizer
 
130
  weight_decay=0.01,
131
  lr_scheduler_type="linear",
132
  seed=3407,
 
133
  output_dir="outputs",
134
+ # Distributed training arguments
135
+ deepspeed=None, # If using deepspeed for further optimizations
136
+ local_rank=dist.get_rank(), # Add this line
137
  ),
138
  )
139
  print("Trainer initialized.")
 
153
  save_method="merged_16bit",
154
  token=True
155
  )
156
+ print("Model pushed to hub successfully.")
157
+
158
+ cleanup_distributed_training()