Twelve2five committed
Commit 3e7a8e3 · verified · 1 Parent(s): c9681ef

Update app.py

Files changed (1)
app.py +43 -18
app.py CHANGED
@@ -14,6 +14,7 @@ from huggingface_hub import snapshot_download
 from tqdm import tqdm
 import gradio as gr
 import math
+from accelerate import Accelerator

 # --- Configuration ---
 YOUR_HF_USERNAME = "Twelve2five"
@@ -38,6 +39,17 @@ WARMUP_RATIO = 0.03
 LR_SCHEDULER = "cosine"
 OPTIMIZER = "paged_adamw_8bit"

+# Multi-GPU configuration
+accelerator = Accelerator()
+
+# Configure environment for multi-GPU
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
+
+# Print GPU information
+print(f"Available GPUs: {torch.cuda.device_count()}")
+for i in range(torch.cuda.device_count()):
+    print(f"GPU {i}: {torch.cuda.get_device_name(i)} with {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
+
 def seq2seq_causal_collator(features):
     """
     Collator that concatenates context (input_ids) and target (labels)
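Note: PYTORCH_CUDA_ALLOC_CONF is read when PyTorch's CUDA caching allocator initializes, so the assignment above only takes effect if it runs before the first GPU allocation. A minimal ordering sketch (my illustration, not part of the commit):

    import os
    # Must be set before anything allocates on the GPU.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

    import torch
    x = torch.ones(1, device="cuda")  # allocator initializes here, with the config applied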
@@ -128,19 +140,24 @@ def load_model():
         bnb_4bit_use_double_quant=True,
     )

-    # --- Load Base Model (with quantization) ---
-    try:
-        model = AutoModelForCausalLM.from_pretrained(
-            hf_model_repo_id,
-            quantization_config=bnb_config,
-            device_map="auto",
-            trust_remote_code=True
-        )
-        print(f"Loaded model vocab size: {model.config.vocab_size}")
-        print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return None
+    # Configure for multi-GPU
+    device_map = "auto"  # Let the library automatically distribute across GPUs
+
+    # For 4x L4 GPUs (24GB each)
+    max_memory = {i: "22GB" for i in range(torch.cuda.device_count())}
+    max_memory["cpu"] = "32GB"  # Allow some CPU offloading if needed
+
+    model = AutoModelForCausalLM.from_pretrained(
+        hf_model_repo_id,
+        quantization_config=bnb_config,
+        device_map=device_map,
+        max_memory=max_memory,
+        trust_remote_code=True,
+        use_cache=False,
+        torch_dtype=torch.float16,
+    )
+    print(f"Loaded model vocab size: {model.config.vocab_size}")
+    print(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")

     # --- Prepare for K-bit Training & Apply LoRA ---
     model = prepare_model_for_kbit_training(model)
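Since the model is loaded with device_map="auto" and a per-device max_memory budget, Accelerate picks the layer placement at load time. One way to verify where the weights actually landed, a sketch assuming the load succeeded (not part of the commit):

    from collections import Counter

    # Models loaded with a device_map expose the final placement here.
    print(model.hf_device_map)                    # module name -> device index / "cpu"
    print(Counter(model.hf_device_map.values()))  # how many modules ended up on each device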
@@ -272,10 +289,13 @@ def train_model(progress=gr.Progress()):
         warmup_steps=num_warmup_steps,
         lr_scheduler_type=LR_SCHEDULER,
         report_to="tensorboard",
-        fp16=False,
-        bf16=True if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else False,
+        fp16=True,
+        bf16=False,
         gradient_checkpointing=True,
-        gradient_checkpointing_kwargs={'use_reentrant': False},
+        gradient_checkpointing_kwargs={"use_reentrant": False},
+        ddp_find_unused_parameters=False,
+        local_rank=int(os.getenv("LOCAL_RANK", -1)),
+        dataloader_num_workers=4,
     )

     trainer = Trainer(
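local_rank is only meaningful when a distributed launcher sets the LOCAL_RANK environment variable for each worker; under a plain `python app.py` it stays -1 and the DDP-specific options are inert. An illustrative check (the launch commands are assumptions about the intended 4-GPU setup, not from the commit):

    # e.g. torchrun --nproc_per_node=4 app.py
    # or   accelerate launch --num_processes 4 app.py
    import os

    local_rank = int(os.getenv("LOCAL_RANK", -1))
    if local_rank == -1:
        print("Single-process run; DDP options are ignored.")
    else:
        print(f"Distributed worker for GPU {local_rank}")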
@@ -285,6 +305,10 @@ def train_model(progress=gr.Progress()):
         data_collator=seq2seq_causal_collator,
     )

+    # Print memory usage before training
+    for i in range(torch.cuda.device_count()):
+        print(f"GPU {i} memory: {torch.cuda.memory_allocated(i) / 1e9:.2f}GB allocated, {torch.cuda.memory_reserved(i) / 1e9:.2f}GB reserved")
+
     progress(0.5, desc="Starting training...")
     # Clear cache before starting
     gc.collect()
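The per-GPU memory report added here is useful at more than one point (before training, after training, on an OOM). A small helper that wraps it, my sketch rather than the commit's code:

    import torch

    def report_gpu_memory(tag: str) -> None:
        """Print allocated/reserved CUDA memory for every visible device."""
        for i in range(torch.cuda.device_count()):
            alloc = torch.cuda.memory_allocated(i) / 1e9
            reserved = torch.cuda.memory_reserved(i) / 1e9
            print(f"[{tag}] GPU {i}: {alloc:.2f} GB allocated, {reserved:.2f} GB reserved")

    # usage: report_gpu_memory("before training")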
@@ -292,6 +316,7 @@ def train_model(progress=gr.Progress()):
     torch.cuda.empty_cache()

     try:
+        # Train distributed across GPUs
         train_result = trainer.train()

         progress(0.9, desc="Saving model...")
@@ -359,6 +384,6 @@ if __name__ == "__main__":
     # Install dependencies first if needed
     # !pip install -q -U transformers accelerate bitsandbytes peft torch datasets huggingface_hub gradio

-    # Create and launch the U
+    # Create and launch the UI
     demo = create_ui()
-    demo.launch()
+    demo.launch()
 