zetavg
committed on
update fine-tune resuming related stuff
llama_lora/ui/finetune_ui.py
CHANGED
@@ -316,6 +316,13 @@ def do_train(
     resume_from_checkpoint = os.path.join(Global.data_dir, "lora_models", continue_from_model)
     if continue_from_checkpoint:
         resume_from_checkpoint = os.path.join(resume_from_checkpoint, continue_from_checkpoint)
+        will_be_resume_from_checkpoint_file = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
+        if not os.path.exists(will_be_resume_from_checkpoint_file):
+            raise ValueError(f"Unable to resume from checkpoint {continue_from_model}/{continue_from_checkpoint}. Resuming is only possible from checkpoints stored locally in the data directory. Please ensure that the file '{will_be_resume_from_checkpoint_file}' exists.")
+    else:
+        will_be_resume_from_checkpoint_file = os.path.join(resume_from_checkpoint, "adapter_model.bin")
+        if not os.path.exists(will_be_resume_from_checkpoint_file):
+            raise ValueError(f"Unable to continue from model {continue_from_model}. Continuation is only possible from models stored locally in the data directory. Please ensure that the file '{will_be_resume_from_checkpoint_file}' exists.")

     output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
     if os.path.exists(output_dir):
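For context, the added check distinguishes two cases: when a specific checkpoint is selected, training resumes from a Trainer checkpoint directory that must contain pytorch_model.bin; otherwise training continues from a previously saved LoRA model whose directory must contain adapter_model.bin. Below is a minimal standalone sketch of the same validation; the resolve_resume_path helper and its data_dir parameter are illustrative names, not part of the repository, which performs this check inline in do_train.

import os


def resolve_resume_path(data_dir, continue_from_model, continue_from_checkpoint=None):
    # Hypothetical helper mirroring the check added in this commit.
    resume_from_checkpoint = os.path.join(data_dir, "lora_models", continue_from_model)
    if continue_from_checkpoint:
        # Resuming an interrupted run: the Trainer checkpoint directory
        # (e.g. "checkpoint-500") must contain the full training state.
        resume_from_checkpoint = os.path.join(resume_from_checkpoint, continue_from_checkpoint)
        required_file = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
    else:
        # Continuing from a finished LoRA model: only the adapter weights are required.
        required_file = os.path.join(resume_from_checkpoint, "adapter_model.bin")
    if not os.path.exists(required_file):
        raise ValueError(f"Cannot resume: expected '{required_file}' to exist locally.")
    return resume_from_checkpoint

The returned path would then typically be handed to the training routine, for example as the resume_from_checkpoint argument of a Hugging Face Trainer-style train call.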
lora_models/unhelpful-ai-v01/finetune_params.json
CHANGED
@@ -1,8 +1,9 @@
 {
-  "num_train_epochs":
+  "num_train_epochs": 8,
   "learning_rate": 0.0003,
   "cutoff_len": 512,
-  "
+  "val_set_size": 0,
+  "lora_r": 16,
   "lora_alpha": 32,
   "lora_dropout": 0.05,
   "lora_target_modules": [
@@ -11,9 +12,5 @@
     "k_proj",
     "o_proj"
   ],
-  "train_on_inputs": false,
-  "group_by_length": false,
-  "save_steps": 500,
-  "save_total_limit": 5,
-  "logging_steps": 10
+  "train_on_inputs": false
 }
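For reference, the lora_* fields in this params file line up with the arguments of peft.LoraConfig, while fields such as num_train_epochs, learning_rate, cutoff_len, val_set_size and train_on_inputs are consumed by the fine-tuning routine itself. A hedged sketch of reading the file and building the LoRA config follows; the file path is taken from this commit, while the bias and task_type values are illustrative assumptions not stored in the params file.

import json

from peft import LoraConfig

with open("lora_models/unhelpful-ai-v01/finetune_params.json") as f:
    params = json.load(f)

# Map the lora_* fields onto peft.LoraConfig; values in comments reflect this commit.
lora_config = LoraConfig(
    r=params["lora_r"],                            # 16
    lora_alpha=params["lora_alpha"],               # 32
    lora_dropout=params["lora_dropout"],           # 0.05
    target_modules=params["lora_target_modules"],  # includes "k_proj", "o_proj" per the diff context
    bias="none",            # assumed default, not stored in the params file
    task_type="CAUSAL_LM",  # assumed, not stored in the params file
)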