Kevin Fink committed
Commit 6662b37 · 1 Parent(s): f0b7505

Files changed (1):
  1. app.py +5 -4

app.py CHANGED
@@ -26,7 +26,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
     model = get_peft_model(model, lora_config)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-    max_length = 128
+    max_length = 91
     try:
         tokenized_train_dataset = load_from_disk(f'{hub_id.strip()}_train_dataset')
         tokenized_test_dataset = load_from_disk(f'{hub_id.strip()}_test_dataset')
@@ -39,7 +39,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         model_inputs = tokenizer(
             examples['text'],
             max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
+            padding='longest', # Pad to the longest sequence in each batch
             truncation=True,
         )
 
@@ -47,7 +47,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         labels = tokenizer(
             examples['target'],
             max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
+            padding='longest', # Pad to the longest sequence in each batch
             truncation=True,
             text_target=examples['target'] # Use text_target for target text
         )
@@ -98,7 +98,8 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         eval_dataset=tokenized_datasets['test'],
         #callbacks=[LoggingCallback()],
     )
-
+    for batch in trainer.get_train_dataloader():
+        print(batch['input_ids'].shape, batch['labels'].shape)
     # Fine-tune the model
     trainer.train()
     trainer.push_to_hub(commit_message="Training complete!")
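
A side note on the padding change: in the transformers tokenizer API, padding=True and padding='longest' are aliases (both pad to the longest sequence in the batch being tokenized), so the substantive changes in this commit are the tighter max_length = 91 and the new batch-shape printout. Below is a minimal sketch of the two padding modes worth distinguishing; the checkpoint name and sample texts are assumptions, since the diff does not show what model_name resolves to:

    # Sketch: how padding='longest' differs from padding='max_length'.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('t5-small')  # hypothetical checkpoint

    texts = ['a short sentence',
             'a noticeably longer example sentence for comparison']

    # padding=True is shorthand for 'longest': pad to the longest sequence
    # in this batch, truncating anything past max_length.
    longest = tokenizer(texts, padding='longest', max_length=91, truncation=True)

    # padding='max_length' instead pads every example out to max_length.
    fixed = tokenizer(texts, padding='max_length', max_length=91, truncation=True)

    print([len(ids) for ids in longest['input_ids']])  # batch-local width, e.g. [9, 9]
    print([len(ids) for ids in fixed['input_ids']])    # always [91, 91]

The added get_train_dataloader() loop gives similar visibility at training time: it prints each batch's input_ids and labels shapes before trainer.train() runs, which is a quick way to catch ragged widths if tokenization batches and DataLoader batches do not line up.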
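
The comments left in place ("Set to None for dynamic padding", "we will handle it later") allude to a strategy this diff does not adopt: tokenize without padding and let a collator pad each batch on the fly. A hedged sketch of that alternative, using the same assumed checkpoint; the function shape is illustrative, not the file's actual code:

    from transformers import AutoTokenizer, DataCollatorForSeq2Seq

    tokenizer = AutoTokenizer.from_pretrained('t5-small')  # hypothetical checkpoint

    def tokenize_function(examples):
        # No padding here: DataCollatorForSeq2Seq pads each batch at load time.
        model_inputs = tokenizer(examples['text'], max_length=91, truncation=True)
        labels = tokenizer(text_target=examples['target'], max_length=91, truncation=True)
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    # Passed to Trainer as data_collator=...; pads input_ids to the longest
    # sequence per batch and pads labels with -100 so the loss ignores them.
    collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=-100)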