Kevin Fink committed
Commit 6662b37 · 1 Parent(s): f0b7505

Files changed (1):
  1. app.py +5 -4

app.py CHANGED
@@ -26,7 +26,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
     model = get_peft_model(model, lora_config)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-    max_length = 128
+    max_length = 91
     try:
         tokenized_train_dataset = load_from_disk(f'{hub_id.strip()}_train_dataset')
         tokenized_test_dataset = load_from_disk(f'{hub_id.strip()}_test_dataset')
@@ -39,7 +39,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         model_inputs = tokenizer(
             examples['text'],
             max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
+            padding='longest', # Pad to the longest sequence in each batch
             truncation=True,
         )
 
@@ -47,7 +47,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         labels = tokenizer(
             examples['target'],
             max_length=max_length, # Set to None for dynamic padding
-            padding=True, # Disable padding here, we will handle it later
+            padding='longest', # Pad to the longest sequence in each batch
             truncation=True,
             text_target=examples['target'] # Use text_target for target text
         )
@@ -98,7 +98,8 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
         eval_dataset=tokenized_datasets['test'],
         #callbacks=[LoggingCallback()],
     )
-
+    for batch in trainer.get_train_dataloader():
+        print(batch['input_ids'].shape, batch['labels'].shape)
     # Fine-tune the model
     trainer.train()
     trainer.push_to_hub(commit_message="Training complete!")
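
A side note on the padding change: in the transformers tokenizer API, padding=True and padding='longest' are aliases (both pad to the longest sequence in the batch being tokenized), so the substantive changes in this commit are the tighter max_length = 91 and the new batch-shape printout. Below is a minimal sketch of the two padding modes worth distinguishing; the checkpoint name and sample texts are assumptions, since the diff does not show what model_name resolves to:

    # Sketch: how padding='longest' differs from padding='max_length'.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('t5-small')  # hypothetical checkpoint

    texts = ['a short sentence',
             'a noticeably longer example sentence for comparison']

    # padding=True is shorthand for 'longest': pad to the longest sequence
    # in this batch, truncating anything past max_length.
    longest = tokenizer(texts, padding='longest', max_length=91, truncation=True)

    # padding='max_length' instead pads every example out to max_length.
    fixed = tokenizer(texts, padding='max_length', max_length=91, truncation=True)

    print([len(ids) for ids in longest['input_ids']])  # batch-local width, e.g. [9, 9]
    print([len(ids) for ids in fixed['input_ids']])    # always [91, 91]

The added get_train_dataloader() loop gives similar visibility at training time: it prints each batch's input_ids and labels shapes before trainer.train() runs, which is a quick way to catch ragged widths if tokenization batches and DataLoader batches do not line up.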
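
The comments left in place ("Set to None for dynamic padding", "we will handle it later") allude to a strategy this diff does not adopt: tokenize without padding and let a collator pad each batch on the fly. A hedged sketch of that alternative, using the same assumed checkpoint; the function shape is illustrative, not the file's actual code:

    from transformers import AutoTokenizer, DataCollatorForSeq2Seq

    tokenizer = AutoTokenizer.from_pretrained('t5-small')  # hypothetical checkpoint

    def tokenize_function(examples):
        # No padding here: DataCollatorForSeq2Seq pads each batch at load time.
        model_inputs = tokenizer(examples['text'], max_length=91, truncation=True)
        labels = tokenizer(text_target=examples['target'], max_length=91, truncation=True)
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    # Passed to Trainer as data_collator=...; pads input_ids to the longest
    # sequence per batch and pads labels with -100 so the loss ignores them.
    collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=-100)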