Soumic committed on
Commit 7bfa8a1 · 1 Parent(s): ef3f3d9

:zap: Code is stable

Files changed (1)
  1. app.py +35 -18
app.py CHANGED
@@ -77,7 +77,7 @@ class PagingMQTLDataset(IterableDataset):
     input_ids = self.bert_tokenizer(sequence)["input_ids"]
     tokenized_tensor = torch.tensor(input_ids)
     label_tensor = torch.tensor(label)
-    output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor}  # so this is how you do it?
+    output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor}  # so this is how you do it?
     return output_dict  # tokenized_tensor, label_tensor
 
 
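Returning a dict keyed "input_ids"/"labels" is the item format that the Trainer's default data collator batches directly, so no custom collate_fn is needed. A minimal sketch of an iterable dataset in that shape (class and field names are hypothetical stand-ins, not the actual PagingMQTLDataset from app.py):

import torch
from torch.utils.data import IterableDataset

class MinimalSequenceDataset(IterableDataset):  # hypothetical stand-in for PagingMQTLDataset
  def __init__(self, rows, tokenizer):
    self.rows = rows            # iterable of (sequence, label) pairs
    self.tokenizer = tokenizer  # e.g. the HyenaDNA character-level tokenizer

  def __iter__(self):
    for sequence, label in self.rows:
      input_ids = self.tokenizer(sequence)["input_ids"]
      # keys must be "input_ids" and "labels" for the default collator
      yield {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(label),
      }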
@@ -137,7 +137,7 @@ def create_paging_train_val_test_datasets(tokenizer, WINDOW, is_debug, batch_siz
   }
 
   dataset_map = None
-  is_my_laptop = os.path.isfile("/src/inputdata/dataset_4000_test_binned.csv")
+  is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
   if is_my_laptop:
     dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
   else:
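With streaming=True, load_dataset returns lazily-paged splits (an IterableDatasetDict) instead of materializing the CSVs in memory. A hedged sketch of the surrounding wiring; the data_files dict closed by the `}` at the top of this hunk presumably maps split names to CSV paths, and only the train file path is actually visible in this diff, so the other file names are assumptions:

from datasets import load_dataset

data_files = {
  "train": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv",
  "validate": "dataset_4000_validate_binned.csv",  # assumed name
  "test": "dataset_4000_test_binned.csv",          # assumed name
}
dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
train_split = dataset_map["train"]  # an IterableDataset; rows are read on demand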
@@ -204,16 +204,17 @@ def start():
   os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
   login_inside_huggingface_virtualmachine()
-  WINDOW = 1000
+  WINDOW = 4000
   batch_size = 100
   tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
-  model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, torch_dtype=torch.bfloat16,
-                                                             device_map="auto",
-                                                             trust_remote_code=True)
+  classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
+                                                                        torch_dtype=torch.bfloat16,
+                                                                        device_map="auto",
+                                                                        trust_remote_code=True)
   args = {
     "output_dir": "output_hyena_dna-mqtl_classification",
-    "num_train_epochs": 2,
-    "max_steps": 20,
+    "num_train_epochs": 1,
+    "max_steps": 10,
     # number of steps to train; originally 1000, which takes too much time, so I set it to 10 to run faster and check my code/pipeline
     "run_name": "laptop_run_hyena_dna-mqtl_classification",  # Override run_name here
     "per_device_train_batch_size": 1,
@@ -233,6 +234,7 @@ def start():
   # train_dataset, eval_dataset, test_dataset = create_data_module(tokenizer=tokenizer, WINDOW=WINDOW,
   #                                                                batch_size=batch_size,
   #                                                                is_debug=False)
+  """  # example code
   max_length = 32_000
   sequence = 'ACTG' * int(max_length / 4)
   # sequence = 'ACTG' * int(1000)  # seq_len = 4000 it works!
@@ -243,13 +245,14 @@ def start():
   # Create a dataset for training
   run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
   run_the_code_ds.set_format("pt")
+  """
 
-  # train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
-  train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
+  train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
+  # train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
   # train_ds.set_format("pt")  # doesn't work!
 
   trainer = Trainer(
-    model=model,
+    model=classifier_model,
     args=training_args,
     train_dataset=train_ds,
     eval_dataset=val_ds,
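The tokenized and labels variables used by the now-disabled block are defined in lines elided from this diff (roughly old lines 239-242). A sketch of what that synthetic smoke test plausibly looked like; the exact contents are assumptions, and tokenizer is the AutoTokenizer created earlier in start():

from datasets import Dataset

max_length = 32_000
sequence = 'ACTG' * int(max_length / 4)  # one synthetic 32k-base sequence
# assumed: tokenize the sequence a few times and attach dummy labels
tokenized = [tokenizer(sequence)["input_ids"] for _ in range(4)]
labels = [0, 1, 0, 1]

run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
run_the_code_ds.set_format("pt")  # hand back PyTorch tensors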
@@ -268,15 +271,29 @@ def start():
     print(f"{test_results = }")
   except Exception as oome:
     print(f"{oome = }")
-
+  finally:
+    # save the model
+    model_name = "HyenaDnaMQtlClassifier"
+    is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
+    model_repository_name = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
+
+    model_subdirectory = f"my-awesome-model-{WINDOW}"
+    classifier_model.save_pretrained(save_directory=model_subdirectory, safe_serialization=False)
+
+    # push to the hub
+    commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
+    if is_my_laptop:
+      commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
+
+    classifier_model.push_to_hub(
+      repo_id=model_repository_name,
+      # subfolder=f"my-awesome-model-{WINDOW}",  subfolder didn't work :/
+      commit_message=commit_message,  # f":tada: Push model for window size {WINDOW}"
+      safe_serialization=False
+    )
+    pass
 
 
 if __name__ == '__main__':
   start()
   pass
-
-"""
-git submodule add https://huggingface.co/spaces/fahimfarhan/hyenadna-sm-32k-mqtl-classifier-space src/huggingface-mqtl-classification-hyena-dna
-
-"""
-
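Once the finally block has pushed the weights, the model can be reloaded from the Hub with the matching repo id. A small usage sketch; the repo id is the f-string above with WINDOW = 4000 substituted:

from transformers import AutoModelForSequenceClassification

reloaded = AutoModelForSequenceClassification.from_pretrained(
  "fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-4000",
  trust_remote_code=True,  # HyenaDNA ships custom modeling code
)

Note that safe_serialization=False writes classic pytorch_model.bin weights instead of safetensors; from_pretrained handles both transparently.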
 