Soumic committed on
Commit 7bfa8a1 · 1 Parent(s): ef3f3d9

:zap: Code is stable

Files changed (1)
  1. app.py +35 -18
app.py CHANGED
@@ -77,7 +77,7 @@ class PagingMQTLDataset(IterableDataset):
     input_ids = self.bert_tokenizer(sequence)["input_ids"]
     tokenized_tensor = torch.tensor(input_ids)
     label_tensor = torch.tensor(label)
-    output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor}  # so this is how you do it?
+    output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor}  # so this is how you do it?
     return output_dict  # tokenized_tensor, label_tensor
 
 
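Returning a dict keyed "input_ids"/"labels" is the item format that the Trainer's default data collator batches directly, so no custom collate_fn is needed. A minimal sketch of an iterable dataset in that shape (class and field names are hypothetical stand-ins, not the actual PagingMQTLDataset from app.py):

import torch
from torch.utils.data import IterableDataset

class MinimalSequenceDataset(IterableDataset):  # hypothetical stand-in for PagingMQTLDataset
  def __init__(self, rows, tokenizer):
    self.rows = rows            # iterable of (sequence, label) pairs
    self.tokenizer = tokenizer  # e.g. the HyenaDNA character-level tokenizer

  def __iter__(self):
    for sequence, label in self.rows:
      input_ids = self.tokenizer(sequence)["input_ids"]
      # keys must be "input_ids" and "labels" for the default collator
      yield {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(label),
      }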
@@ -137,7 +137,7 @@ def create_paging_train_val_test_datasets(tokenizer, WINDOW, is_debug, batch_siz
   }
 
   dataset_map = None
-  is_my_laptop = os.path.isfile("/src/inputdata/dataset_4000_test_binned.csv")
+  is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
   if is_my_laptop:
     dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
   else:
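With streaming=True, load_dataset returns lazily-paged splits (an IterableDatasetDict) instead of materializing the CSVs in memory. A hedged sketch of the surrounding wiring; the data_files dict closed by the `}` at the top of this hunk presumably maps split names to CSV paths, and only the train file path is actually visible in this diff, so the other file names are assumptions:

from datasets import load_dataset

data_files = {
  "train": "/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv",
  "validate": "dataset_4000_validate_binned.csv",  # assumed name
  "test": "dataset_4000_test_binned.csv",          # assumed name
}
dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
train_split = dataset_map["train"]  # an IterableDataset; rows are read on demand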
@@ -204,16 +204,17 @@ def start():
   os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
   login_inside_huggingface_virtualmachine()
-  WINDOW = 1000
+  WINDOW = 4000
   batch_size = 100
   tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
-  model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, torch_dtype=torch.bfloat16,
-                                                             device_map="auto",
-                                                             trust_remote_code=True)
+  classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
+                                                                        torch_dtype=torch.bfloat16,
+                                                                        device_map="auto",
+                                                                        trust_remote_code=True)
   args = {
     "output_dir": "output_hyena_dna-mqtl_classification",
-    "num_train_epochs": 2,
-    "max_steps": 20,
+    "num_train_epochs": 1,
+    "max_steps": 10,
     # number of steps to train; originally 1000, which takes too much time, so I set it to 10 to run faster and check my code/pipeline
     "run_name": "laptop_run_hyena_dna-mqtl_classification",  # Override run_name here
     "per_device_train_batch_size": 1,
@@ -233,6 +234,7 @@ def start():
   # train_dataset, eval_dataset, test_dataset = create_data_module(tokenizer=tokenizer, WINDOW=WINDOW,
   #                                                                batch_size=batch_size,
   #                                                                is_debug=False)
+  """  # example code
   max_length = 32_000
   sequence = 'ACTG' * int(max_length / 4)
   # sequence = 'ACTG' * int(1000)  # seq_len = 4000 it works!
@@ -243,13 +245,14 @@ def start():
   # Create a dataset for training
   run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
   run_the_code_ds.set_format("pt")
+  """
 
-  # train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
-  train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
+  train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
+  # train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
   # train_ds.set_format("pt")  # doesn't work!
 
   trainer = Trainer(
-    model=model,
+    model=classifier_model,
     args=training_args,
     train_dataset=train_ds,
     eval_dataset=val_ds,
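The tokenized and labels variables used by the now-disabled block are defined in lines elided from this diff (roughly old lines 239-242). A sketch of what that synthetic smoke test plausibly looked like; the exact contents are assumptions, and tokenizer is the AutoTokenizer created earlier in start():

from datasets import Dataset

max_length = 32_000
sequence = 'ACTG' * int(max_length / 4)  # one synthetic 32k-base sequence
# assumed: tokenize the sequence a few times and attach dummy labels
tokenized = [tokenizer(sequence)["input_ids"] for _ in range(4)]
labels = [0, 1, 0, 1]

run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
run_the_code_ds.set_format("pt")  # hand back PyTorch tensors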
@@ -268,15 +271,29 @@ def start():
     print(f"{test_results = }")
   except Exception as oome:
     print(f"{oome = }")
-
+  finally:
+    # save the model
+    model_name = "HyenaDnaMQtlClassifier"
+    is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
+    model_repository_name = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
+
+    model_subdirectory = f"my-awesome-model-{WINDOW}"
+    classifier_model.save_pretrained(save_directory=model_subdirectory, safe_serialization=False)
+
+    # push to the hub
+    commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
+    if is_my_laptop:
+      commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
+
+    classifier_model.push_to_hub(
+      repo_id=model_repository_name,
+      # subfolder=f"my-awesome-model-{WINDOW}",  subfolder didn't work :/
+      commit_message=commit_message,  # f":tada: Push model for window size {WINDOW}"
+      safe_serialization=False
+    )
+    pass
 
 
 if __name__ == '__main__':
   start()
   pass
-
-"""
-git submodule add https://huggingface.co/spaces/fahimfarhan/hyenadna-sm-32k-mqtl-classifier-space src/huggingface-mqtl-classification-hyena-dna
-
-"""
-
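Once the finally block has pushed the weights, the model can be reloaded from the Hub with the matching repo id. A small usage sketch; the repo id is the f-string above with WINDOW = 4000 substituted:

from transformers import AutoModelForSequenceClassification

reloaded = AutoModelForSequenceClassification.from_pretrained(
  "fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-4000",
  trust_remote_code=True,  # HyenaDNA ships custom modeling code
)

Note that safe_serialization=False writes classic pytorch_model.bin weights instead of safetensors; from_pretrained handles both transparently.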
 