Spaces:
Runtime error
Runtime error
Soumic
committed on
Commit
·
7bfa8a1
1
Parent(s):
ef3f3d9
:zap: Code is stable
Browse files
app.py
CHANGED
@@ -77,7 +77,7 @@ class PagingMQTLDataset(IterableDataset):
|
|
77 |
input_ids = self.bert_tokenizer(sequence)["input_ids"]
|
78 |
tokenized_tensor = torch.tensor(input_ids)
|
79 |
label_tensor = torch.tensor(label)
|
80 |
-
output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor}
|
81 |
return output_dict # tokenized_tensor, label_tensor
|
82 |
|
83 |
|
@@ -137,7 +137,7 @@ def create_paging_train_val_test_datasets(tokenizer, WINDOW, is_debug, batch_siz
|
|
137 |
}
|
138 |
|
139 |
dataset_map = None
|
140 |
-
is_my_laptop = os.path.isfile("/src/inputdata/
|
141 |
if is_my_laptop:
|
142 |
dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
|
143 |
else:
|
@@ -204,16 +204,17 @@ def start():
|
|
204 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
205 |
|
206 |
login_inside_huggingface_virtualmachine()
|
207 |
-
WINDOW =
|
208 |
batch_size = 100
|
209 |
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
|
210 |
-
|
211 |
-
|
212 |
-
|
|
|
213 |
args = {
|
214 |
"output_dir": "output_hyena_dna-mqtl_classification",
|
215 |
-
"num_train_epochs":
|
216 |
-
"max_steps":
|
217 |
# Set the number of steps you expect to train, originally 1000, takes too much time. So I set it to 10 to run faster and check my code/pipeline
|
218 |
"run_name": "laptop_run_hyena_dna-mqtl_classification", # Override run_name here
|
219 |
"per_device_train_batch_size": 1,
|
@@ -233,6 +234,7 @@ def start():
|
|
233 |
# train_dataset, eval_dataset, test_dataset = create_data_module(tokenizer=tokenizer, WINDOW=WINDOW,
|
234 |
# batch_size=batch_size,
|
235 |
# is_debug=False)
|
|
|
236 |
max_length = 32_000
|
237 |
sequence = 'ACTG' * int(max_length / 4)
|
238 |
# sequence = 'ACTG' * int(1000) # seq_len = 4000 it works!
|
@@ -243,13 +245,14 @@ def start():
|
|
243 |
# Create a dataset for training
|
244 |
run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
|
245 |
run_the_code_ds.set_format("pt")
|
|
|
246 |
|
247 |
-
|
248 |
-
train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
|
249 |
# train_ds.set_format("pt") # doesn't work!
|
250 |
|
251 |
trainer = Trainer(
|
252 |
-
model=
|
253 |
args=training_args,
|
254 |
train_dataset=train_ds,
|
255 |
eval_dataset=val_ds,
|
@@ -268,15 +271,29 @@ def start():
|
|
268 |
print(f"{test_results = }")
|
269 |
except Exception as oome:
|
270 |
print(f"{oome = }")
|
271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
|
273 |
|
274 |
if __name__ == '__main__':
|
275 |
start()
|
276 |
pass
|
277 |
-
|
278 |
-
"""
|
279 |
-
git submodule add https://huggingface.co/spaces/fahimfarhan/hyenadna-sm-32k-mqtl-classifier-space src/huggingface-mqtl-classification-hyena-dna
|
280 |
-
|
281 |
-
"""
|
282 |
-
|
|
|
77 |
input_ids = self.bert_tokenizer(sequence)["input_ids"]
|
78 |
tokenized_tensor = torch.tensor(input_ids)
|
79 |
label_tensor = torch.tensor(label)
|
80 |
+
output_dict = {"input_ids": tokenized_tensor, "labels": label_tensor} # so this is now you do it?
|
81 |
return output_dict # tokenized_tensor, label_tensor
|
82 |
|
83 |
|
|
|
137 |
}
|
138 |
|
139 |
dataset_map = None
|
140 |
+
is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
|
141 |
if is_my_laptop:
|
142 |
dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
|
143 |
else:
|
|
|
204 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
205 |
|
206 |
login_inside_huggingface_virtualmachine()
|
207 |
+
WINDOW = 4000
|
208 |
batch_size = 100
|
209 |
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
|
210 |
+
classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
|
211 |
+
torch_dtype=torch.bfloat16,
|
212 |
+
device_map="auto",
|
213 |
+
trust_remote_code=True)
|
214 |
args = {
|
215 |
"output_dir": "output_hyena_dna-mqtl_classification",
|
216 |
+
"num_train_epochs": 1,
|
217 |
+
"max_steps": 10,
|
218 |
# Set the number of steps you expect to train, originally 1000, takes too much time. So I set it to 10 to run faster and check my code/pipeline
|
219 |
"run_name": "laptop_run_hyena_dna-mqtl_classification", # Override run_name here
|
220 |
"per_device_train_batch_size": 1,
|
|
|
234 |
# train_dataset, eval_dataset, test_dataset = create_data_module(tokenizer=tokenizer, WINDOW=WINDOW,
|
235 |
# batch_size=batch_size,
|
236 |
# is_debug=False)
|
237 |
+
""" # example code
|
238 |
max_length = 32_000
|
239 |
sequence = 'ACTG' * int(max_length / 4)
|
240 |
# sequence = 'ACTG' * int(1000) # seq_len = 4000 it works!
|
|
|
245 |
# Create a dataset for training
|
246 |
run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})
|
247 |
run_the_code_ds.set_format("pt")
|
248 |
+
"""
|
249 |
|
250 |
+
train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
|
251 |
+
# train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
|
252 |
# train_ds.set_format("pt") # doesn't work!
|
253 |
|
254 |
trainer = Trainer(
|
255 |
+
model=classifier_model,
|
256 |
args=training_args,
|
257 |
train_dataset=train_ds,
|
258 |
eval_dataset=val_ds,
|
|
|
271 |
print(f"{test_results = }")
|
272 |
except Exception as oome:
|
273 |
print(f"{oome = }")
|
274 |
+
finally:
|
275 |
+
# save the model
|
276 |
+
model_name = "HyenaDnaMQtlClassifier"
|
277 |
+
is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")
|
278 |
+
model_repository_name = f"fahimfarhan/hyenadna-sm-32k-mqtl-classifier-seq-len-{WINDOW}"
|
279 |
+
|
280 |
+
model_subdirectory = f"my-awesome-model-{WINDOW}"
|
281 |
+
classifier_model.save_pretrained(save_directory=model_subdirectory, safe_serialization=False)
|
282 |
+
|
283 |
+
# push to the hub
|
284 |
+
commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
|
285 |
+
if is_my_laptop:
|
286 |
+
commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"
|
287 |
+
|
288 |
+
classifier_model.push_to_hub(
|
289 |
+
repo_id=model_repository_name,
|
290 |
+
# subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
|
291 |
+
commit_message=commit_message, # f":tada: Push model for window size {WINDOW}"
|
292 |
+
safe_serialization=False
|
293 |
+
)
|
294 |
+
pass
|
295 |
|
296 |
|
297 |
if __name__ == '__main__':
|
298 |
start()
|
299 |
pass
|
|
|
|
|
|
|
|
|
|
|
|