sync from github
src/backend/hflm_with_measurement.py CHANGED
@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from transformers import TextStreamer
-
+from transformers.models.dbrx.modeling_dbrx import DbrxExpertGLU
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
@@ -333,21 +333,26 @@ class HFLMWithMeasurement(HFLM):
         linear_count = 0
         element_wise_mul = 0
         for name, module in self.model.named_modules():
-            if ('layers.0.' in name or
-                if 'experts.0.' in name:
+            if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
+                if 'experts.0.' in name or "ffn.experts" in name:
+                    if "linear_v" in name:
+                        element_wise_mul = 1
                     if isinstance(module, torch.nn.Linear):
                         # print(name, module)
                         linear_count += 1
-                elif 'experts' not in name:
-                    if ("gate" not in name and "router" not in name) or "gate_proj" in name:
-                        if "gate_proj" in name:
-                            element_wise_mul = 1
-                        if isinstance(module, torch.nn.Linear):
-                            # print(name, module)
-                            linear_count += 1
+                    elif isinstance(module, DbrxExpertGLU):
+                        linear_count = 3
+                # elif 'experts' not in name:
+                #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
+                #         if "gate_proj" in name:
+                #             element_wise_mul = 1
+                #         if isinstance(module, torch.nn.Linear):
+                #             # print(name, module)
+                #             linear_count += 1
             else:
                 continue
         print(f"linear_count: {linear_count}")
+        print(f"element_wise_mul: {element_wise_mul}")
 
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
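The scan above walks `named_modules()` and, within the first transformer block only (attention modules excluded), counts the `nn.Linear` projections of a single expert; a `linear_v` in the module name marks a GLU-style gated FFN, whose extra element-wise multiply is tracked in `element_wise_mul`. DBRX needs both the new name patterns (`transformer.blocks.0`, `ffn.experts`) and a type special-case: `DbrxExpertGLU` holds its three expert projections as fused parameters rather than child `nn.Linear` modules, so the scan would otherwise count zero, hence the hard-coded `linear_count = 3`. A minimal, self-contained sketch of the same counting logic on a toy module tree (the module names below are illustrative, not taken from any real checkpoint):

```python
from torch import nn

# Toy stand-in for one GLU-gated MoE expert; "linear_v" mirrors the name the
# patched scan looks for, the other attribute names are arbitrary.
class ToyExpert(nn.Module):
    def __init__(self, d_model=16, d_ff=32):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)        # gate projection
        self.linear_v = nn.Linear(d_model, d_ff)  # value projection (GLU marker)
        self.w2 = nn.Linear(d_ff, d_model)        # down projection

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.experts = nn.ModuleList(ToyExpert() for _ in range(4))

model = nn.Module()
model.layers = nn.ModuleList([ToyBlock()])

linear_count = 0
element_wise_mul = 0
for name, module in model.named_modules():
    # Same filter as the patch: first block only, FFN side only.
    if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
        if 'experts.0.' in name or "ffn.experts" in name:
            if "linear_v" in name:
                element_wise_mul = 1  # GLU: act(w1(x)) * linear_v(x)
            if isinstance(module, nn.Linear):
                linear_count += 1

print(linear_count, element_wise_mul)  # -> 3 1
```

Inspecting only the first block and the first expert presumably relies on all decoder blocks and all experts sharing the same shape, so one expert's projection count generalizes to the whole stack.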
@@ -373,13 +378,17 @@ class HFLMWithMeasurement(HFLM):
         model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
         model_size_param = get_model_size(model_info=model_info, precision=self.precision)
 
-        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.n_layers
+        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
+            (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
+
         d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
 
         if hasattr(model_config, "num_experts_per_tok"):
             n_experts_per_tok = model_config.num_experts_per_tok
         elif hasattr(model_config, "num_selected_experts"):
             n_experts_per_tok = model_config.num_selected_experts
+        elif hasattr(model_config, "ffn_config"):
+            n_experts_per_tok = model_config.ffn_config.moe_top_k
         else:
             n_experts_per_tok = 1
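These `hasattr` chains exist because Hugging Face configs name the same hyperparameters differently: Llama-style configs expose `num_hidden_layers` and `hidden_size`, others use `num_layers`/`n_layers` or `d_model`, and DBRX nests its MoE settings under `ffn_config` (hence `ffn_config.moe_top_k` for the number of active experts). A small sketch of the same probing pattern against mock configs; the `first_attr` helper and the concrete values are illustrative, not part of the patch:

```python
from types import SimpleNamespace

def first_attr(config, names):
    # Return the first attribute present on config, mirroring the
    # hasattr fallback chains in the patch; None if nothing matches.
    for n in names:
        if hasattr(config, n):
            return getattr(config, n)
    return None

# Mock configs; the field names follow the patch, the values are made up.
mixtral_like = SimpleNamespace(num_hidden_layers=32, hidden_size=4096,
                               num_experts_per_tok=2)
dbrx_like = SimpleNamespace(n_layers=40, d_model=6144,
                            ffn_config=SimpleNamespace(moe_top_k=4))

for cfg in (mixtral_like, dbrx_like):
    n_layers = first_attr(cfg, ["num_hidden_layers", "num_layers", "n_layers"])
    d_model = first_attr(cfg, ["hidden_size", "d_model"])
    top_k = first_attr(cfg, ["num_experts_per_tok", "num_selected_experts"])
    if top_k is None and hasattr(cfg, "ffn_config"):
        top_k = cfg.ffn_config.moe_top_k
    if top_k is None:
        top_k = 1  # dense model: every token uses the single FFN
    print(n_layers, d_model, top_k)  # -> 32 4096 2, then 40 6144 4
```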
@@ -389,16 +398,19 @@ class HFLMWithMeasurement(HFLM):
             d_ff = model_config.intermediate_size
         elif hasattr(model_config, "d_ff"):
             d_ff = model_config.d_ff
+        elif hasattr(model_config, "ff_ratio"):
+            d_ff = d_model * model_config.ff_ratio
+        elif hasattr(model_config, "ffn_config"):
+            d_ff = model_config.ffn_config.ffn_hidden_size
         else:
-            if hasattr(model_config, "ff_ratio"):
-                d_ff = d_model * model_config.ff_ratio
-            else:
-                raise ValueError("Unknown FFN dimension")
+            raise ValueError("Unknown FFN dimension")
 
         if hasattr(model_config, "num_local_experts"):
             num_experts = model_config.num_local_experts
         elif hasattr(model_config, "num_experts"):
             num_experts = model_config.num_experts
+        elif hasattr(model_config, "ffn_config"):
+            num_experts = model_config.ffn_config.moe_num_experts
         else:
             num_experts = 1
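The FFN width and expert count get the same treatment: the old nested `if`/`else` around `ff_ratio` is flattened into the `elif` chain, and DBRX's `ffn_config.ffn_hidden_size` and `ffn_config.moe_num_experts` are tried before giving up with the `ValueError`. A runnable sketch of the resulting precedence, again with illustrative mock configs:

```python
from types import SimpleNamespace

def resolve_ffn_dims(cfg, d_model):
    # Same precedence as the patched elif chains.
    if hasattr(cfg, "intermediate_size"):
        d_ff = cfg.intermediate_size
    elif hasattr(cfg, "d_ff"):
        d_ff = cfg.d_ff
    elif hasattr(cfg, "ff_ratio"):
        d_ff = d_model * cfg.ff_ratio
    elif hasattr(cfg, "ffn_config"):
        d_ff = cfg.ffn_config.ffn_hidden_size
    else:
        raise ValueError("Unknown FFN dimension")

    if hasattr(cfg, "num_local_experts"):
        num_experts = cfg.num_local_experts
    elif hasattr(cfg, "num_experts"):
        num_experts = cfg.num_experts
    elif hasattr(cfg, "ffn_config"):
        num_experts = cfg.ffn_config.moe_num_experts
    else:
        num_experts = 1
    return d_ff, num_experts

mixtral_like = SimpleNamespace(intermediate_size=14336, num_local_experts=8)
dbrx_like = SimpleNamespace(ffn_config=SimpleNamespace(ffn_hidden_size=10752,
                                                       moe_num_experts=16))
print(resolve_ffn_dims(mixtral_like, d_model=4096))  # -> (14336, 8)
print(resolve_ffn_dims(dbrx_like, d_model=6144))     # -> (10752, 16)
```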