|
_gm.lm_head: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8891be16ed2ea14cfc59a26a6838ba29.pth |
|
weight_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.drop: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.0.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.0.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/afde7bd75645a9d22cdc4b60f1fbf318.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.0.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/1761c7df73888eebc91ead3ef8ed46d5.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.0.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.0.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.0.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.0.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.0.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.0.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.0.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.0.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/0872608e267020d4128c37718a258ee7.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.0.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b300412083508b593761bc342114bb84.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.0.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.0.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.0.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.1.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ad536868f2092bb1f026d60c6b9c3d80.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.1.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ed14512d4bb368ad7d16778dfca78e4b.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.1.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.1.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.1.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.1.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fbc26ac6c834c9f4f2cf0e0fe8d1505c.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.1.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/13283ea3665932eb2112053911f3fad3.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.1.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.1.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.1.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.2.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/7ed7736f36ecaa3b7551b07f97317824.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.2.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8046f060fa4d9278015268362a06dd01.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.2.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.2.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.2.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.2.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f096ef7ae5e56a0ec533b9679c30b39.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.2.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fa911cfcb1982e0577c1d3a7f36a0454.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.2.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.2.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.2.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.3.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/c951bf4ab9b12f759db16896ab547a3c.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.3.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/f4a90368b689674302c9c1057e8de3d4.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.3.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.3.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.3.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.3.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6ffcfb2adf95138b85c3920b31ca619a.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.3.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/231c6b2eb59e46a3ee76455a6eedd948.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.3.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.3.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.3.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.4.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/defe3e44f3377caa14f35bdf611a2632.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.4.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/3f31ba127de772f7f29c8cca0723ffcc.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.4.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.4.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.4.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.4.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/33e67804659d50361fc310a868c21097.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.4.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/64b47243fbe8f9dbcd7ecef1d9d10af7.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.4.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.4.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.4.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.attn.attn_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.5.attn.c_attn: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/4094cdca91aeccbd21edd90fe5ab2b1b.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.5.attn.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b91a79c86f968d0757761896ae394fa0.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.5.attn.matmul: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.attn.matmul_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'ActActMatMul' |
|
multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.attn.resid_dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.5.attn.softmax: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'Softmax' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.ln_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.5.ln_2: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.h.5.mlp.act: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'GELU' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.mlp.c_fc: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/e279d6c6717b5e276072f366d1ec826e.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.5.mlp.c_proj: |
|
accum_format: !Format 'SAME' |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'BFP[24|8]{1,-1}(SN)' |
|
input_format: !Format 'BFP[8|8]{128,-1}(SN)' |
|
instance: !DmxModule 'Linear' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
smoothquant_scale_format: !Format 'SAME' |
|
state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f08db70eeeb788df7ebe19929a26982.pth |
|
weight_format: !Format 'BFP[4|8]{128,-1}(SN)' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.h.5.mlp.dropout: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Dropout' |
|
output_format: !Format 'SAME' |
|
_gm.transformer.h.5.resadd: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.h.5.resadd_1: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'ResAdd' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
residual_format: !Format 'FP[1|5|10,15](FN)' |
|
_gm.transformer.ln_f: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
bias_format: !Format 'SAME' |
|
input_format: !Format 'FP[1|5|10,15](FN)' |
|
instance: !DmxModule 'LayerNorm' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
_gm.transformer.wpe: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Embedding' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
_gm.transformer.wte: |
|
approximation_function: !ApproximationFunction 'NONE' |
|
input_format: !Format 'SAME' |
|
instance: !DmxModule 'Embedding' |
|
output_format: !Format 'FP[1|5|10,15](FN)' |
|
weight_format: !Format 'SAME' |
|
weight_sparseness: !Sparseness 'DENSE' |
|
|