# distilgpt2 / configs / myconfig.yaml
# bmah-dmx's picture
# Added DistilGPT2 model
# 710ed6f
# Linear module with no bias_format entry: inputs and weights both use
# 'BFP[8|8]{128,-1}(SN)', output is 'FP[1|5|10,15](FN)'.
_gm.lm_head:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8891be16ed2ea14cfc59a26a6838ba29.pth
  weight_format: !Format 'BFP[8|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.drop:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.0.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.0.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/afde7bd75645a9d22cdc4b60f1fbf318.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.0.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/1761c7df73888eebc91ead3ef8ed46d5.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.0.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.0.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.0.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.0.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.0.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.0.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.0.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.0.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/0872608e267020d4128c37718a258ee7.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.0.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b300412083508b593761bc342114bb84.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.0.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.0.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.0.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.1.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.1.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ad536868f2092bb1f026d60c6b9c3d80.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.1.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ed14512d4bb368ad7d16778dfca78e4b.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.1.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.1.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.1.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.1.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.1.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.1.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.1.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.1.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fbc26ac6c834c9f4f2cf0e0fe8d1505c.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.1.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/13283ea3665932eb2112053911f3fad3.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.1.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.1.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.1.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.2.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.2.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/7ed7736f36ecaa3b7551b07f97317824.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.2.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8046f060fa4d9278015268362a06dd01.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.2.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.2.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.2.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.2.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.2.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.2.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.2.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.2.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f096ef7ae5e56a0ec533b9679c30b39.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.2.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fa911cfcb1982e0577c1d3a7f36a0454.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.2.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.2.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.2.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.3.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.3.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/c951bf4ab9b12f759db16896ab547a3c.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.3.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/f4a90368b689674302c9c1057e8de3d4.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.3.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.3.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.3.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.3.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.3.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.3.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.3.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.3.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6ffcfb2adf95138b85c3920b31ca619a.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.3.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/231c6b2eb59e46a3ee76455a6eedd948.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.3.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.3.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.3.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.4.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.4.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/defe3e44f3377caa14f35bdf611a2632.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.4.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/3f31ba127de772f7f29c8cca0723ffcc.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.4.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.4.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.4.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.4.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.4.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.4.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.4.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.4.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/33e67804659d50361fc310a868c21097.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.4.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/64b47243fbe8f9dbcd7ecef1d9d10af7.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.4.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.4.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.4.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.5.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.5.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/4094cdca91aeccbd21edd90fe5ab2b1b.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.5.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b91a79c86f968d0757761896ae394fa0.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.5.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# ActActMatMul module: input and multiplier formats differ only in the last
# brace field (-1 vs -2) — presumably the block axis; TODO confirm.
_gm.transformer.h.5.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.5.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# Softmax module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.5.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.5.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.h.5.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# GELU module: 'FP[1|5|10,15](FN)' on both input and output.
_gm.transformer.h.5.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.5.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/e279d6c6717b5e276072f366d1ec826e.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Linear module: 'BFP[8|8]{128,-1}(SN)' inputs, 'BFP[4|8]{128,-1}(SN)' weights,
# 'BFP[24|8]{1,-1}(SN)' bias, 'FP[1|5|10,15](FN)' output.
_gm.transformer.h.5.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  # NOTE(review): '/blob/' URL; confirm the loader fetches the raw file.
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f08db70eeeb788df7ebe19929a26982.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'
# Dropout module: input/output formats 'SAME', approximation function 'NONE'.
_gm.transformer.h.5.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.5.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# ResAdd module: input, residual and output all use 'FP[1|5|10,15](FN)'.
_gm.transformer.h.5.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'
# LayerNorm module: 'FP[1|5|10,15](FN)' input and output; weight and bias
# formats are 'SAME'.
_gm.transformer.ln_f:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
# Embedding module: 'SAME' input and weight formats, 'FP[1|5|10,15](FN)' output,
# 'DENSE' weight sparseness.
_gm.transformer.wpe:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Embedding'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
  weight_sparseness: !Sparseness 'DENSE'
# Embedding module: 'SAME' input and weight formats, 'FP[1|5|10,15](FN)' output,
# 'DENSE' weight sparseness.
_gm.transformer.wte:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Embedding'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
  weight_sparseness: !Sparseness 'DENSE'