# Per-module configuration, keyed by dotted module path.
#
# Every entry names the module kind in `instance` (a `!DmxModule` value) and
# gives the numeric formats applied at that module's ports (`input_format`,
# `output_format`, and, where present, `weight_format`, `bias_format`,
# `accum_format`, `multiplier_format`, `residual_format`).
#
# The tags (`!Format`, `!DmxModule`, `!ApproximationFunction`, `!Sparseness`)
# are application-defined: the file must be read by a loader that registers
# constructors for them; a generic safe loader will reject them.
#
# NOTE(review): the format strings are opaque to YAML. Presumably
# 'BFP[m|e]{block,axis}(..)' is a block-floating-point layout and
# 'FP[1|5|10,15](FN)' a 16-bit float layout, with 'SAME' meaning "leave
# unchanged" -- confirm against the consuming library before editing them.
#
# Patterns visible in this file:
#   * `Linear` entries inside `transformer.h.*` use weight_format
#     'BFP[4|8]{128,-1}(SN)' and bias_format 'BFP[24|8]{1,-1}(SN)'.
#   * `_gm.lm_head` is the only `Linear` with weight_format
#     'BFP[8|8]{128,-1}(SN)' and it has no `bias_format` key.
#   * `ActActMatMul` entries use axis -1 for `input_format` and axis -2 for
#     `multiplier_format`.
#   * `Dropout` entries use 'SAME' for both input and output.
#   * Each `Linear` entry carries its own `state_dict_url` pointing at a
#     separate `.pth` checkpoint file.

# ---- Output head ----------------------------------------------------------
_gm.lm_head:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8891be16ed2ea14cfc59a26a6838ba29.pth
  weight_format: !Format 'BFP[8|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

# ---- Dropout directly under `transformer` ---------------------------------
_gm.transformer.drop:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

# ---- transformer.h.0 ------------------------------------------------------
_gm.transformer.h.0.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.0.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/afde7bd75645a9d22cdc4b60f1fbf318.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.0.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/1761c7df73888eebc91ead3ef8ed46d5.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.0.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.0.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.0.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.0.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.0.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.0.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.0.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.0.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/0872608e267020d4128c37718a258ee7.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.0.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b300412083508b593761bc342114bb84.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.0.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.0.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.0.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- transformer.h.1 ------------------------------------------------------
_gm.transformer.h.1.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.1.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ad536868f2092bb1f026d60c6b9c3d80.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.1.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/ed14512d4bb368ad7d16778dfca78e4b.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.1.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.1.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.1.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.1.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.1.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.1.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.1.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.1.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fbc26ac6c834c9f4f2cf0e0fe8d1505c.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.1.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/13283ea3665932eb2112053911f3fad3.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.1.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.1.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.1.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- transformer.h.2 ------------------------------------------------------
_gm.transformer.h.2.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.2.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/7ed7736f36ecaa3b7551b07f97317824.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.2.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/8046f060fa4d9278015268362a06dd01.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.2.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.2.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.2.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.2.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.2.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.2.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.2.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.2.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f096ef7ae5e56a0ec533b9679c30b39.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.2.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/fa911cfcb1982e0577c1d3a7f36a0454.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.2.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.2.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.2.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- transformer.h.3 ------------------------------------------------------
_gm.transformer.h.3.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.3.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/c951bf4ab9b12f759db16896ab547a3c.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.3.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/f4a90368b689674302c9c1057e8de3d4.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.3.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.3.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.3.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.3.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.3.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.3.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.3.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.3.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6ffcfb2adf95138b85c3920b31ca619a.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.3.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/231c6b2eb59e46a3ee76455a6eedd948.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.3.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.3.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.3.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- transformer.h.4 ------------------------------------------------------
_gm.transformer.h.4.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.4.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/defe3e44f3377caa14f35bdf611a2632.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.4.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/3f31ba127de772f7f29c8cca0723ffcc.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.4.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.4.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.4.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.4.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.4.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.4.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.4.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.4.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/33e67804659d50361fc310a868c21097.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.4.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/64b47243fbe8f9dbcd7ecef1d9d10af7.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.4.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.4.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.4.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- transformer.h.5 ------------------------------------------------------
_gm.transformer.h.5.attn.attn_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.5.attn.c_attn:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/4094cdca91aeccbd21edd90fe5ab2b1b.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.5.attn.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/b91a79c86f968d0757761896ae394fa0.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.5.attn.matmul:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.5.attn.matmul_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'ActActMatMul'
  multiplier_format: !Format 'BFP[8|8]{128,-2}(SN)'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.5.attn.resid_dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.5.attn.softmax:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'Softmax'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.5.ln_1:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.5.ln_2:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.h.5.mlp.act:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'GELU'
  output_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.5.mlp.c_fc:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/e279d6c6717b5e276072f366d1ec826e.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.5.mlp.c_proj:
  accum_format: !Format 'SAME'
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'BFP[24|8]{1,-1}(SN)'
  input_format: !Format 'BFP[8|8]{128,-1}(SN)'
  instance: !DmxModule 'Linear'
  output_format: !Format 'FP[1|5|10,15](FN)'
  smoothquant_scale_format: !Format 'SAME'
  state_dict_url: https://huggingface.co/d-matrix/gpt2/blob/distilgpt2/checkpoints/6f08db70eeeb788df7ebe19929a26982.pth
  weight_format: !Format 'BFP[4|8]{128,-1}(SN)'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.h.5.mlp.dropout:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Dropout'
  output_format: !Format 'SAME'

_gm.transformer.h.5.resadd:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

_gm.transformer.h.5.resadd_1:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'ResAdd'
  output_format: !Format 'FP[1|5|10,15](FN)'
  residual_format: !Format 'FP[1|5|10,15](FN)'

# ---- Final layer norm and embeddings --------------------------------------
_gm.transformer.ln_f:
  approximation_function: !ApproximationFunction 'NONE'
  bias_format: !Format 'SAME'
  input_format: !Format 'FP[1|5|10,15](FN)'
  instance: !DmxModule 'LayerNorm'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'

_gm.transformer.wpe:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Embedding'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
  weight_sparseness: !Sparseness 'DENSE'

_gm.transformer.wte:
  approximation_function: !ApproximationFunction 'NONE'
  input_format: !Format 'SAME'
  instance: !DmxModule 'Embedding'
  output_format: !Format 'FP[1|5|10,15](FN)'
  weight_format: !Format 'SAME'
  weight_sparseness: !Sparseness 'DENSE'