diff --git a/README.md b/README.md index a37536e8f5ed6823fcb4c90d1999c90f7037159d..98f55330b606e8b3c6701fe19ab42bf09d3bddcd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ --- +license: mit +library_name: transformers base_model: deepseek-ai/DeepSeek-R1 tags: - mlx @@ -8,7 +10,7 @@ tags: The Model [mlx-community/DeepSeek-R1-2bit](https://huggingface.co/mlx-community/DeepSeek-R1-2bit) was converted to MLX format from [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) -using mlx-lm version **0.21.3**. +using mlx-lm version **0.21.6**. ## Use with mlx @@ -30,4 +32,4 @@ if tokenizer.chat_template is not None: ) response = generate(model, tokenizer, prompt=prompt, verbose=True) -``` \ No newline at end of file +``` diff --git a/config.json b/config.json index aca907ef812080231d7fccc9138bc2b36e112b2a..d4f728568686dac778792ff2758e8f9bc8a4fd1f 100644 --- a/config.json +++ b/config.json @@ -37,11 +37,11 @@ "qk_nope_head_dim": 128, "qk_rope_head_dim": 64, "quantization": { - "group_size": 64, + "group_size": 32, "bits": 2 }, "quantization_config": { - "group_size": 64, + "group_size": 32, "bits": 2 }, "rms_norm_eps": 1e-06, diff --git a/model-00001-of-00050.safetensors b/model-00001-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..995091f5e34d129c4d0b00703fd1b2a4ba3478c3 --- /dev/null +++ b/model-00001-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43e337c38b52393b78dcadda481da149bd127cbbccb18f56cbb5c44e866ec9c4 +size 5348389439 diff --git a/model-00002-of-00050.safetensors b/model-00002-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d8beea30fe84a9e280a6767d46cd82438d2f23df --- /dev/null +++ b/model-00002-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7bf2e89c323378dec8fb179126fd51ce1a5eb5546e926dbe5d0c9ee9d079a41 +size 5301810784 diff --git a/model-00003-of-00050.safetensors b/model-00003-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..24b3ab756a76e50fc8aae84813d6e761f94a6eb3 --- /dev/null +++ b/model-00003-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e9a5194c0a984e4bad2fe58b01d373ecaead62bba957fec6a2e7543d4667d41 +size 4788008353 diff --git a/model-00004-of-00050.safetensors b/model-00004-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8ca3617f4889f6c4d07a955069e6e2cb998c2d5 --- /dev/null +++ b/model-00004-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e69cd1b5870514e64a3e2ff5d5dc83c47c31e5a155013ba565fef2a772abe8d +size 5257770270 diff --git a/model-00005-of-00050.safetensors b/model-00005-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02a831e01ae2ed2cb9314270e49784696c83b583 --- /dev/null +++ b/model-00005-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0baf97afbe7750f98b1cd8a08e215da82823e239d7b290054af57d7812daa4c +size 4788008349 diff --git a/model-00006-of-00050.safetensors b/model-00006-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6cf305aa2e14c85f216e867405d9e955aecb0684 --- /dev/null +++ b/model-00006-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6910aa40aefeccc1c4151f6a7754b747b6a4a652239e11a3fbddbb82a07514d +size 5257770274 diff --git a/model-00007-of-00050.safetensors b/model-00007-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a023bc4f835e928614375547c37ab4c59f11acbc --- /dev/null +++ b/model-00007-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c8ea924ed35e3729c1b97bb7822fc97bb1f198e6d113581687ffb500f23bfa8 +size 4878394783 diff --git a/model-00008-of-00050.safetensors b/model-00008-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f7249bdc13549976a55977e75ad62280e8c99f8f --- /dev/null +++ b/model-00008-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575b20f8a0156b97ec10c102ed6e64bb353c43d78bebc9f29332161228f025c6 +size 5257770230 diff --git a/model-00009-of-00050.safetensors b/model-00009-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1e7359067d2bb0b85b3efe866bc28a547210da9 --- /dev/null +++ b/model-00009-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8703e97c9d8f40905217c092cb161a244acc66d046c91c4e4e94e0c29cd2088f +size 4788008366 diff --git a/model-00010-of-00050.safetensors b/model-00010-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..caa5c6afe99310b96bf49a867416fc246dd18d87 --- /dev/null +++ b/model-00010-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcf7e0178a9d4173661af6c96c29be283856ecaf2c00c398ec9da8f0b5bfde4 +size 5257770278 diff --git a/model-00011-of-00050.safetensors b/model-00011-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cc339f8be9c145a1b8945e833cd496cb06d8471f --- /dev/null +++ b/model-00011-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d1848577227f7826d53145f2ff6bfcee37ebc212b6af3d9b2fed261d2bda39 +size 4788008354 diff --git a/model-00012-of-00050.safetensors b/model-00012-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99be49bb809caa4bab94db596e8986a7fb607271 --- /dev/null +++ b/model-00012-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a9ffe71003b9fd86f4fbc91af44cecc858b5f16b25b06533f25948d3abd566 +size 5257770314 diff --git a/model-00013-of-00050.safetensors b/model-00013-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7f24f5b57ca7220d0f03c133bb6d246568b6a22 --- /dev/null +++ b/model-00013-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e26adf5f2e680c0b3e736ccd72976a935a7aafe5629eca735cfb7357d4a0e4a +size 4878394774 diff --git a/model-00014-of-00050.safetensors b/model-00014-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a76982127604815973402616a4220e5299fb252 --- /dev/null +++ b/model-00014-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09ba84fc7c5d149e2348464f4ad3c7f2fb475bebcb829bbd97f88c038361d171 +size 5257770282 diff --git a/model-00015-of-00050.safetensors b/model-00015-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbd3261aae06bf0971bb9ffe83018deec80024b3 --- /dev/null +++ b/model-00015-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc8b118eab7a665b073b04ba97a25207a4f22c693a652a2eef90ff71f899566 +size 4788008392 diff --git a/model-00016-of-00050.safetensors b/model-00016-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d2cdc6bd76b62994c38c24d13e9b453ef449513e --- /dev/null +++ b/model-00016-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a000a997178a0836cea90ad52364d227e7b5a585d2c20868a2a54b91144d2f +size 5257770240 diff --git a/model-00017-of-00050.safetensors b/model-00017-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc3bf5feeb4a48826119a1047647d064b073c448 --- /dev/null +++ b/model-00017-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da196a44b60a20555a0ab1dae4b0de7ac91c9dfc9d8ce8478437332fc983e2ce +size 4788008390 diff --git a/model-00018-of-00050.safetensors b/model-00018-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b963ba9887273fe06cc55278840a98e87060159 --- /dev/null +++ b/model-00018-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dd85c7c6d23e7ceb9d0041384485ce0640a2bcef82822304f0449e42b57fd3f +size 5257770314 diff --git a/model-00019-of-00050.safetensors b/model-00019-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8eed33e8ce7c2b1a8cf5cb6f36eb79dea3d05b15 --- /dev/null +++ b/model-00019-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8463cd59497a0e65e448b0b6eb2b3cafd136b018f0bbe4d78a044e985d979074 +size 4878394792 diff --git a/model-00020-of-00050.safetensors b/model-00020-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eafe3b44ed80bd67560ae77a770472d83a8e8fad --- /dev/null +++ b/model-00020-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca608aaa2c8fb803975b1481c0ec089d2f587f70c6d7ec18d8f3a48cf6959c08 +size 5257770230 diff --git a/model-00021-of-00050.safetensors b/model-00021-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a00fcb65b00e8acd4d2d5a91f9c7d57fbe00a0b7 --- /dev/null +++ b/model-00021-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d071ce383e7e630dcbdc8a7a22c97f9eab887751034f2581e30523e8d126e88c +size 4788008394 diff --git a/model-00022-of-00050.safetensors b/model-00022-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..495b34186ea99558b0e53d9af4e9e6f8ae5256b3 --- /dev/null +++ b/model-00022-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b7bdf96e6d1003f70f1276d3669a752cb9e6af69eec80c3b9df54332720dca2 +size 5257770268 diff --git a/model-00023-of-00050.safetensors b/model-00023-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de8d1a9810fe61c0e0bdadc44faaa33a1fc800e0 --- /dev/null +++ b/model-00023-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e79abe93e1a727eb1cce684001fd3dbeaa480de81227b7c101e124eaaa0369d6 +size 4788008376 diff --git a/model-00024-of-00050.safetensors b/model-00024-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f09d7aaf0ab2c67562965f75537e464e593d583e --- /dev/null +++ b/model-00024-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176d03c3a8a7a0eec20f869a2fa5fc9be4ba9c0e9c16a7749a423597e5df4cda +size 5257770310 diff --git a/model-00025-of-00050.safetensors b/model-00025-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0e7e467861a1e2da86cda88f0152f185c2562a47 --- /dev/null +++ b/model-00025-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:956183344ba9e3a4a434ccaf453bbbda1941b2d0a1053ceb31e3bd0282519534 +size 4878394780 diff --git a/model-00026-of-00050.safetensors b/model-00026-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..74300603bcccf32adfabb8daefb132598a412ee0 --- /dev/null +++ b/model-00026-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b07dd8e8ffb1321f5e4a6bb1d903c3fb2dcbfd8e734e0e93480bb02dd793a68f +size 5257770226 diff --git a/model-00027-of-00050.safetensors b/model-00027-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cbb464d5af2a86fe9cfd4b3477b3f4ed55d77a18 --- /dev/null +++ b/model-00027-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7c016935cbcd18d701f2559fb9a4773580600392f8c3bbec1962602750c373b +size 4788008354 diff --git a/model-00028-of-00050.safetensors b/model-00028-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..290cbed6f8093b4026dfbd8cf917e3365f715c8f --- /dev/null +++ b/model-00028-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90f1981a0452d647b418352c7eee50c0427a8dcaa9e237851c3fc69aa88c988 +size 5257770300 diff --git a/model-00029-of-00050.safetensors b/model-00029-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f35ee0a88420cd16d35813ff1f2d893010e9d9be --- /dev/null +++ b/model-00029-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f261947fc374b56b4d613612854b794f6ac2076434ea8c44bcd5489f52b512d +size 4788008388 diff --git a/model-00030-of-00050.safetensors b/model-00030-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0cc27a03c6352ed521e2f37783b3db7957c2d02 --- /dev/null +++ b/model-00030-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748ac82f72a8fa816a20cf8ed8814de5b993810f700f2149fae01772e321651d +size 5257770312 diff --git a/model-00031-of-00050.safetensors b/model-00031-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9577ec0f65600d4e7551834201ae7049fcca52e5 --- /dev/null +++ b/model-00031-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b0564754498ba2fed7c9780b9cf318bf0d3eb4bdc62354bfaa9d552c7b20cb6 +size 4878394754 diff --git a/model-00032-of-00050.safetensors b/model-00032-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6dc6c72931769fe87c79537bf836534eefe1b38c --- /dev/null +++ b/model-00032-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b6e20785689cc4fb429afffc79ace249180145a869ff0b729edb47c6972b077 +size 5257770268 diff --git a/model-00033-of-00050.safetensors b/model-00033-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9f2be7dd81154d8a939ffb7d49a43c993ca26dbf --- /dev/null +++ b/model-00033-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6d510fe945a8fb97444afaaa38befb36a2c4c1c7c41545b81a0e3a4f52574a +size 4788008394 diff --git a/model-00034-of-00050.safetensors b/model-00034-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd3aac922e23e4cfe002a2370169b357d68c01d7 --- /dev/null +++ b/model-00034-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f8034c7572dc58cddd90182411b3bfaf59a583a8d828b17b15ed348ef6eda0 +size 5257770312 diff --git a/model-00035-of-00050.safetensors b/model-00035-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b99cfce7cc8ea32065bdf496d963b882e2400932 --- /dev/null +++ b/model-00035-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc05593644e99480cc40b01bba569eb41242d8828651c50a11630d5282053fa +size 4788008376 diff --git a/model-00036-of-00050.safetensors b/model-00036-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a415b2f019f10d984604fa17aca3065fc4313065 --- /dev/null +++ b/model-00036-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd7fa83d6992caa41551a87eceee32195b648d906df92712e92c97742f6f99e7 +size 5257770314 diff --git a/model-00037-of-00050.safetensors b/model-00037-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b47f9e44c200db6610e49f610b0d5aa82defa151 --- /dev/null +++ b/model-00037-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d771f323a15f0b693105279ebc483cb8c6250cafb73f4107df78c34ff96d97 +size 4878394790 diff --git a/model-00038-of-00050.safetensors b/model-00038-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9408a43349f6dd63360342c59cfa72015ed35b2 --- /dev/null +++ b/model-00038-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71ca196aa3a68ee07102af3a3a29fad24a87f2321b159941dd5a853e928d4cb8 +size 5257770284 diff --git a/model-00039-of-00050.safetensors b/model-00039-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec1304e867e3cadccc3bc2cbbbaad54d1006bca5 --- /dev/null +++ b/model-00039-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb97c740278c0d595c2f248944c4020577623315be3334297c25953d9f92b8f7 +size 4788008366 diff --git a/model-00040-of-00050.safetensors b/model-00040-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6f941913682f842c7c5997b69eeb786f28e860bb --- /dev/null +++ b/model-00040-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69bb74e112cd6be9fa48197d35dbd1ffa20298a6194a8671434fb31045475ff1 +size 5257770312 diff --git a/model-00041-of-00050.safetensors b/model-00041-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..626d764575ccbbb79514808aad2b0625ceeb9c90 --- /dev/null +++ b/model-00041-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcd7a1cf5abfb8f15e4527d98462e3888a1efea343ebf7add349d1fb5d0fb46 +size 4788008390 diff --git a/model-00042-of-00050.safetensors b/model-00042-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb2396ef7795605deca6b2e53a816073e4d20b32 --- /dev/null +++ b/model-00042-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e19f0f901e53e0850bf9ecd0feea852b71c12931084b6ecf74dea693efeb9a01 +size 5257770314 diff --git a/model-00043-of-00050.safetensors b/model-00043-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60174b4a1d8222b72f3e60e19864e43bd1976d9d --- /dev/null +++ b/model-00043-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be358e8a10f4960a36c56ff8dc9ae405eb23468a42ad13a6b26419268a54fec0 +size 4878394780 diff --git a/model-00044-of-00050.safetensors b/model-00044-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..01d916faee08155e54a9686d9adc022fe70ca2cc --- /dev/null +++ b/model-00044-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116cdfe97720780703b88520e3b777f6118f872819b4633df45ebfd7d591d3ea +size 5257770238 diff --git a/model-00045-of-00050.safetensors b/model-00045-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..472d6fb6d4ca9ed640131ce0d49f7b5921fe30e5 --- /dev/null +++ b/model-00045-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a705cbc7739d69e97076415a17a7ebf2a844b9b3b18b6519b29eb67e70a06c87 +size 4788008372 diff --git a/model-00046-of-00050.safetensors b/model-00046-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d572e02fca1de80ba069bdbfa597108363ef317 --- /dev/null +++ b/model-00046-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d2b9e138639afc33450da79d49d04e1dcaf49a5dab8d1270e8af9de7b592d2 +size 5257770274 diff --git a/model-00047-of-00050.safetensors b/model-00047-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..512e49fb800c61e70701fde204010dd14da1f2dd --- /dev/null +++ b/model-00047-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f46600d3c47fd71b6cd3c58307f389f6b19a5b41da201aafa8ceb4fc86e6a820 +size 4788008388 diff --git a/model-00048-of-00050.safetensors b/model-00048-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..559f7fe77c8fd6ce182abc9798064bdabd3adee3 --- /dev/null +++ b/model-00048-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2803509d16740128406054670244ee546a5468014ac203ab5bc7e51afbe02141 +size 5257770314 diff --git a/model-00049-of-00050.safetensors b/model-00049-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..199dc72cbe8d79594a86cfa780b39f36493062f4 --- /dev/null +++ b/model-00049-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e42bd96e049846a44f5e98dd7f8a12d8c55efa19ee1c43462f05f09f60b95e13 +size 4878394770 diff --git a/model-00050-of-00050.safetensors b/model-00050-of-00050.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..75d7f581311814cf85b601945684992a25b45a44 --- /dev/null +++ b/model-00050-of-00050.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e17f0133f3d373ca58114a1fc891a322394e4c5cdb4064d73b0daab9fb309ff +size 4595594820 diff --git a/model.safetensors.index.json b/model.safetensors.index.json index d7abc1d4f24d042be8b77e0e7b782ea99249b6ca..f1f718695f4bfd222c37366fc7a8f85e5924737b 100644 --- a/model.safetensors.index.json +++ b/model.safetensors.index.json @@ -1,2360 +1,2360 @@ { "metadata": { - "total_size": 209877081088 + "total_size": 251809516544 }, "weight_map": { - "lm_head.biases": "model-00044-of-00044.safetensors", - "lm_head.scales": "model-00044-of-00044.safetensors", - "lm_head.weight": "model-00044-of-00044.safetensors", - "model.embed_tokens.biases": "model-00001-of-00044.safetensors", - "model.embed_tokens.scales": "model-00001-of-00044.safetensors", - "model.embed_tokens.weight": "model-00001-of-00044.safetensors", - "model.layers.0.input_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.down_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.down_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.down_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.up_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.up_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.mlp.up_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.kv_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.input_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.down_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.down_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.down_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.up_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.up_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.mlp.up_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.kv_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.10.input_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.gate.e_score_correction_bias": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.gate.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.10.post_attention_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_b_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_b_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.kv_b_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.o_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.o_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.o_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_a_layernorm.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_a_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_a_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_a_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_b_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_b_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.10.self_attn.q_b_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.11.input_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.gate.e_score_correction_bias": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.gate.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.post_attention_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_b_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_b_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.kv_b_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.o_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.o_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.o_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_a_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_a_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_a_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_a_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_b_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_b_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.11.self_attn.q_b_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.12.input_layernorm.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.gate.e_score_correction_bias": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.gate.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.12.post_attention_layernorm.weight": "model-00008-of-00044.safetensors", - "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_b_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_b_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.kv_b_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.o_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.o_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.o_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_a_layernorm.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_a_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_a_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_a_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_b_proj.biases": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_b_proj.scales": "model-00007-of-00044.safetensors", - "model.layers.12.self_attn.q_b_proj.weight": "model-00007-of-00044.safetensors", - "model.layers.13.input_layernorm.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.gate.e_score_correction_bias": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.gate.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.13.post_attention_layernorm.weight": "model-00009-of-00044.safetensors", - "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_b_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_b_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.kv_b_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.o_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.o_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.o_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_a_layernorm.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_a_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_a_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_a_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_b_proj.biases": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_b_proj.scales": "model-00008-of-00044.safetensors", - "model.layers.13.self_attn.q_b_proj.weight": "model-00008-of-00044.safetensors", - "model.layers.14.input_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.gate.e_score_correction_bias": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.gate.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.14.post_attention_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_b_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_b_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.kv_b_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.o_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.o_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.o_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_a_layernorm.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_a_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_a_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_a_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_b_proj.biases": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_b_proj.scales": "model-00009-of-00044.safetensors", - "model.layers.14.self_attn.q_b_proj.weight": "model-00009-of-00044.safetensors", - "model.layers.15.input_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.gate.e_score_correction_bias": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.gate.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.post_attention_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_b_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_b_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.kv_b_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.o_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.o_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.o_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_a_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_a_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_a_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_a_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_b_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_b_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.15.self_attn.q_b_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.16.input_layernorm.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.gate.e_score_correction_bias": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.gate.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.16.post_attention_layernorm.weight": "model-00011-of-00044.safetensors", - "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_b_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_b_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.kv_b_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.o_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.o_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.o_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_a_layernorm.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_a_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_a_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_a_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_b_proj.biases": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_b_proj.scales": "model-00010-of-00044.safetensors", - "model.layers.16.self_attn.q_b_proj.weight": "model-00010-of-00044.safetensors", - "model.layers.17.input_layernorm.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.gate.e_score_correction_bias": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.gate.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.17.post_attention_layernorm.weight": "model-00012-of-00044.safetensors", - "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_b_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_b_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.kv_b_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.o_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.o_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.o_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_a_layernorm.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_a_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_a_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_a_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_b_proj.biases": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_b_proj.scales": "model-00011-of-00044.safetensors", - "model.layers.17.self_attn.q_b_proj.weight": "model-00011-of-00044.safetensors", - "model.layers.18.input_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.gate.e_score_correction_bias": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.gate.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.18.post_attention_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_b_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_b_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.kv_b_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.o_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.o_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.o_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_a_layernorm.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_a_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_a_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_a_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_b_proj.biases": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_b_proj.scales": "model-00012-of-00044.safetensors", - "model.layers.18.self_attn.q_b_proj.weight": "model-00012-of-00044.safetensors", - "model.layers.19.input_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.gate.e_score_correction_bias": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.gate.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.post_attention_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_b_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_b_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.kv_b_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.o_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.o_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.o_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_a_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_a_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_a_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_a_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_b_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_b_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.19.self_attn.q_b_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.2.input_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.down_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.down_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.down_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.up_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.up_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.mlp.up_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.kv_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.20.input_layernorm.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.gate.e_score_correction_bias": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.gate.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.20.post_attention_layernorm.weight": "model-00014-of-00044.safetensors", - "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_b_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_b_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.kv_b_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.o_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.o_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.o_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_a_layernorm.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_a_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_a_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_a_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_b_proj.biases": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_b_proj.scales": "model-00013-of-00044.safetensors", - "model.layers.20.self_attn.q_b_proj.weight": "model-00013-of-00044.safetensors", - "model.layers.21.input_layernorm.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.gate.e_score_correction_bias": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.gate.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.21.post_attention_layernorm.weight": "model-00015-of-00044.safetensors", - "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_b_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_b_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.kv_b_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.o_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.o_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.o_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_a_layernorm.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_a_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_a_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_a_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_b_proj.biases": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_b_proj.scales": "model-00014-of-00044.safetensors", - "model.layers.21.self_attn.q_b_proj.weight": "model-00014-of-00044.safetensors", - "model.layers.22.input_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.gate.e_score_correction_bias": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.gate.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.22.post_attention_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_b_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_b_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.kv_b_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.o_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.o_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.o_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_a_layernorm.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_a_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_a_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_a_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_b_proj.biases": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_b_proj.scales": "model-00015-of-00044.safetensors", - "model.layers.22.self_attn.q_b_proj.weight": "model-00015-of-00044.safetensors", - "model.layers.23.input_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.gate.e_score_correction_bias": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.gate.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.post_attention_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_b_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_b_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.kv_b_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.o_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.o_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.o_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_a_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_a_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_a_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_a_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_b_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_b_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.23.self_attn.q_b_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.24.input_layernorm.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.gate.e_score_correction_bias": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.gate.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.24.post_attention_layernorm.weight": "model-00017-of-00044.safetensors", - "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_b_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_b_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.kv_b_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.o_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.o_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.o_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_a_layernorm.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_a_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_a_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_a_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_b_proj.biases": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_b_proj.scales": "model-00016-of-00044.safetensors", - "model.layers.24.self_attn.q_b_proj.weight": "model-00016-of-00044.safetensors", - "model.layers.25.input_layernorm.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.gate.e_score_correction_bias": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.gate.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.25.post_attention_layernorm.weight": "model-00018-of-00044.safetensors", - "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_b_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_b_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.kv_b_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.o_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.o_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.o_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_a_layernorm.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_a_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_a_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_a_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_b_proj.biases": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_b_proj.scales": "model-00017-of-00044.safetensors", - "model.layers.25.self_attn.q_b_proj.weight": "model-00017-of-00044.safetensors", - "model.layers.26.input_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.gate.e_score_correction_bias": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.gate.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.26.post_attention_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_b_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_b_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.kv_b_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.o_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.o_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.o_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_a_layernorm.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_a_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_a_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_a_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_b_proj.biases": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_b_proj.scales": "model-00018-of-00044.safetensors", - "model.layers.26.self_attn.q_b_proj.weight": "model-00018-of-00044.safetensors", - "model.layers.27.input_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.gate.e_score_correction_bias": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.gate.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.post_attention_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_b_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_b_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.kv_b_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.o_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.o_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.o_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_a_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_a_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_a_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_a_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_b_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_b_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.27.self_attn.q_b_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.28.input_layernorm.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.gate.e_score_correction_bias": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.gate.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.28.post_attention_layernorm.weight": "model-00020-of-00044.safetensors", - "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_b_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_b_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.kv_b_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.o_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.o_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.o_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_a_layernorm.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_a_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_a_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_a_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_b_proj.biases": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_b_proj.scales": "model-00019-of-00044.safetensors", - "model.layers.28.self_attn.q_b_proj.weight": "model-00019-of-00044.safetensors", - "model.layers.29.input_layernorm.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.gate.e_score_correction_bias": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.gate.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.29.post_attention_layernorm.weight": "model-00021-of-00044.safetensors", - "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_b_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_b_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.kv_b_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.o_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.o_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.o_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_a_layernorm.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_a_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_a_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_a_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_b_proj.biases": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_b_proj.scales": "model-00020-of-00044.safetensors", - "model.layers.29.self_attn.q_b_proj.weight": "model-00020-of-00044.safetensors", - "model.layers.3.input_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.gate.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.kv_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.30.input_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.gate.e_score_correction_bias": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.gate.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.30.post_attention_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_b_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_b_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.kv_b_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.o_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.o_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.o_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_a_layernorm.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_a_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_a_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_a_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_b_proj.biases": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_b_proj.scales": "model-00021-of-00044.safetensors", - "model.layers.30.self_attn.q_b_proj.weight": "model-00021-of-00044.safetensors", - "model.layers.31.input_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.gate.e_score_correction_bias": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.gate.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.post_attention_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_b_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_b_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.kv_b_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.o_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.o_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.o_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_a_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_a_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_a_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_a_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_b_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_b_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.31.self_attn.q_b_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.32.input_layernorm.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.gate.e_score_correction_bias": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.gate.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.32.post_attention_layernorm.weight": "model-00023-of-00044.safetensors", - "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_b_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_b_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.kv_b_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.o_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.o_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.o_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_a_layernorm.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_a_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_a_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_a_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_b_proj.biases": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_b_proj.scales": "model-00022-of-00044.safetensors", - "model.layers.32.self_attn.q_b_proj.weight": "model-00022-of-00044.safetensors", - "model.layers.33.input_layernorm.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.gate.e_score_correction_bias": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.gate.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.33.post_attention_layernorm.weight": "model-00024-of-00044.safetensors", - "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_b_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_b_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.kv_b_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.o_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.o_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.o_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_a_layernorm.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_a_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_a_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_a_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_b_proj.biases": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_b_proj.scales": "model-00023-of-00044.safetensors", - "model.layers.33.self_attn.q_b_proj.weight": "model-00023-of-00044.safetensors", - "model.layers.34.input_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.gate.e_score_correction_bias": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.gate.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.34.post_attention_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_b_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_b_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.kv_b_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.o_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.o_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.o_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_a_layernorm.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_a_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_a_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_a_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_b_proj.biases": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_b_proj.scales": "model-00024-of-00044.safetensors", - "model.layers.34.self_attn.q_b_proj.weight": "model-00024-of-00044.safetensors", - "model.layers.35.input_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.gate.e_score_correction_bias": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.gate.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.post_attention_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_b_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_b_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.kv_b_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.o_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.o_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.o_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_a_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_a_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_a_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_a_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_b_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_b_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.35.self_attn.q_b_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.36.input_layernorm.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.gate.e_score_correction_bias": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.gate.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.36.post_attention_layernorm.weight": "model-00026-of-00044.safetensors", - "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_b_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_b_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.kv_b_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.o_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.o_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.o_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_a_layernorm.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_a_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_a_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_a_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_b_proj.biases": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_b_proj.scales": "model-00025-of-00044.safetensors", - "model.layers.36.self_attn.q_b_proj.weight": "model-00025-of-00044.safetensors", - "model.layers.37.input_layernorm.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.gate.e_score_correction_bias": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.gate.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.down_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.37.post_attention_layernorm.weight": "model-00027-of-00044.safetensors", - "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_b_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_b_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.kv_b_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.o_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.o_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.o_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_a_layernorm.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_a_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_a_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_a_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_b_proj.biases": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_b_proj.scales": "model-00026-of-00044.safetensors", - "model.layers.37.self_attn.q_b_proj.weight": "model-00026-of-00044.safetensors", - "model.layers.38.input_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.gate.e_score_correction_bias": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.gate.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.38.post_attention_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_b_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_b_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.kv_b_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.o_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.o_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.o_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_a_layernorm.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_a_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_a_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_a_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_b_proj.biases": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_b_proj.scales": "model-00027-of-00044.safetensors", - "model.layers.38.self_attn.q_b_proj.weight": "model-00027-of-00044.safetensors", - "model.layers.39.input_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.gate.e_score_correction_bias": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.gate.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.post_attention_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_b_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_b_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.kv_b_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.o_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.o_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.o_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_a_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_a_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_a_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_a_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_b_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_b_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.39.self_attn.q_b_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.4.input_layernorm.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.gate.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00044.safetensors", - "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.kv_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.o_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.o_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_a_layernorm.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_a_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_a_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_a_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_b_proj.biases": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_b_proj.scales": "model-00001-of-00044.safetensors", - "model.layers.4.self_attn.q_b_proj.weight": "model-00001-of-00044.safetensors", - "model.layers.40.input_layernorm.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.gate.e_score_correction_bias": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.gate.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.40.post_attention_layernorm.weight": "model-00029-of-00044.safetensors", - "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_b_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_b_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.kv_b_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.o_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.o_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.o_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_a_layernorm.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_a_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_a_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_a_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_b_proj.biases": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_b_proj.scales": "model-00028-of-00044.safetensors", - "model.layers.40.self_attn.q_b_proj.weight": "model-00028-of-00044.safetensors", - "model.layers.41.input_layernorm.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.gate.e_score_correction_bias": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.gate.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.41.post_attention_layernorm.weight": "model-00030-of-00044.safetensors", - "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_b_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_b_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.kv_b_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.o_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.o_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.o_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_a_layernorm.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_a_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_a_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_a_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_b_proj.biases": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_b_proj.scales": "model-00029-of-00044.safetensors", - "model.layers.41.self_attn.q_b_proj.weight": "model-00029-of-00044.safetensors", - "model.layers.42.input_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.gate.e_score_correction_bias": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.gate.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.42.post_attention_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_b_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_b_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.kv_b_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.o_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.o_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.o_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_a_layernorm.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_a_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_a_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_a_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_b_proj.biases": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_b_proj.scales": "model-00030-of-00044.safetensors", - "model.layers.42.self_attn.q_b_proj.weight": "model-00030-of-00044.safetensors", - "model.layers.43.input_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.gate.e_score_correction_bias": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.gate.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.post_attention_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_b_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_b_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.kv_b_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.o_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.o_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.o_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_a_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_a_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_a_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_a_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_b_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_b_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.43.self_attn.q_b_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.44.input_layernorm.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.gate.e_score_correction_bias": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.gate.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.44.post_attention_layernorm.weight": "model-00032-of-00044.safetensors", - "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_b_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_b_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.kv_b_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.o_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.o_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.o_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_a_layernorm.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_a_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_a_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_a_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_b_proj.biases": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_b_proj.scales": "model-00031-of-00044.safetensors", - "model.layers.44.self_attn.q_b_proj.weight": "model-00031-of-00044.safetensors", - "model.layers.45.input_layernorm.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.gate.e_score_correction_bias": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.gate.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.45.post_attention_layernorm.weight": "model-00033-of-00044.safetensors", - "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_b_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_b_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.kv_b_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.o_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.o_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.o_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_a_layernorm.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_a_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_a_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_a_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_b_proj.biases": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_b_proj.scales": "model-00032-of-00044.safetensors", - "model.layers.45.self_attn.q_b_proj.weight": "model-00032-of-00044.safetensors", - "model.layers.46.input_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.gate.e_score_correction_bias": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.gate.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.46.post_attention_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_b_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_b_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.kv_b_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.o_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.o_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.o_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_a_layernorm.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_a_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_a_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_a_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_b_proj.biases": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_b_proj.scales": "model-00033-of-00044.safetensors", - "model.layers.46.self_attn.q_b_proj.weight": "model-00033-of-00044.safetensors", - "model.layers.47.input_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.gate.e_score_correction_bias": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.gate.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.post_attention_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_b_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_b_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.kv_b_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.o_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.o_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.o_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_a_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_a_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_a_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_a_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_b_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_b_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.47.self_attn.q_b_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.48.input_layernorm.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.gate.e_score_correction_bias": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.gate.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.48.post_attention_layernorm.weight": "model-00035-of-00044.safetensors", - "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_b_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_b_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.kv_b_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.o_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.o_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.o_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_a_layernorm.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_a_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_a_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_a_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_b_proj.biases": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_b_proj.scales": "model-00034-of-00044.safetensors", - "model.layers.48.self_attn.q_b_proj.weight": "model-00034-of-00044.safetensors", - "model.layers.49.input_layernorm.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.gate.e_score_correction_bias": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.gate.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.49.post_attention_layernorm.weight": "model-00036-of-00044.safetensors", - "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_b_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_b_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.kv_b_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.o_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.o_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.o_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_a_layernorm.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_a_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_a_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_a_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_b_proj.biases": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_b_proj.scales": "model-00035-of-00044.safetensors", - "model.layers.49.self_attn.q_b_proj.weight": "model-00035-of-00044.safetensors", - "model.layers.5.input_layernorm.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.gate.e_score_correction_bias": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.gate.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.down_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00044.safetensors", - "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_b_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_b_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.kv_b_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.o_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.o_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_a_layernorm.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_a_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_a_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_a_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_b_proj.biases": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_b_proj.scales": "model-00002-of-00044.safetensors", - "model.layers.5.self_attn.q_b_proj.weight": "model-00002-of-00044.safetensors", - "model.layers.50.input_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.gate.e_score_correction_bias": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.gate.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.50.post_attention_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_b_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_b_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.kv_b_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.o_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.o_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.o_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_a_layernorm.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_a_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_a_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_a_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_b_proj.biases": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_b_proj.scales": "model-00036-of-00044.safetensors", - "model.layers.50.self_attn.q_b_proj.weight": "model-00036-of-00044.safetensors", - "model.layers.51.input_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.gate.e_score_correction_bias": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.gate.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.post_attention_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_b_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_b_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.kv_b_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.o_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.o_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.o_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_a_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_a_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_a_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_a_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_b_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_b_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.51.self_attn.q_b_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.52.input_layernorm.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.gate.e_score_correction_bias": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.gate.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.52.post_attention_layernorm.weight": "model-00038-of-00044.safetensors", - "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_b_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_b_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.kv_b_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.o_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.o_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.o_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_a_layernorm.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_a_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_a_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_a_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_b_proj.biases": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_b_proj.scales": "model-00037-of-00044.safetensors", - "model.layers.52.self_attn.q_b_proj.weight": "model-00037-of-00044.safetensors", - "model.layers.53.input_layernorm.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.gate.e_score_correction_bias": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.gate.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.53.post_attention_layernorm.weight": "model-00039-of-00044.safetensors", - "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_b_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_b_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.kv_b_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.o_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.o_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.o_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_a_layernorm.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_a_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_a_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_a_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_b_proj.biases": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_b_proj.scales": "model-00038-of-00044.safetensors", - "model.layers.53.self_attn.q_b_proj.weight": "model-00038-of-00044.safetensors", - "model.layers.54.input_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.gate.e_score_correction_bias": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.gate.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.54.post_attention_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_b_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_b_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.kv_b_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.o_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.o_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.o_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_a_layernorm.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_a_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_a_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_a_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_b_proj.biases": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_b_proj.scales": "model-00039-of-00044.safetensors", - "model.layers.54.self_attn.q_b_proj.weight": "model-00039-of-00044.safetensors", - "model.layers.55.input_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.gate.e_score_correction_bias": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.gate.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.post_attention_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_b_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_b_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.kv_b_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.o_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.o_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.o_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_a_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_a_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_a_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_a_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_b_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_b_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.55.self_attn.q_b_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.56.input_layernorm.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.gate.e_score_correction_bias": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.gate.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.56.post_attention_layernorm.weight": "model-00041-of-00044.safetensors", - "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_b_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_b_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.kv_b_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.o_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.o_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.o_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_a_layernorm.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_a_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_a_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_a_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_b_proj.biases": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_b_proj.scales": "model-00040-of-00044.safetensors", - "model.layers.56.self_attn.q_b_proj.weight": "model-00040-of-00044.safetensors", - "model.layers.57.input_layernorm.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.gate.e_score_correction_bias": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.gate.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.57.post_attention_layernorm.weight": "model-00042-of-00044.safetensors", - "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_b_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_b_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.kv_b_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.o_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.o_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.o_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_a_layernorm.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_a_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_a_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_a_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_b_proj.biases": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_b_proj.scales": "model-00041-of-00044.safetensors", - "model.layers.57.self_attn.q_b_proj.weight": "model-00041-of-00044.safetensors", - "model.layers.58.input_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.gate.e_score_correction_bias": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.gate.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.58.post_attention_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_b_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_b_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.kv_b_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.o_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.o_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.o_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_a_layernorm.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_a_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_a_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_a_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_b_proj.biases": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_b_proj.scales": "model-00042-of-00044.safetensors", - "model.layers.58.self_attn.q_b_proj.weight": "model-00042-of-00044.safetensors", - "model.layers.59.input_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.gate.e_score_correction_bias": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.gate.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.post_attention_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_b_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_b_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.kv_b_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.o_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.o_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.o_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_a_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_a_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_a_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_a_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_b_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_b_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.59.self_attn.q_b_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.6.input_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.gate.e_score_correction_bias": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.gate.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_b_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_b_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.kv_b_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.o_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.o_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_a_layernorm.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_a_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_a_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_a_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_b_proj.biases": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_b_proj.scales": "model-00003-of-00044.safetensors", - "model.layers.6.self_attn.q_b_proj.weight": "model-00003-of-00044.safetensors", - "model.layers.60.input_layernorm.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.gate.e_score_correction_bias": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.gate.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00044-of-00044.safetensors", - "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00044-of-00044.safetensors", - "model.layers.60.post_attention_layernorm.weight": "model-00044-of-00044.safetensors", - "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_b_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_b_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.kv_b_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.o_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.o_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.o_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_a_layernorm.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_a_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_a_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_a_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_b_proj.biases": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_b_proj.scales": "model-00043-of-00044.safetensors", - "model.layers.60.self_attn.q_b_proj.weight": "model-00043-of-00044.safetensors", - "model.layers.7.input_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.gate.e_score_correction_bias": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.gate.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_b_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_b_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.kv_b_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.o_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.o_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_a_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_a_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_a_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_a_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_b_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_b_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.7.self_attn.q_b_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.8.input_layernorm.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.gate.e_score_correction_bias": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.gate.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.8.post_attention_layernorm.weight": "model-00005-of-00044.safetensors", - "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_b_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_b_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.kv_b_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.o_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.o_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_a_layernorm.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_a_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_a_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_a_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_b_proj.biases": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_b_proj.scales": "model-00004-of-00044.safetensors", - "model.layers.8.self_attn.q_b_proj.weight": "model-00004-of-00044.safetensors", - "model.layers.9.input_layernorm.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.gate.e_score_correction_bias": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.gate.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00006-of-00044.safetensors", - "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00006-of-00044.safetensors", - "model.layers.9.post_attention_layernorm.weight": "model-00006-of-00044.safetensors", - "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_b_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_b_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.kv_b_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.o_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.o_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.o_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_a_layernorm.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_a_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_a_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_a_proj.weight": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_b_proj.biases": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_b_proj.scales": "model-00005-of-00044.safetensors", - "model.layers.9.self_attn.q_b_proj.weight": "model-00005-of-00044.safetensors", - "model.norm.weight": "model-00044-of-00044.safetensors" + "lm_head.biases": "model-00050-of-00050.safetensors", + "lm_head.scales": "model-00050-of-00050.safetensors", + "lm_head.weight": "model-00050-of-00050.safetensors", + "model.embed_tokens.biases": "model-00001-of-00050.safetensors", + "model.embed_tokens.scales": "model-00001-of-00050.safetensors", + "model.embed_tokens.weight": "model-00001-of-00050.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.down_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.down_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.up_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.up_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.kv_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.down_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.down_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.up_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.up_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.kv_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.10.input_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.gate.e_score_correction_bias": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.gate.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_b_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_b_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.kv_b_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.o_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.o_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_a_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_a_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_a_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_a_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_b_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_b_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.10.self_attn.q_b_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.11.input_layernorm.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.gate.e_score_correction_bias": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.gate.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00008-of-00050.safetensors", + "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_b_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_b_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.kv_b_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.o_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.o_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_a_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_a_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_a_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_a_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_b_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_b_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.11.self_attn.q_b_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.12.input_layernorm.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.gate.e_score_correction_bias": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.gate.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00009-of-00050.safetensors", + "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_b_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_b_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.kv_b_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.o_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.o_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_a_layernorm.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_a_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_a_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_a_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_b_proj.biases": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_b_proj.scales": "model-00008-of-00050.safetensors", + "model.layers.12.self_attn.q_b_proj.weight": "model-00008-of-00050.safetensors", + "model.layers.13.input_layernorm.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.gate.e_score_correction_bias": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.gate.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00010-of-00050.safetensors", + "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_b_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_b_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.kv_b_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.o_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.o_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_a_layernorm.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_a_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_a_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_a_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_b_proj.biases": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_b_proj.scales": "model-00009-of-00050.safetensors", + "model.layers.13.self_attn.q_b_proj.weight": "model-00009-of-00050.safetensors", + "model.layers.14.input_layernorm.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.gate.e_score_correction_bias": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.gate.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00011-of-00050.safetensors", + "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_b_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_b_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.kv_b_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.o_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.o_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_a_layernorm.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_a_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_a_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_a_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_b_proj.biases": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_b_proj.scales": "model-00010-of-00050.safetensors", + "model.layers.14.self_attn.q_b_proj.weight": "model-00010-of-00050.safetensors", + "model.layers.15.input_layernorm.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.gate.e_score_correction_bias": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.gate.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00012-of-00050.safetensors", + "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_b_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_b_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.kv_b_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.o_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.o_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_a_layernorm.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_a_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_a_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_a_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_b_proj.biases": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_b_proj.scales": "model-00011-of-00050.safetensors", + "model.layers.15.self_attn.q_b_proj.weight": "model-00011-of-00050.safetensors", + "model.layers.16.input_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.gate.e_score_correction_bias": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.gate.weight": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_b_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_b_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.kv_b_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.o_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.o_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_a_layernorm.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_a_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_a_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_a_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_b_proj.biases": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_b_proj.scales": "model-00012-of-00050.safetensors", + "model.layers.16.self_attn.q_b_proj.weight": "model-00012-of-00050.safetensors", + "model.layers.17.input_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.gate.e_score_correction_bias": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.gate.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_b_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_b_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.kv_b_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.o_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.o_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_a_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_a_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_a_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_a_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_b_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_b_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.17.self_attn.q_b_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.18.input_layernorm.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.gate.e_score_correction_bias": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.gate.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00014-of-00050.safetensors", + "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_b_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_b_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.kv_b_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.o_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.o_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_a_layernorm.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_a_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_a_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_a_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_b_proj.biases": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_b_proj.scales": "model-00013-of-00050.safetensors", + "model.layers.18.self_attn.q_b_proj.weight": "model-00013-of-00050.safetensors", + "model.layers.19.input_layernorm.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.gate.e_score_correction_bias": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.gate.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00015-of-00050.safetensors", + "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_b_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_b_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.kv_b_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.o_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.o_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_a_layernorm.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_a_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_a_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_a_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_b_proj.biases": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_b_proj.scales": "model-00014-of-00050.safetensors", + "model.layers.19.self_attn.q_b_proj.weight": "model-00014-of-00050.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.down_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.down_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.up_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.up_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.kv_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.20.input_layernorm.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.gate.e_score_correction_bias": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.gate.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00016-of-00050.safetensors", + "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_b_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_b_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.kv_b_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.o_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.o_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_a_layernorm.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_a_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_a_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_a_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_b_proj.biases": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_b_proj.scales": "model-00015-of-00050.safetensors", + "model.layers.20.self_attn.q_b_proj.weight": "model-00015-of-00050.safetensors", + "model.layers.21.input_layernorm.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.gate.e_score_correction_bias": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.gate.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00017-of-00050.safetensors", + "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_b_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_b_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.kv_b_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.o_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.o_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_a_layernorm.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_a_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_a_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_a_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_b_proj.biases": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_b_proj.scales": "model-00016-of-00050.safetensors", + "model.layers.21.self_attn.q_b_proj.weight": "model-00016-of-00050.safetensors", + "model.layers.22.input_layernorm.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.gate.e_score_correction_bias": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.gate.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00018-of-00050.safetensors", + "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_b_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_b_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.kv_b_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.o_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.o_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_a_layernorm.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_a_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_a_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_a_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_b_proj.biases": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_b_proj.scales": "model-00017-of-00050.safetensors", + "model.layers.22.self_attn.q_b_proj.weight": "model-00017-of-00050.safetensors", + "model.layers.23.input_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.gate.e_score_correction_bias": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.gate.weight": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_b_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_b_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.kv_b_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.o_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.o_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_a_layernorm.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_a_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_a_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_a_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_b_proj.biases": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_b_proj.scales": "model-00018-of-00050.safetensors", + "model.layers.23.self_attn.q_b_proj.weight": "model-00018-of-00050.safetensors", + "model.layers.24.input_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.gate.e_score_correction_bias": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.gate.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_b_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_b_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.kv_b_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.o_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.o_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_a_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_a_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_a_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_a_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_b_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_b_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.24.self_attn.q_b_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.25.input_layernorm.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.gate.e_score_correction_bias": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.gate.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00020-of-00050.safetensors", + "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_b_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_b_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.kv_b_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.o_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.o_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_a_layernorm.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_a_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_a_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_a_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_b_proj.biases": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_b_proj.scales": "model-00019-of-00050.safetensors", + "model.layers.25.self_attn.q_b_proj.weight": "model-00019-of-00050.safetensors", + "model.layers.26.input_layernorm.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.gate.e_score_correction_bias": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.gate.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00021-of-00050.safetensors", + "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_b_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_b_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.kv_b_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.o_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.o_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_a_layernorm.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_a_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_a_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_a_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_b_proj.biases": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_b_proj.scales": "model-00020-of-00050.safetensors", + "model.layers.26.self_attn.q_b_proj.weight": "model-00020-of-00050.safetensors", + "model.layers.27.input_layernorm.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.gate.e_score_correction_bias": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.gate.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00022-of-00050.safetensors", + "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_b_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_b_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.kv_b_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.o_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.o_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_a_layernorm.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_a_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_a_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_a_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_b_proj.biases": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_b_proj.scales": "model-00021-of-00050.safetensors", + "model.layers.27.self_attn.q_b_proj.weight": "model-00021-of-00050.safetensors", + "model.layers.28.input_layernorm.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.gate.e_score_correction_bias": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.gate.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00023-of-00050.safetensors", + "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_b_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_b_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.kv_b_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.o_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.o_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_a_layernorm.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_a_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_a_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_a_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_b_proj.biases": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_b_proj.scales": "model-00022-of-00050.safetensors", + "model.layers.28.self_attn.q_b_proj.weight": "model-00022-of-00050.safetensors", + "model.layers.29.input_layernorm.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.gate.e_score_correction_bias": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.gate.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00024-of-00050.safetensors", + "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_b_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_b_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.kv_b_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.o_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.o_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_a_layernorm.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_a_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_a_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_a_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_b_proj.biases": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_b_proj.scales": "model-00023-of-00050.safetensors", + "model.layers.29.self_attn.q_b_proj.weight": "model-00023-of-00050.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.gate.e_score_correction_bias": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.gate.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.kv_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.30.input_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.gate.e_score_correction_bias": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.gate.weight": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_b_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_b_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.kv_b_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.o_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.o_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_a_layernorm.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_a_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_a_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_a_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_b_proj.biases": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_b_proj.scales": "model-00024-of-00050.safetensors", + "model.layers.30.self_attn.q_b_proj.weight": "model-00024-of-00050.safetensors", + "model.layers.31.input_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.gate.e_score_correction_bias": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.gate.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_b_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_b_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.kv_b_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.o_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.o_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_a_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_a_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_a_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_a_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_b_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_b_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.31.self_attn.q_b_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.32.input_layernorm.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.gate.e_score_correction_bias": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.gate.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00026-of-00050.safetensors", + "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_b_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_b_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.kv_b_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.o_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.o_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_a_layernorm.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_a_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_a_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_a_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_b_proj.biases": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_b_proj.scales": "model-00025-of-00050.safetensors", + "model.layers.32.self_attn.q_b_proj.weight": "model-00025-of-00050.safetensors", + "model.layers.33.input_layernorm.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.gate.e_score_correction_bias": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.gate.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00027-of-00050.safetensors", + "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_b_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_b_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.kv_b_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.o_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.o_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_a_layernorm.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_a_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_a_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_a_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_b_proj.biases": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_b_proj.scales": "model-00026-of-00050.safetensors", + "model.layers.33.self_attn.q_b_proj.weight": "model-00026-of-00050.safetensors", + "model.layers.34.input_layernorm.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.gate.e_score_correction_bias": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.gate.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00028-of-00050.safetensors", + "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_b_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_b_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.kv_b_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.o_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.o_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_a_layernorm.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_a_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_a_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_a_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_b_proj.biases": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_b_proj.scales": "model-00027-of-00050.safetensors", + "model.layers.34.self_attn.q_b_proj.weight": "model-00027-of-00050.safetensors", + "model.layers.35.input_layernorm.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.gate.e_score_correction_bias": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.gate.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00029-of-00050.safetensors", + "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_b_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_b_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.kv_b_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.o_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.o_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_a_layernorm.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_a_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_a_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_a_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_b_proj.biases": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_b_proj.scales": "model-00028-of-00050.safetensors", + "model.layers.35.self_attn.q_b_proj.weight": "model-00028-of-00050.safetensors", + "model.layers.36.input_layernorm.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.gate.e_score_correction_bias": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.gate.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00030-of-00050.safetensors", + "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_b_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_b_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.kv_b_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.o_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.o_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_a_layernorm.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_a_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_a_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_a_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_b_proj.biases": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_b_proj.scales": "model-00029-of-00050.safetensors", + "model.layers.36.self_attn.q_b_proj.weight": "model-00029-of-00050.safetensors", + "model.layers.37.input_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.gate.e_score_correction_bias": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.gate.weight": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_b_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_b_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.kv_b_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.o_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.o_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_a_layernorm.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_a_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_a_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_a_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_b_proj.biases": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_b_proj.scales": "model-00030-of-00050.safetensors", + "model.layers.37.self_attn.q_b_proj.weight": "model-00030-of-00050.safetensors", + "model.layers.38.input_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.gate.e_score_correction_bias": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.gate.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_b_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_b_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.kv_b_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.o_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.o_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_a_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_a_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_a_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_a_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_b_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_b_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.38.self_attn.q_b_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.39.input_layernorm.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.gate.e_score_correction_bias": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.gate.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00032-of-00050.safetensors", + "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_b_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_b_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.kv_b_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.o_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.o_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_a_layernorm.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_a_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_a_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_a_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_b_proj.biases": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_b_proj.scales": "model-00031-of-00050.safetensors", + "model.layers.39.self_attn.q_b_proj.weight": "model-00031-of-00050.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.gate.e_score_correction_bias": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.gate.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00050.safetensors", + "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.kv_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.o_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.4.self_attn.o_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.4.self_attn.q_a_layernorm.weight": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_a_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_a_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_a_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_b_proj.biases": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_b_proj.scales": "model-00001-of-00050.safetensors", + "model.layers.4.self_attn.q_b_proj.weight": "model-00001-of-00050.safetensors", + "model.layers.40.input_layernorm.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.gate.e_score_correction_bias": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.gate.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00033-of-00050.safetensors", + "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_b_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_b_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.kv_b_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.o_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.o_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_a_layernorm.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_a_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_a_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_a_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_b_proj.biases": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_b_proj.scales": "model-00032-of-00050.safetensors", + "model.layers.40.self_attn.q_b_proj.weight": "model-00032-of-00050.safetensors", + "model.layers.41.input_layernorm.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.gate.e_score_correction_bias": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.gate.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00034-of-00050.safetensors", + "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_b_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_b_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.kv_b_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.o_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.o_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_a_layernorm.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_a_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_a_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_a_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_b_proj.biases": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_b_proj.scales": "model-00033-of-00050.safetensors", + "model.layers.41.self_attn.q_b_proj.weight": "model-00033-of-00050.safetensors", + "model.layers.42.input_layernorm.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.gate.e_score_correction_bias": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.gate.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00035-of-00050.safetensors", + "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_b_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_b_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.kv_b_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.o_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.o_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_a_layernorm.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_a_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_a_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_a_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_b_proj.biases": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_b_proj.scales": "model-00034-of-00050.safetensors", + "model.layers.42.self_attn.q_b_proj.weight": "model-00034-of-00050.safetensors", + "model.layers.43.input_layernorm.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.gate.e_score_correction_bias": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.gate.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00036-of-00050.safetensors", + "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_b_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_b_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.kv_b_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.o_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.o_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_a_layernorm.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_a_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_a_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_a_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_b_proj.biases": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_b_proj.scales": "model-00035-of-00050.safetensors", + "model.layers.43.self_attn.q_b_proj.weight": "model-00035-of-00050.safetensors", + "model.layers.44.input_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.gate.e_score_correction_bias": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.gate.weight": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_b_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_b_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.kv_b_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.o_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.o_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_a_layernorm.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_a_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_a_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_a_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_b_proj.biases": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_b_proj.scales": "model-00036-of-00050.safetensors", + "model.layers.44.self_attn.q_b_proj.weight": "model-00036-of-00050.safetensors", + "model.layers.45.input_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.gate.e_score_correction_bias": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.gate.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_b_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_b_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.kv_b_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.o_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.o_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_a_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_a_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_a_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_a_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_b_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_b_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.45.self_attn.q_b_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.46.input_layernorm.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.gate.e_score_correction_bias": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.gate.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00038-of-00050.safetensors", + "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_b_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_b_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.kv_b_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.o_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.o_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_a_layernorm.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_a_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_a_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_a_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_b_proj.biases": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_b_proj.scales": "model-00037-of-00050.safetensors", + "model.layers.46.self_attn.q_b_proj.weight": "model-00037-of-00050.safetensors", + "model.layers.47.input_layernorm.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.gate.e_score_correction_bias": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.gate.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00039-of-00050.safetensors", + "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_b_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_b_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.kv_b_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.o_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.o_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_a_layernorm.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_a_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_a_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_a_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_b_proj.biases": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_b_proj.scales": "model-00038-of-00050.safetensors", + "model.layers.47.self_attn.q_b_proj.weight": "model-00038-of-00050.safetensors", + "model.layers.48.input_layernorm.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.gate.e_score_correction_bias": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.gate.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00040-of-00050.safetensors", + "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_b_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_b_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.kv_b_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.o_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.o_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_a_layernorm.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_a_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_a_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_a_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_b_proj.biases": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_b_proj.scales": "model-00039-of-00050.safetensors", + "model.layers.48.self_attn.q_b_proj.weight": "model-00039-of-00050.safetensors", + "model.layers.49.input_layernorm.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.gate.e_score_correction_bias": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.gate.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00041-of-00050.safetensors", + "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_b_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_b_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.kv_b_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.o_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.o_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_a_layernorm.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_a_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_a_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_a_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_b_proj.biases": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_b_proj.scales": "model-00040-of-00050.safetensors", + "model.layers.49.self_attn.q_b_proj.weight": "model-00040-of-00050.safetensors", + "model.layers.5.input_layernorm.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.gate.e_score_correction_bias": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.gate.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00050.safetensors", + "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_b_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_b_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.kv_b_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.o_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.o_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_a_layernorm.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_a_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_a_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_a_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_b_proj.biases": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_b_proj.scales": "model-00002-of-00050.safetensors", + "model.layers.5.self_attn.q_b_proj.weight": "model-00002-of-00050.safetensors", + "model.layers.50.input_layernorm.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.gate.e_score_correction_bias": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.gate.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00042-of-00050.safetensors", + "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_b_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_b_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.kv_b_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.o_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.o_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_a_layernorm.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_a_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_a_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_a_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_b_proj.biases": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_b_proj.scales": "model-00041-of-00050.safetensors", + "model.layers.50.self_attn.q_b_proj.weight": "model-00041-of-00050.safetensors", + "model.layers.51.input_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.gate.e_score_correction_bias": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.gate.weight": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_b_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_b_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.kv_b_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.o_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.o_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_a_layernorm.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_a_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_a_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_a_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_b_proj.biases": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_b_proj.scales": "model-00042-of-00050.safetensors", + "model.layers.51.self_attn.q_b_proj.weight": "model-00042-of-00050.safetensors", + "model.layers.52.input_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.gate.e_score_correction_bias": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.gate.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_b_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_b_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.kv_b_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.o_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.o_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_a_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_a_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_a_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_a_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_b_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_b_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.52.self_attn.q_b_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.53.input_layernorm.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.gate.e_score_correction_bias": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.gate.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00044-of-00050.safetensors", + "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_b_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_b_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.kv_b_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.o_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.o_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_a_layernorm.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_a_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_a_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_a_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_b_proj.biases": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_b_proj.scales": "model-00043-of-00050.safetensors", + "model.layers.53.self_attn.q_b_proj.weight": "model-00043-of-00050.safetensors", + "model.layers.54.input_layernorm.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.gate.e_score_correction_bias": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.gate.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00045-of-00050.safetensors", + "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_b_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_b_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.kv_b_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.o_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.o_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_a_layernorm.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_a_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_a_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_a_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_b_proj.biases": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_b_proj.scales": "model-00044-of-00050.safetensors", + "model.layers.54.self_attn.q_b_proj.weight": "model-00044-of-00050.safetensors", + "model.layers.55.input_layernorm.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.gate.e_score_correction_bias": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.gate.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00046-of-00050.safetensors", + "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_b_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_b_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.kv_b_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.o_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.o_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_a_layernorm.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_a_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_a_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_a_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_b_proj.biases": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_b_proj.scales": "model-00045-of-00050.safetensors", + "model.layers.55.self_attn.q_b_proj.weight": "model-00045-of-00050.safetensors", + "model.layers.56.input_layernorm.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.gate.e_score_correction_bias": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.gate.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00047-of-00050.safetensors", + "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_b_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_b_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.kv_b_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.o_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.o_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_a_layernorm.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_a_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_a_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_a_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_b_proj.biases": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_b_proj.scales": "model-00046-of-00050.safetensors", + "model.layers.56.self_attn.q_b_proj.weight": "model-00046-of-00050.safetensors", + "model.layers.57.input_layernorm.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.gate.e_score_correction_bias": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.gate.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00048-of-00050.safetensors", + "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_b_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_b_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.kv_b_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.o_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.o_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_a_layernorm.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_a_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_a_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_a_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_b_proj.biases": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_b_proj.scales": "model-00047-of-00050.safetensors", + "model.layers.57.self_attn.q_b_proj.weight": "model-00047-of-00050.safetensors", + "model.layers.58.input_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.gate.e_score_correction_bias": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.gate.weight": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_b_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_b_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.kv_b_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.o_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.o_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_a_layernorm.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_a_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_a_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_a_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_b_proj.biases": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_b_proj.scales": "model-00048-of-00050.safetensors", + "model.layers.58.self_attn.q_b_proj.weight": "model-00048-of-00050.safetensors", + "model.layers.59.input_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.gate.e_score_correction_bias": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.gate.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_b_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_b_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.kv_b_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.o_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.o_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_a_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_a_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_a_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_a_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_b_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_b_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.59.self_attn.q_b_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.6.input_layernorm.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.gate.e_score_correction_bias": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.gate.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00050.safetensors", + "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_b_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_b_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.kv_b_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.o_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.o_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_a_layernorm.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_a_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_a_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_a_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_b_proj.biases": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_b_proj.scales": "model-00003-of-00050.safetensors", + "model.layers.6.self_attn.q_b_proj.weight": "model-00003-of-00050.safetensors", + "model.layers.60.input_layernorm.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.gate.e_score_correction_bias": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.gate.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00050-of-00050.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00050-of-00050.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00050-of-00050.safetensors", + "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_b_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_b_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.kv_b_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.o_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.o_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_a_layernorm.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_a_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_a_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_a_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_b_proj.biases": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_b_proj.scales": "model-00049-of-00050.safetensors", + "model.layers.60.self_attn.q_b_proj.weight": "model-00049-of-00050.safetensors", + "model.layers.7.input_layernorm.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.gate.e_score_correction_bias": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.gate.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00005-of-00050.safetensors", + "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_b_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_b_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.kv_b_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.o_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.o_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_a_layernorm.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_a_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_a_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_a_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_b_proj.biases": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_b_proj.scales": "model-00004-of-00050.safetensors", + "model.layers.7.self_attn.q_b_proj.weight": "model-00004-of-00050.safetensors", + "model.layers.8.input_layernorm.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.gate.e_score_correction_bias": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.gate.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00006-of-00050.safetensors", + "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_b_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_b_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.kv_b_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.o_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.o_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_a_layernorm.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_a_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_a_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_a_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_b_proj.biases": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_b_proj.scales": "model-00005-of-00050.safetensors", + "model.layers.8.self_attn.q_b_proj.weight": "model-00005-of-00050.safetensors", + "model.layers.9.input_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.gate.e_score_correction_bias": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.gate.weight": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00007-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00007-of-00050.safetensors", + "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_b_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_b_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.kv_b_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.o_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.o_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_a_layernorm.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_a_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_a_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_a_proj.weight": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_b_proj.biases": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_b_proj.scales": "model-00006-of-00050.safetensors", + "model.layers.9.self_attn.q_b_proj.weight": "model-00006-of-00050.safetensors", + "model.norm.weight": "model-00050-of-00050.safetensors" } } \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json index 7877c38b426e08ee8eee20c1557b72d9c22d7626..35ffce03d78608ce68667173a13e84b175a2b2d6 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -6549,9 +6549,10 @@ } }, "bos_token": "<|begin▁of▁sentence|>", - "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", "clean_up_tokenization_spaces": false, "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, "legacy": true, "model_max_length": 16384, "pad_token": "<|end▁of▁sentence|>",