nm-testing
/

open-llama-3b-v2-instruct-pruned50-quant-ds

Text Generation

Model card | Files and versions

mwitiderrick committed on Jan 10, 2024

Commit

320ea85

·

1 Parent(s): 36caaf4

Update recipe.yaml

Files changed (1) hide show

recipe.yaml +28 -13

recipe.yaml CHANGED Viewed

@@ -4,23 +4,38 @@ test_stage:
       mappings: [
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
         [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
-      ]
     QuantizationModifier:
       ignore:
-      # These operations don't make sense to quantize
-      - LlamaRotaryEmbedding
-      - LlamaRMSNorm
-      - SiLUActivation
-      - MatMulOutput_QK
-      - MatMulOutput_PV
-      # Skip quantizing the layers with the most sensitive activations
-      - model.layers.6.mlp.down_proj
-      - model.layers.24.mlp.down_proj
-      - model.layers.25.mlp.down_proj
-      - model.layers.23.mlp.down_proj
-      - model.layers.2.mlp.down_proj
       post_oneshot_calibration: true
       scheme_overrides:
         Embedding:
           input_activations: null
           weights:

       mappings: [
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
         [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
+      ]
     QuantizationModifier:
       ignore:
+        # These operations don't make sense to quantize
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLUActivation
+        - MatMulOutput_QK
+        - MatMulOutput_PV
+        # Skip quantizing the layers with the most sensitive activations
+        - model.layers.6.mlp.down_proj
+        - model.layers.24.mlp.down_proj
+        - model.layers.25.mlp.down_proj
+        - model.layers.23.mlp.down_proj
+        - model.layers.2.mlp.down_proj
       post_oneshot_calibration: true
       scheme_overrides:
+        # Enable channelwise quantization for better accuracy
+        Linear:
+          weights:
+            num_bits: 8
+            symmetric: true
+            strategy: channel
+        MatMulLeftInput_QK:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        MatMulLeftInput_PV:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        # For the embeddings, only weight-quantization makes sense
         Embedding:
           input_activations: null
           weights: