Update README.md
README.md
CHANGED
```diff
@@ -152,6 +152,7 @@ quantize_(
     model,
     quant_config,
 )
+tasks = ["mmlu_pro"]
 TransformerEvalWrapper(
     model=model,
     tokenizer=tokenizer,
```
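The hunk above wires the new `tasks = ["mmlu_pro"]` list into the eval call in the README's quantization example. For context, a sketch of how that call typically reads in full; the import path, the `max_seq_length` value, and the `run_eval(tasks=..., limit=...)` method are assumptions about torchao's eval helper (they may differ by torchao version), and `model` / `tokenizer` come from the earlier part of the README snippet, so treat this as illustrative rather than the exact API:

```python
# Sketch only: assumed torchao eval-helper usage around the diff above.
# The import path and run_eval signature are assumptions; check your torchao version.
from torchao._models._eval import TransformerEvalWrapper  # assumed location

tasks = ["mmlu_pro"]   # the list added in this commit
max_seq_length = 2048  # illustrative value, not from the README

TransformerEvalWrapper(
    model=model,       # the model quantized by quantize_(...) above
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
).run_eval(            # assumed method name
    tasks=tasks,
    limit=None,        # evaluate the full task
)
```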
```diff
@@ -212,10 +213,12 @@ and use a token with write access, from https://huggingface.co/settings/tokens
 # Model Quality
 We rely on [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate the quality of the quantized model. Here we only run mmlu as a sanity check.
 
-
-
-
-
+Since the checkpoint is tuned on `mmlu_pro`, we check the accuracy on `mmlu_pro`:
+
+| Benchmark |                               |                                  |                                      |
+|-----------|-------------------------------|----------------------------------|--------------------------------------|
+|           | microsoft/Phi-4-mini-instruct | pytorch/Phi-4-mini-instruct-INT4 | pytorch/Phi-4-mini-instruct-AWQ-INT4 |
+| mmlu_pro  | 46.43                         | 36.74                            |                                      |
 
 
 <details>
```
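The new `mmlu_pro` row can also be spot-checked independently of the script above by driving lm-evaluation-harness from Python. A minimal sketch, assuming the `lm_eval` package (0.4+) with its `HFLM` wrapper and `simple_evaluate` entry point and a single CUDA GPU; the batch size and dtype are illustrative:

```python
# Hypothetical sketch: score the already-quantized checkpoint on mmlu_pro
# with lm-evaluation-harness (`pip install lm-eval`).
import lm_eval
from lm_eval.models.huggingface import HFLM

# Wrap the HF checkpoint discussed in this README diff.
lm = HFLM(
    pretrained="pytorch/Phi-4-mini-instruct-AWQ-INT4",
    dtype="bfloat16",
    batch_size=8,
)

# Run only mmlu_pro, mirroring `tasks = ["mmlu_pro"]` in the diff above.
results = lm_eval.simple_evaluate(model=lm, tasks=["mmlu_pro"])
print(results["results"]["mmlu_pro"])
```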
```diff
@@ -245,8 +248,8 @@ lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 -
 
 | Benchmark        |                |                                |
 |------------------|----------------|--------------------------------|
-|                  | microsoft/Phi-4-mini-instruct |
-| Peak Memory (GB) |
+|                  | microsoft/Phi-4-mini-instruct | pytorch/Phi-4-mini-instruct-AWQ-INT4 |
+| Peak Memory (GB) | 8.91                          | 3.95 (55.67% reduction)              |
 
 
 
```
```diff
@@ -259,7 +262,7 @@ We can use the following code to get a sense of peak memory usage during inference
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
-# use "microsoft/Phi-4-mini-instruct" or "
+# use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 model_id = "jerryzh168/Phi-4-mini-instruct-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
```
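The `Peak Memory (GB)` column two hunks up is obtained by running one generation and reading back CUDA's peak-memory counters, which is presumably what the `print(f"Peak Memory Usage: {mem:.02f} GB")` line in the next hunk's context prints. A minimal, self-contained sketch of that kind of measurement; the prompt text and `max_new_tokens` are placeholders:

```python
# Hypothetical sketch of a peak-memory measurement around generate().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"  # or "microsoft/Phi-4-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()  # start the high-water mark from zero

inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to(model.device)
with torch.no_grad():
    model.generate(**inputs, max_new_tokens=128)

# max_memory_reserved() reports the peak of the CUDA caching allocator.
mem = torch.cuda.max_memory_reserved() / 1e9
print(f"Peak Memory Usage: {mem:.02f} GB")
```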
```diff
@@ -305,8 +308,13 @@ print(f"Peak Memory Usage: {mem:.02f} GB")
 ## Results (A100 machine)
 | Benchmark (Latency)      |                |                          |
 |--------------------------|----------------|--------------------------|
-|                          | microsoft/Phi-4-mini-instruct |
-| latency (batch_size=1)   |
+|                          | microsoft/Phi-4-mini-instruct | pytorch/Phi-4-mini-instruct-AWQ-INT4 |
+| latency (batch_size=1)   | 1.60s                         | 1.37s (1.17x speedup)                |
+| latency (batch_size=256) | 5.47s                         | 5.55s (0.98x speedup)                |
+
+
+Note: the AWQ-INT4 checkpoint is expected to be slower at batch size 256, since at larger batch sizes the workload becomes compute bound rather than memory bound,
+and an INT4 weight-only checkpoint is only expected to give a speedup in memory-bound situations.
 
 <details>
 <summary> Reproduce Model Performance Results </summary>
```
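The latency rows and the batch-size note above come from the reproduction steps kept in the README's collapsed `<details>` section; the snippet below is only a rough, self-contained way to observe the batch-size-1 vs batch-size-256 behaviour yourself (the prompt, token count, and batch sizes are illustrative, and it will not reproduce the table's exact numbers):

```python
# Hypothetical sketch: rough latency check at batch size 1 vs 256.
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"  # or "microsoft/Phi-4-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # needed for batched padding

def timed_generate(batch_size: int, max_new_tokens: int = 128) -> float:
    prompts = ["What are we having for dinner?"] * batch_size
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
    torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    torch.cuda.synchronize()
    return time.perf_counter() - start

timed_generate(1)  # warmup (kernel compilation, cache allocation)
for bs in (1, 256):
    print(f"batch_size={bs}: {timed_generate(bs):.2f}s")
```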