Update README.md
README.md
CHANGED
@@ -28,14 +28,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```
 
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-  "model": "
+  "model": "pytorch/Phi-4-mini-instruct-AWQ-INT4",
   "messages": [
     {"role": "user", "content": "Give me a short introduction to large language models."}
   ],
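For reference, the same request can be issued from Python through vLLM's OpenAI-compatible endpoint. A minimal sketch, assuming the server above is running on the default `localhost:8000` and the `openai` package is installed (the `api_key` value is a placeholder; vLLM ignores it unless authentication is configured):

```Python
# Minimal sketch: query the vLLM OpenAI-compatible server started above.
# Assumes: server at http://localhost:8000 and `pip install openai`.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # key is unused by default

response = client.chat.completions.create(
    model="pytorch/Phi-4-mini-instruct-AWQ-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
)
print(response.choices[0].message.content)
```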
@@ -64,7 +64,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_name = "
+model_name = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
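The hunk above ends at the tokenizer load; the rest of the example is not shown in this diff. A minimal generation sketch following the usual transformers pattern (the prompt and generation settings are illustrative, not taken from the README):

```Python
# Illustrative continuation of the snippet above (not part of the diff).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)

# Build a chat prompt with the model's chat template and generate a reply.
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(inputs, max_new_tokens=256)
print(tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True))
```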
@@ -234,7 +234,7 @@ lm_eval --model hf --model_args pretrained=microsoft/Phi-4-mini-instruct --tasks
 
 ## AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
@@ -263,7 +263,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
 # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-AWQ-INT4"
-model_id = "
+model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
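A quick way to confirm the AWQ-INT4 checkpoint actually shrinks the weights is to compare memory footprints of the baseline and quantized models. A sketch, not part of the README, assuming both checkpoints fit on the machine:

```Python
# Illustrative sanity check: compare the loaded memory footprint of the
# baseline vs. the AWQ-INT4 checkpoint (assumption: both fit on this machine).
import torch
from transformers import AutoModelForCausalLM

for model_id in ("microsoft/Phi-4-mini-instruct", "pytorch/Phi-4-mini-instruct-AWQ-INT4"):
    m = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    # get_memory_footprint() sums parameter and buffer sizes in bytes.
    print(f"{model_id}: {m.get_memory_footprint() / 1e9:.2f} GB")
    del m
    torch.cuda.empty_cache()
```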
@@ -343,7 +343,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
 
 ### AWQ-INT4
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 
@@ -379,13 +379,13 @@ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --
 ### AWQ-INT4
 Server:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
 ```
 
 Client:
 ```Shell
-export MODEL=
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
 ```
 </details>
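The serving benchmark expects `ShareGPT_V3_unfiltered_cleaned_split.json` in the working directory. One way to fetch it from Python; the URL below is an assumption (the Hugging Face dataset mirror commonly referenced by the vLLM benchmarks) and worth verifying before use:

```Python
# Fetch the ShareGPT dataset file used by benchmark_serving.py.
# The URL is an assumption (commonly used mirror); verify before relying on it.
import urllib.request

url = (
    "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/"
    "resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
)
urllib.request.urlretrieve(url, "ShareGPT_V3_unfiltered_cleaned_split.json")
```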