jerryzh168 committed
Commit 979c057 · verified · 1 Parent(s): ab54ce3

Update README.md

Files changed (1)
  1. README.md +8 -8
README.md CHANGED
@@ -28,14 +28,14 @@ pip install torchao
 Then we can serve with the following command:
 ```Shell
 # Server
-export MODEL=jerryzh168/Phi-4-mini-instruct-AWQ-INT4
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3
 ```
 
 ```Shell
 # Client
 curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
-  "model": "jerryzh168/Phi-4-mini-instruct-AWQ-INT4",
+  "model": "pytorch/Phi-4-mini-instruct-AWQ-INT4",
   "messages": [
     {"role": "user", "content": "Give me a short introduction to large language models."}
   ],
@@ -64,7 +64,7 @@ Example:
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_name = "jerryzh168/Phi-4-mini-instruct-AWQ-INT4"
+model_name = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 
 # load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -234,7 +234,7 @@ lm_eval --model hf --model_args pretrained=microsoft/Phi-4-mini-instruct --tasks
 
 ## AWQ-INT4
 ```Shell
-export MODEL=jerryzh168/Phi-4-mini-instruct-AWQ-INT4
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 lm_eval --model hf --model_args pretrained=$MODEL --tasks mmlu --device cuda:0 --batch_size 8
 ```
 </details>
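Aside on the hunk above: the same MMLU run can also be driven from Python rather than the CLI. Below is a minimal sketch using lm-eval's `simple_evaluate` helper; the function name and arguments are assumptions based on lm-eval 0.4.x, not something taken from this README.

```Py
import lm_eval

# Assumed: the renamed checkpoint from this commit and lm-eval's 0.4.x Python API.
model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"

results = lm_eval.simple_evaluate(
    model="hf",
    model_args=f"pretrained={model_id}",
    tasks=["mmlu"],
    batch_size=8,
    device="cuda:0",
)

# Per-task accuracy numbers, mirroring the output of `lm_eval ... --tasks mmlu` above.
print(results["results"])
```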
@@ -263,7 +263,7 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
 
 # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-AWQ-INT4"
-model_id = "jerryzh168/Phi-4-mini-instruct-AWQ-INT4"
+model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -343,7 +343,7 @@ python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model
 
 ### AWQ-INT4
 ```Shell
-export MODEL=jerryzh168/Phi-4-mini-instruct-AWQ-INT4
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len 256 --output-len 256 --model $MODEL --batch-size 1
 ```
 
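Aside on the latency hunk above: for a rough single-prompt sanity check outside vLLM, a plain transformers generation call can be timed directly. This is only a sketch with an assumed prompt and token budget, not the `benchmark_latency.py` methodology used above.

```Py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed: the renamed checkpoint from this commit, running on a single CUDA device.
model_id = "pytorch/Phi-4-mini-instruct-AWQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0", torch_dtype=torch.bfloat16)

inputs = tokenizer("Give me a short introduction to large language models.", return_tensors="pt").to(model.device)

# Warm-up pass so CUDA kernels and caches are initialized before timing.
model.generate(**inputs, max_new_tokens=8)

torch.cuda.synchronize()
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=256)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

new_tokens = output.shape[1] - inputs["input_ids"].shape[1]
print(f"{new_tokens} tokens in {elapsed:.2f}s ({new_tokens / elapsed:.1f} tok/s)")
```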
@@ -379,13 +379,13 @@ python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --
 ### AWQ-INT4
 Server:
 ```Shell
-export MODEL=jerryzh168/Phi-4-mini-instruct-AWQ-INT4
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 VLLM_DISABLE_COMPILE_CACHE=1 vllm serve $MODEL --tokenizer $MODEL -O3 --pt-load-map-location cuda:0
 ```
 
 Client:
 ```Shell
-export MODEL=jerryzh168/Phi-4-mini-instruct-AWQ-INT4
+export MODEL=pytorch/Phi-4-mini-instruct-AWQ-INT4
 python benchmarks/benchmark_serving.py --backend vllm --dataset-name sharegpt --tokenizer $MODEL --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --model $MODEL --num-prompts 1
 ```
 </details>
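Aside on the serving hunks above: once the server is up, any OpenAI-compatible client can stand in for the `curl` example earlier in the diff. Below is a minimal sketch with the `openai` Python package; the `base_url` and placeholder `api_key` values are assumptions for a default local vLLM deployment.

```Py
from openai import OpenAI

# vLLM exposes an OpenAI-compatible API; the key is unused for a local server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="pytorch/Phi-4-mini-instruct-AWQ-INT4",
    messages=[
        {"role": "user", "content": "Give me a short introduction to large language models."}
    ],
    max_tokens=128,
)
print(response.choices[0].message.content)
```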
 