#!/usr/bin/env bash
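#
# Request-rate scaling sweep for meta-llama/Llama-2-13b-chat-hf served with vLLM
# (mlenergy/vllm:v0.3.0-openai) on the ShareGPT_V3_filtered_500 workload.
# For each GPU power limit ($PL), `run` benchmarks tensor-parallel serving on
# 4 GPUs, 2 GPUs, and a single GPU across a range of request rates; results are
# written under results/2024-02-19-scaling.
#
# HF_TOKEN must be exported in the environment before running this script.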

run() {
  # TP 4 GPUs
  for rate in 5.00 4.50 4.00 3.50 3.00 2.50 2.00 1.50 1.00 0.75 0.50 0.25; do
    python scripts/benchmark_one.py --backend vllm --server-image mlenergy/vllm:v0.3.0-openai --model meta-llama/Llama-2-13b-chat-hf --sharegpt-path ../../../sharegpt/ShareGPT_V3_filtered_500.json --request-rate $rate --power-limit $PL --result-root results/2024-02-19-scaling --huggingface-token $HF_TOKEN --gpu-ids 0 1 2 3
  done

  # TP 2 GPUs
  for rate in 2.50 2.25 2.00 1.75 1.50 1.25 1.00 0.75 0.50 0.25; do
    python scripts/benchmark_one.py --backend vllm --server-image mlenergy/vllm:v0.3.0-openai --model meta-llama/Llama-2-13b-chat-hf --sharegpt-path ../../../sharegpt/ShareGPT_V3_filtered_500.json --request-rate $rate --power-limit $PL --result-root results/2024-02-19-scaling --huggingface-token $HF_TOKEN --gpu-ids 0 1
  done

  # 1 GPU
  for rate in 1.25 1.125 1.00 0.875 0.75 0.625 0.50 0.375 0.25 0.125; do
    python scripts/benchmark_one.py --backend vllm --server-image mlenergy/vllm:v0.3.0-openai --model meta-llama/Llama-2-13b-chat-hf --sharegpt-path ../../../sharegpt/ShareGPT_V3_filtered_500.json --request-rate $rate --power-limit $PL --result-root results/2024-02-19-scaling --huggingface-token $HF_TOKEN --gpu-ids 0
  done
}

# Warmup: run the highest-rate TP-4 configuration for up to 120 seconds (terminated with SIGINT) before the measured sweeps.
timeout --signal SIGINT 120 python scripts/benchmark_one.py --backend vllm --server-image mlenergy/vllm:v0.3.0-openai --model meta-llama/Llama-2-13b-chat-hf --sharegpt-path ../../../sharegpt/ShareGPT_V3_filtered_500.json --request-rate 5.00 --power-limit 300 --result-root results/2024-02-19-scaling --huggingface-token $HF_TOKEN --gpu-ids 0 1 2 3
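
# The PL=300 through PL=200 sweeps below are commented out, presumably already
# completed in an earlier run; only PL=175 down to PL=100 are executed here.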

# PL=300
# run
#
# PL=275
# run
#
# PL=250
# run
#
# PL=225
# run
#
# PL=200
# run

PL=175
run

PL=150
run

PL=125
run

PL=100
run