vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.720 | ± 0.0285 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.676 | ± 0.0297 |

vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.724 | ± 0.0200 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.684 | ± 0.0208 |

vllm (pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6480 | ± 0.0158 |
| - humanities      | 2 | none | | acc ↑ | 0.6615 | ± 0.0328 |
| - other           | 2 | none | | acc ↑ | 0.6667 | ± 0.0328 |
| - social sciences | 2 | none | | acc ↑ | 0.7167 | ± 0.0334 |
| - stem            | 2 | none | | acc ↑ | 0.5825 | ± 0.0284 |
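
The configuration strings above are those printed by lm-evaluation-harness. Below is a minimal sketch of the commands that would reproduce this pattern of runs, assuming `lm_eval` is installed with vLLM support; the exact harness version and any additional flags used for these results are not recorded here, so treat this as an approximation rather than the verbatim invocation:

```bash
# GSM8K, 5-shot, limited to the first 250 (or 500) examples
lm_eval --model vllm \
  --model_args pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16 \
  --tasks gsm8k --num_fewshot 5 --limit 250 --batch_size auto

# MMLU, 15 examples per subtask, max_num_seqs=3 to cap concurrent vLLM sequences
lm_eval --model vllm \
  --model_args pretrained=/root/autodl-tmp/reka-flash-3,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3 \
  --tasks mmlu --limit 15 --batch_size 1
```

The same commands apply to the other checkpoints below by swapping the `pretrained=` path.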

vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.700 | ± 0.0290 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.648 | ± 0.0303 |

vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.692 | ± 0.0207 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.648 | ± 0.0214 |

vllm (pretrained=/root/autodl-tmp/84-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6515 | ± 0.0159 |
| - humanities      | 2 | none | | acc ↑ | 0.6718 | ± 0.0325 |
| - other           | 2 | none | | acc ↑ | 0.6718 | ± 0.0328 |
| - social sciences | 2 | none | | acc ↑ | 0.7056 | ± 0.0341 |
| - stem            | 2 | none | | acc ↑ | 0.5895 | ± 0.0286 |

vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.692 | ± 0.0293 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.660 | ± 0.0300 |

vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.724 | ± 0.0200 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.674 | ± 0.0210 |

vllm (pretrained=/root/autodl-tmp/848-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6398 | ± 0.0159 |
| - humanities      | 2 | none | | acc ↑ | 0.6513 | ± 0.0333 |
| - other           | 2 | none | | acc ↑ | 0.6564 | ± 0.0330 |
| - social sciences | 2 | none | | acc ↑ | 0.7222 | ± 0.0333 |
| - stem            | 2 | none | | acc ↑ | 0.5684 | ± 0.0284 |

vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.732 | ± 0.0281 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.696 | ± 0.0292 |

vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.720 | ± 0.0201 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.692 | ± 0.0207 |

vllm (pretrained=/root/autodl-tmp/8485-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6550 | ± 0.0158 |
| - humanities      | 2 | none | | acc ↑ | 0.6872 | ± 0.0323 |
| - other           | 2 | none | | acc ↑ | 0.6769 | ± 0.0327 |
| - social sciences | 2 | none | | acc ↑ | 0.7056 | ± 0.0341 |
| - stem            | 2 | none | | acc ↑ | 0.5860 | ± 0.0284 |

vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.740 | ± 0.0278 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.680 | ± 0.0296 |

vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.714 | ± 0.0202 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.676 | ± 0.0210 |

vllm (pretrained=/root/autodl-tmp/8485-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6433 | ± 0.0160 |
| - humanities      | 2 | none | | acc ↑ | 0.6513 | ± 0.0337 |
| - other           | 2 | none | | acc ↑ | 0.6615 | ± 0.0332 |
| - social sciences | 2 | none | | acc ↑ | 0.7111 | ± 0.0338 |
| - stem            | 2 | none | | acc ↑ | 0.5825 | ± 0.0284 |

vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.696 | ± 0.0292 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.648 | ± 0.0303 |

vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.708 | ± 0.0204 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.660 | ± 0.0212 |

vllm (pretrained=/root/autodl-tmp/85-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6526 | ± 0.0158 |
| - humanities      | 2 | none | | acc ↑ | 0.6615 | ± 0.0331 |
| - other           | 2 | none | | acc ↑ | 0.6769 | ± 0.0325 |
| - social sciences | 2 | none | | acc ↑ | 0.7389 | ± 0.0327 |
| - stem            | 2 | none | | acc ↑ | 0.5754 | ± 0.0287 |

vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.708 | ± 0.0288 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.648 | ± 0.0303 |

vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.720 | ± 0.0201 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.658 | ± 0.0212 |

vllm (pretrained=/root/autodl-tmp/85-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6550 | ± 0.0158 |
| - humanities      | 2 | none | | acc ↑ | 0.6769 | ± 0.0324 |
| - other           | 2 | none | | acc ↑ | 0.6667 | ± 0.0331 |
| - social sciences | 2 | none | | acc ↑ | 0.7278 | ± 0.0329 |
| - stem            | 2 | none | | acc ↑ | 0.5860 | ± 0.0284 |

vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.696 | ± 0.0292 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.636 | ± 0.0305 |

vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.690 | ± 0.0207 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.648 | ± 0.0214 |

vllm (pretrained=/root/autodl-tmp/86-128,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6398 | ± 0.0160 |
| - humanities      | 2 | none | | acc ↑ | 0.6410 | ± 0.0336 |
| - other           | 2 | none | | acc ↑ | 0.6564 | ± 0.0332 |
| - social sciences | 2 | none | | acc ↑ | 0.7278 | ± 0.0333 |
| - stem            | 2 | none | | acc ↑ | 0.5719 | ± 0.0284 |

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.688 | ± 0.0294 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.640 | ± 0.0304 |

vllm (pretrained=/root/autodl-tmp/86-512,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|-------|--------:|--------|-------:|--------|------:|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match ↑ | 0.706 | ± 0.0204 |
|       |   | strict-match     | 5 | exact_match ↑ | 0.660 | ± 0.0212 |

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|--------|--------:|--------|-------:|--------|------:|-------:|
| mmlu              | 2 | none | | acc ↑ | 0.6526 | ± 0.0158 |
| - humanities      | 2 | none | | acc ↑ | 0.6821 | ± 0.0327 |
| - other           | 2 | none | | acc ↑ | 0.6615 | ± 0.0331 |
| - social sciences | 2 | none | | acc ↑ | 0.7278 | ± 0.0329 |
| - stem            | 2 | none | | acc ↑ | 0.5789 | ± 0.0284 |