|
---
# Model-card metadata: associated datasets, the metric kinds reported, the
# base checkpoint, and per-benchmark evaluation results under `model-index`.
datasets:
  - mindchain/wikitext2
  - yahma/alpaca-cleaned
metrics:
  - perplexity
  - accuracy
base_model:
  - TinyLlama/TinyLlama_v1.1

model-index:
  - name: TinyLlama_v1.1_mix_wikitext_alpaca_2bit_BitDistiller_baseline
    results:
      # ARC-Challenge: 0.2150170648464164 * 1172 test examples = 252 correct,
      # so the recorded `test` split is consistent with the score.
      - task:
          type: multiple-choice
          name: QA Benchmarking
        dataset:
          type: allenai/arc
          name: ARC-Challenge
          config: challenge
          split: test
        metrics:
          - type: accuracy
            name: Accuracy
            value: 0.2150170648464164
          - type: accuracy
            name: Normalized Accuracy
            value: 0.24573378839590443

      # HellaSwag: both scores are whole-number counts only over the
      # 10042-example validation set (3254 and 3749 correct), not over the
      # 10003-example test set, so the split is recorded as `validation`.
      - task:
          type: multiple-choice
          name: QA Benchmarking
        dataset:
          type: hellaswag
          name: HellaSwag
          split: validation
        metrics:
          - type: accuracy
            name: Accuracy
            value: 0.3240390360485959
          - type: accuracy
            name: Normalized Accuracy
            value: 0.37333200557657836

      # PIQA: 0.6082698585418934 * 1838 validation examples = 1118 correct.
      - task:
          type: multiple-choice
          name: QA Benchmarking
        dataset:
          type: piqa
          name: PIQA
          split: validation
        metrics:
          - type: accuracy
            name: Accuracy
            value: 0.6082698585418934
          - type: accuracy
            name: Normalized Accuracy
            value: 0.6071817192600653

      # Winogrande: 0.5201262825572218 * 1267 validation examples = 659
      # correct; the value is not a whole-number count over the 1767-example
      # test set, so the split is recorded as `validation`.
      - task:
          type: multiple-choice
          name: QA Benchmarking
        dataset:
          type: winogrande
          name: Winogrande
          split: validation
        metrics:
          - type: accuracy
            name: Accuracy
            value: 0.5201262825572218

      # QA-Avg is the unweighted mean of the four raw (non-normalized)
      # accuracies above; `aggregated` is a placeholder, not a dataset ID.
      - task:
          type: multiple-choice
          name: QA Benchmarking
        dataset:
          type: aggregated
          name: QA-Avg
        metrics:
          - type: accuracy
            name: QA Average
            value: 0.4168630604985319

      # NOTE(review): `language-modeling` may not be a recognized task ID for
      # the consuming tool (e.g. `text-generation`) - confirm before changing.
      - task:
          type: language-modeling
          name: Language Modeling
        dataset:
          type: wikitext
          name: WikiText-2
          split: test
        metrics:
          - type: perplexity
            name: Perplexity
            value: 22.655162811279297
---
|
|
|
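TODO: verify the evaluation split recorded for each benchmark in the metadata above. HellaSwag and Winogrande are normally scored on their validation splits because their test labels are not public, so confirm that the recorded split matches the one actually used.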
|
|
|
Loss curves: |
|
|
|
 |
|
|