import fev
import gradio as gr
import pandas as pd

markdown_text = """
This space hosts evaluation results for time series forecasting models.

Benchmark definitions, model implementations, and evaluation results for individual tasks are available at https://github.com/autogluon/fev.

The results currently in this space are a minimal proof of concept. Stay tuned for more benchmarks, results for new models, and instructions on how to contribute your own results.
"""

# All per-model summaries live in the same results directory of the fev
# repository, so the URLs can be built from the model names alone.
RESULTS_ROOT = "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results"
model_names = [
    "auto_arima",
    "auto_ets",
    "auto_theta",
    "chronos_base",
    "chronos_large",
    "chronos_mini",
    "chronos_small",
    "chronos_tiny",
    "chronos_bolt_base",
    "chronos_bolt_mini",
    "chronos_bolt_small",
    "chronos_bolt_tiny",
    "moirai_base",
    "moirai_large",
    "moirai_small",
    "seasonal_naive",
    "timesfm",
]
summary_urls = [f"{RESULTS_ROOT}/{name}.csv" for name in model_names]
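
# To see what a raw summary contains before aggregation, one can preview a
# single file directly (a quick sanity check, not needed by the app itself):
# pd.read_csv(summary_urls[0]).head()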

selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
rename_cols = {
    "gmean_relative_error": "Average relative error",
    "avg_rank": "Average rank",
    "median_inference_time_s": "Median inference time (s)",
}
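
# All three displayed columns are "lower is better": gmean_relative_error is,
# as the name suggests, the geometric mean of each model's error relative to a
# baseline across tasks (the seasonal naive results are the natural reference
# here), and avg_rank is the model's average rank per task.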


def make_leaderboard(metric_column: str) -> pd.DataFrame:
    """Aggregate the per-task summaries into a single leaderboard table."""
    return (
        fev.leaderboard(summary_urls, metric_column=metric_column)[selected_cols]
        .rename(columns=rename_cols)
        .round(3)
        .reset_index()
    )


lb_mase = make_leaderboard("MASE")
lb_wql = make_leaderboard("WQL")
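
# For intuition, a minimal sketch of the two metrics shown below, assuming a
# single series and illustrative quantile levels; this is not the exact
# aggregation that fev performs internally.
import numpy as np


def mase_sketch(y_true: np.ndarray, y_pred: np.ndarray, y_past: np.ndarray, season: int = 1) -> float:
    """Mean absolute scaled error: forecast MAE scaled by the in-sample MAE of
    the seasonal naive forecast, so values below 1 beat that baseline."""
    scale = np.mean(np.abs(y_past[season:] - y_past[:-season]))
    return float(np.mean(np.abs(y_true - y_pred)) / scale)


def wql_sketch(y_true: np.ndarray, quantile_preds: dict[float, np.ndarray]) -> float:
    """Weighted quantile loss: pinball loss summed over quantile levels and
    time steps, normalized by the total absolute value of the target."""
    loss = sum(
        np.sum(2 * np.maximum(q * (y_true - pred), (q - 1) * (y_true - pred)))
        for q, pred in quantile_preds.items()
    )
    return float(loss / (len(quantile_preds) * np.sum(np.abs(y_true))))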


with gr.Blocks() as demo:
    with gr.Tab("Leaderboard"):
        gr.Markdown("""
                    ## Chronos zero-shot benchmark results

                    This tab contains results for various forecasting models on the 28 datasets used in Benchmark II (zero-shot evaluation) of the paper [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).

                    Task definitions and detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
                    """)
        gr.Markdown("""### Point forecast accuracy (measured by MASE)
                    """)
        gr.Dataframe(
            value=lb_mase,
            interactive=False,
        )

        gr.Markdown("### Probabilistic forecast accuracy (measured by WQL)")
        gr.Dataframe(
            value=lb_wql,
            interactive=False,
        )

    with gr.Tab("About"):
        gr.Markdown(markdown_text)
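
# Standard entrypoint guard: a Hugging Face Space runs this file as the main
# module, and `python app.py` works locally too (Gradio serves on
# http://localhost:7860 by default).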

if __name__ == "__main__":
    demo.launch()