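"""Gradio app that displays fev evaluation results for time series forecasting models."""
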
import fev
import gradio as gr
import pandas as pd

markdown_text = """
This space hosts evaluation results for time series forecasting models.

Benchmark definitions, implementations of models, as well as the evaluation results for individual tasks are available under https://github.com/autogluon/fev.

Currently, the results in this space are a minimal proof of concept. Stay tuned for more benchmarks, results for new models and instructions on how to contribute your results.
"""

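# Per-model result summary files published in the fev repository;
# fev.leaderboard aggregates them into a single table.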
summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]

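# Columns to keep from the leaderboard output, and their human-readable display names.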
selected_cols = ["gmean_relative_error", "avg_rank", "median_inference_time_s"]
rename_cols = {
    "gmean_relative_error": "Average relative error",
    "avg_rank": "Average rank",
    "median_inference_time_s": "Median inference time (s)",
}

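# MASE measures point forecast accuracy; WQL (weighted quantile loss) measures
# probabilistic forecast accuracy. Both metrics are reported in the Chronos paper.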
def make_leaderboard(metric_column: str) -> pd.DataFrame:
    """Aggregate the per-model summaries into a display-ready table for the given metric."""
    return (
        fev.leaderboard(summary_urls, metric_column=metric_column)[selected_cols]
        .rename(columns=rename_cols)
        .round(3)
        .reset_index()
        .astype(str)
    )


lb_mase = make_leaderboard("MASE")
lb_wql = make_leaderboard("WQL")


with gr.Blocks() as demo:
    with gr.Tab("Leaderboard"):
        gr.Markdown("""
                    ## Chronos zero-shot benchmark results

                    This tab shows results for various forecasting models on the 28 datasets used in Benchmark II (zero-shot evaluation) of the paper [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).

                    Task definitions and detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot).
                    """)
        gr.Markdown("""### Point forecast accuracy (measured by MASE)
                    """)
        gr.Dataframe(
            value=lb_mase,
            interactive=False,
        )

        gr.Markdown("### Probabilistic forecast accuracy (measured by WQL)")
        gr.Dataframe(
            value=lb_wql,
            interactive=False,
        )

    with gr.Tab("About"):
        gr.Markdown(markdown_text)

if __name__ == "__main__":
    demo.launch()
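
# Usage sketch (assuming this file is saved as app.py and the dependencies are
# installed, e.g. `pip install fev gradio pandas`):
#
#   python app.py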