Jude Khouja committed on
Commit
36ce9ab
·
0 Parent(s):

1st clean draft

Browse files
Files changed (13) hide show
  1. .gitattributes +35 -0
  2. .gitignore +175 -0
  3. README.md +12 -0
  4. app.py +41 -0
  5. assets/OII_logo.png +0 -0
  6. chat.py +1150 -0
  7. data_loader.py +457 -0
  8. leaderboard.csv +12 -0
  9. requirements.txt +4 -0
  10. results.csv +22 -0
  11. tabs/leaderboard.py +160 -0
  12. utils.py +107 -0
  13. visualization.py +247 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ data/
174
+ .DS_Store
175
+ get_results.ipynb
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: LingOly-TOO benchmark
3
+ emoji: 💬
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.0.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: Reasoning benchmark in linguistics
12
+ ---
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Suppress all warnings before the remaining imports are loaded
2
+ import warnings
3
+
4
+ warnings.filterwarnings("ignore")
5
+
6
+ import gradio as gr
7
+ from data_loader import (
8
+ METHODOLOGY,
9
+ load_data,
10
+ HEADER_CONTENT,
11
+ CARDS,
12
+ )
13
+ from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
14
+
15
+
16
def create_app():
    """Build the Gradio Blocks application for the leaderboard.

    Loads the leaderboard data once, creates the leaderboard tab, registers
    a page-load callback that fills the leaderboard output using the
    "Score after obfuscation" option, and appends the methodology HTML.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    df = load_data()

    # "sans-serif" is a generic CSS family, not a Google Fonts family, so it
    # is passed as a plain string instead of gr.themes.GoogleFont(...), which
    # would request a stylesheet for a font that does not exist.
    theme = gr.themes.Soft(font=["sans-serif"])

    with gr.Blocks(theme=theme) as app:
        # Create tabs
        lb_output = create_leaderboard_tab(df, HEADER_CONTENT, CARDS)

        # Initial load: populate the leaderboard output when the page opens.
        app.load(
            fn=lambda: filter_leaderboard(df, "Score after obfuscation"),
            outputs=[lb_output],
        )

        gr.HTML(METHODOLOGY)

    return app
38
+
39
+
40
# Module-level handle so tooling that imports this file can find the app.
demo = create_app()

# Only start the server when executed as a script, not when merely imported.
if __name__ == "__main__":
    demo.launch()
assets/OII_logo.png ADDED
chat.py ADDED
@@ -0,0 +1,1150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import json
2
+
3
+
4
def format_user_message(msg):
    """Render a user message as a right-aligned HTML chat bubble.

    Args:
        msg: Mapping with an optional ``"content"`` entry. The content may be
            ``None``, a string, a number, or a list whose items are strings,
            dicts carrying a ``"text"`` key, or other values (stringified).

    Returns:
        An HTML fragment (str). The message text is HTML-escaped so that
        characters such as ``<`` and ``&`` are shown literally instead of
        being interpreted as markup.
    """
    content = msg.get("content", "")

    if content is None:
        content = ""
    elif isinstance(content, list):
        # List-type content may hold several parts; keep one part per line.
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    parts.append(str(item["text"]))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()

    safe_content = html.escape(str(content))

    # The text sits directly between the tags: the container uses
    # `white-space: pre-wrap`, so template newlines/indentation around the
    # placeholder would be rendered as part of the message.
    return f"""
    <div style="
        text-align: right;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-user);
            padding: 1rem;
            border-radius: 1rem 0 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">👤</span>User
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">{safe_content}</div>
        </div>
    </div>
    """
59
+
60
+
61
def format_tool_call(tool_name, tool_input):
    """Render a single tool call (name plus arguments) as an HTML card.

    Args:
        tool_name: Name of the tool; ``None`` becomes ``"Unknown Tool"`` and
            non-string values are stringified.
        tool_input: Arguments passed to the tool; ``None`` is treated as an
            empty dict. Values that are not JSON-serialisable are shown via
            ``str()``.

    Returns:
        An HTML fragment (str) with the escaped tool name and its arguments
        pretty-printed as JSON where possible.
    """
    if tool_name is None:
        tool_name = "Unknown Tool"
    elif not isinstance(tool_name, str):
        tool_name = str(tool_name)

    if tool_input is None:
        tool_input = {}

    try:
        tool_input_json = json.dumps(tool_input, indent=2)
    except (TypeError, ValueError):
        tool_input_json = None
        if isinstance(tool_input, dict):
            try:
                # default=str also covers unserialisable values nested inside
                # lists/dicts, which a shallow per-key conversion would miss.
                tool_input_json = json.dumps(tool_input, indent=2, default=str)
            except (TypeError, ValueError):
                # e.g. non-string dict keys or circular references.
                tool_input_json = None
        if tool_input_json is None:
            tool_input_json = str(tool_input)

    safe_name = html.escape(tool_name)
    safe_input = html.escape(tool_input_json)

    return f"""
    <div style="
        background-color: var(--surface-color-alt);
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin-top: 0.75rem;
        border-left: 3px solid var(--primary-text-light);">
        <div style="
            font-weight: 500;
            margin-bottom: 0.5rem;
            font-size: 0.9rem;
            color: var(--primary-text);">
            <span style="margin-right: 0.5rem;">🔧</span>{safe_name}
        </div>
        <div style="
            font-family: monospace;
            font-size: 0.85rem;
            white-space: pre-wrap;">{safe_input}</div>
    </div>
    """
111
+
112
+
113
def extract_assistant_content(msg):
    """Extract display text and rendered tool calls from an assistant message.

    Args:
        msg: Mapping that may carry ``"content"`` (``None``, str, number, or a
            list of parts) and/or ``"tool_calls"`` (a list of dicts read via
            their ``"name"`` and ``"args"`` entries).

    Returns:
        A ``(text, tool_calls_html)`` tuple. ``text`` is the stripped plain
        text (not HTML-escaped); ``tool_calls_html`` is the concatenated
        output of ``format_tool_call`` for every tool call found.
    """
    placeholder = "The assistant used the following tools:"
    assistant_text = ""
    tool_calls_html = ""

    content = msg.get("content")
    if isinstance(content, str):
        assistant_text = content
    elif isinstance(content, (int, float)):
        assistant_text = str(content)
    elif isinstance(content, list):
        # List content mixes text parts with "tool_use" parts.
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict):
                if "text" in item:
                    if item["text"] is not None:
                        assistant_text += str(item["text"]) + "\n"
                elif item.get("type") == "tool_use":
                    tool_input = item.get("input")
                    tool_calls_html += format_tool_call(
                        item.get("name", "Unknown Tool"),
                        {} if tool_input is None else tool_input,
                    )
            else:
                assistant_text += str(item) + "\n"

    # Messages commonly carry "tool_calls" next to an empty/None "content".
    # Consult the field whenever the content itself yielded no tool calls
    # (rather than only when "content" is absent) so those calls are not
    # dropped, while avoiding duplicates of "tool_use" parts.
    if not tool_calls_html and "tool_calls" in msg:
        for tool_call in msg.get("tool_calls") or []:
            if not isinstance(tool_call, dict):
                continue  # skip malformed entries instead of raising
            tool_args = tool_call.get("args")
            tool_calls_html += format_tool_call(
                tool_call.get("name", "Unknown Tool"),
                {} if tool_args is None else tool_args,
            )
        no_text = not assistant_text.strip()
        if "content" not in msg or (tool_calls_html and no_text):
            assistant_text = placeholder

    return assistant_text.strip(), tool_calls_html
167
+
168
+
169
def format_assistant_message(msg):
    """Render an assistant message as a left-aligned HTML chat bubble.

    Args:
        msg: Assistant message mapping, passed to
            ``extract_assistant_content`` to obtain its text and tool calls.

    Returns:
        An HTML fragment (str). The message text is HTML-escaped; the tool
        call markup returned by ``extract_assistant_content`` is inserted
        as-is because it is already HTML.
    """
    assistant_text, tool_calls_html = extract_assistant_content(msg)
    safe_text = html.escape(assistant_text)

    # The text sits directly between the tags because the container uses
    # `white-space: pre-wrap` and would otherwise render template whitespace.
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-assistant);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Assistant
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">{safe_text}</div>
            {tool_calls_html}
        </div>
    </div>
    """
202
+
203
+
204
def format_system_message(msg):
    """Render a system (or any other non user/assistant) message as HTML.

    Args:
        msg: Mapping with an optional ``"content"`` entry. The content may be
            ``None``, a string, a number, or a list whose items are strings,
            dicts carrying a ``"text"`` key, or other values (stringified).

    Returns:
        A centred, italic HTML fragment (str) with the HTML-escaped content.
    """
    content = msg.get("content", "")

    if content is None:
        content = ""
    elif isinstance(content, list):
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                if item["text"] is not None:
                    parts.append(str(item["text"]))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()

    # Escape so markup-like characters in the message are shown literally.
    safe_content = html.escape(str(content))

    return f"""
    <div style="
        text-align: center;
        margin-bottom: 1rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-system);
            padding: 0.75rem;
            border-radius: 0.5rem;
            color: var(--text-color);
            text-align: left;
            font-style: italic;
            font-size: 0.9rem;">
            {safe_content}
        </div>
    </div>
    """
247
+
248
+
249
def parse_complex_response(response):
    """Split a (possibly JSON-encoded) response into text and tool-call HTML.

    A response that is a JSON object (or a JSON array whose first element is
    an object) with a ``"content"`` and/or ``"tool_calls"`` entry is treated
    as a message envelope. Anything else is returned unchanged as text.

    Args:
        response: ``None``, a number, a string, or any object (stringified).

    Returns:
        A ``(text, tool_calls_html)`` tuple. ``text`` is plain text (not
        HTML-escaped). ``tool_calls_html`` is an HTML card showing the
        escaped JSON of the tool calls, or ``""`` when there are none.
    """
    if response is None:
        return "", ""
    if isinstance(response, (int, float)):
        return str(response), ""
    if not isinstance(response, str):
        response = str(response)

    # Only text that looks like a JSON array/object is worth parsing.
    if not response.strip().startswith(("[", "{")):
        return response, ""

    try:
        response_obj = json.loads(response)
    except ValueError:
        # Plain text that merely starts with a bracket: show it untouched
        # rather than appending a parser error to the user's answer.
        return response, ""

    # Array format: the envelope is the first item.
    if isinstance(response_obj, list) and response_obj:
        response_obj = response_obj[0]

    # Valid JSON that is not a message envelope (e.g. a dict of answers) is
    # displayed verbatim instead of being reduced to an empty string.
    if not isinstance(response_obj, dict) or not (
        "content" in response_obj or "tool_calls" in response_obj
    ):
        return response, ""

    text_content = ""
    content = response_obj.get("content")
    if isinstance(content, str):
        text_content = content
    elif isinstance(content, (int, float)):
        text_content = str(content)
    elif isinstance(content, list):
        # Keep only parts explicitly typed as text.
        for item in content:
            if (
                isinstance(item, dict)
                and item.get("type") == "text"
                and item.get("text") is not None
            ):
                text_content += str(item["text"]) + "\n"

    tool_calls_html = ""
    tool_calls = response_obj.get("tool_calls")
    if tool_calls:
        # tool_calls came out of json.loads, so json.dumps cannot fail here.
        tool_calls_json = html.escape(json.dumps(tool_calls, indent=2))
        tool_calls_html = f"""
        <div style="
            background-color: var(--surface-color-alt);
            padding: 0.75rem;
            border-radius: 0.5rem;
            margin-top: 0.75rem;
            border-left: 3px solid var(--primary-text-light);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                font-size: 0.9rem;
                color: var(--primary-text);">
                <span style="margin-right: 0.5rem;">🔧</span>Tool Calls
            </div>
            <div style="
                font-family: monospace;
                font-size: 0.85rem;
                white-space: pre-wrap;">{tool_calls_json}</div>
        </div>
        """

    return text_content.strip(), tool_calls_html
339
+
340
+
341
def format_final_response(response):
    """Render the final response as a highlighted HTML chat bubble.

    Args:
        response: The raw response; handed to ``parse_complex_response`` to
            separate plain text from tool-call markup.

    Returns:
        An HTML fragment (str). The response text is HTML-escaped; the tool
        call markup from ``parse_complex_response`` is inserted as-is.
    """
    # First try to process as complex JSON with tool calls.
    text_content, tool_calls_html = parse_complex_response(response)

    # If the text came back unchanged, try a basic JSON "content" lookup.
    if isinstance(response, str) and text_content == response:
        if response.strip().startswith("{") and "content" in response:
            try:
                response_obj = json.loads(response)
            except ValueError:
                response_obj = None  # not JSON after all; keep the raw text
            if isinstance(response_obj, dict) and "content" in response_obj:
                inner = response_obj["content"]
                if isinstance(inner, str):
                    text_content = inner
                else:
                    text_content = json.dumps(inner, indent=2)

    safe_text = html.escape(str(text_content))

    # The text sits directly between the tags because the container uses
    # `white-space: pre-wrap` and would otherwise render template whitespace.
    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        margin-top: 1.5rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--response-bg);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);
            border-left: 4px solid var(--primary-text);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Final Response
            </div>
            <div style="
                white-space: pre-wrap;
                line-height: 1.5;
                font-family: var(--font-sans);">{safe_text}</div>
            {tool_calls_html}
        </div>
    </div>
    """
398
+
399
+
400
def update_chat_display(existing_display, new_message):
    """Add one message to an already rendered chat display.

    The message is inserted before the "Final Response" separator when that
    marker occurs exactly once in ``existing_display``; otherwise it is
    appended to the end.

    Args:
        existing_display: Previously rendered chat HTML (``None`` is treated
            as an empty string).
        new_message: Message mapping with ``"role"`` and ``"content"``.

    Returns:
        The updated HTML (str). If formatting fails, the existing HTML is
        returned with an error box appended instead of raising.
    """
    # Normalise first so the error path below can always concatenate.
    existing_display = "" if existing_display is None else str(existing_display)

    try:
        # A missing or None role falls through to the system-style renderer.
        role = str(new_message.get("role") or "unknown").lower()

        if role == "user":
            message_html = format_user_message(new_message)
        elif role in ("assistant", "ai"):
            message_html = format_assistant_message(new_message)
        else:
            message_html = format_system_message(new_message)

        # Find the position to insert the new message (before the Final Response section)
        insert_marker = '<div style="padding-top: 0.5rem;margin-top: 1rem;margin-bottom: 1rem;border-top: 1px solid var(--border-color-light);'
        parts = existing_display.split(insert_marker)

        if len(parts) == 2:
            return parts[0] + message_html + insert_marker + parts[1]
        # Marker missing (or ambiguous): append to the end.
        return existing_display + message_html
    except Exception as e:  # display boundary: show the error, never raise
        error_text = html.escape(str(e))
        return (
            existing_display
            + f"""
        <div style="
            padding: 1rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;
            margin-top: 1rem;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Updating Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
        </div>
        """
        )
441
+
442
+
443
def format_chat_display(row):
    """Render a full conversation plus its final response as one HTML panel.

    Args:
        row: Mapping-like record indexed by ``"conversation"`` (a JSON string
            holding a list of message dicts) and ``"response"``.

    Returns:
        The chat panel HTML (str). If anything goes wrong, an error box with
        the escaped exception text and the raw conversation is returned
        instead of raising.
    """
    try:
        messages = json.loads(row["conversation"])

        rendered = []
        for msg in messages:
            # A missing or None role falls through to the system renderer.
            role = str(msg.get("role") or "unknown").lower()

            if role == "user":
                rendered.append(format_user_message(msg))
            elif role in ("assistant", "ai"):
                rendered.append(format_assistant_message(msg))
            else:
                rendered.append(format_system_message(msg))
        messages_html = "".join(rendered)

        response_html = format_final_response(row["response"])

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;
            font-family: var(--font-sans);">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">💬</span>Conversation
                </div>
            </div>
            {messages_html}

            {response_html}
        </div>
        """

    except Exception as e:  # display boundary: show the error, never raise
        # The failure may itself be a missing "conversation" entry, so the
        # lookup is guarded to keep the handler from raising again.
        try:
            raw_conversation = row["conversation"]
        except (KeyError, IndexError, TypeError):
            raw_conversation = "<unavailable>"

        error_text = html.escape(str(e))
        conversation_text = html.escape(str(raw_conversation))
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
            <div style="margin-top: 1rem; font-family: monospace; font-size: 0.8rem;">
                Original conversation: {conversation_text}
            </div>
        </div>
        """
512
+
513
+
514
def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def parse_tool_schema(tool):
    """Extract the name, description and parameter summaries from a tool schema.

    Two layouts are understood:

    * a schema with a ``"function"`` dict holding ``name``, ``description``
      and ``parameters`` (with ``properties`` / ``required``);
    * a flat schema with ``title``, ``description``, ``properties`` and
      ``required`` at the top level.

    Args:
        tool: The schema dict, or a list whose first element is the schema.
            Missing, ``None`` or malformed sections are treated as empty.

    Returns:
        A ``(name, description, parameters)`` tuple where ``parameters`` maps
        each parameter name to a one-line summary; required parameters are
        prefixed with ``"[REQUIRED] "``.
    """
    # Handle schema wrapped in a list (an empty list means "no schema").
    if isinstance(tool, list):
        tool = tool[0] if tool else {}
    tool = _as_dict(tool)

    function_data = tool.get("function")
    is_function_schema = isinstance(function_data, dict)

    if is_function_schema:
        name = function_data.get("name", "Unnamed Tool")
        description = function_data.get("description", "No description available")
        schema = _as_dict(function_data.get("parameters"))
    else:
        name = tool.get("title", "Unnamed Tool")
        description = tool.get("description", "No description available")
        schema = tool

    parameters = {}
    for param_name, param_data in _as_dict(schema.get("properties")).items():
        param_data = _as_dict(param_data)
        param_desc = param_data.get("description", "No description")
        param_type = param_data.get("type", "unknown")
        if is_function_schema:
            param_default = param_data.get("default", "None")
            details = f"Type: {param_type}, Default: {param_default}"
        else:
            param_title = param_data.get("title", param_name)
            details = f"Type: {param_type}, Title: {param_title}"
        parameters[param_name] = f"{param_desc} ({details})"

    # Flag required parameters. Iterating over `parameters` (not over the
    # required list) guarantees each summary is prefixed at most once.
    required_params = schema.get("required")
    if isinstance(required_params, (list, tuple)):
        for param_name in parameters:
            if param_name in required_params:
                parameters[param_name] = f"[REQUIRED] {parameters[param_name]}"

    return name, description, parameters
573
+
574
+
575
def format_parameters(parameters):
    """Render a mapping of parameter summaries as a list of HTML rows.

    Args:
        parameters: Mapping of parameter name to description. A description
            starting with ``"[REQUIRED] "`` marks the parameter as required;
            the marker is removed from the displayed text.

    Returns:
        HTML (str) with one row per parameter, or a "No parameters"
        placeholder when the mapping is empty or ``None``. Names and
        descriptions are HTML-escaped.
    """
    if not parameters:
        return '<div style="color: var(--text-muted); font-style: italic;">No parameters</div>'

    required_marker = "[REQUIRED] "
    # Separator styling for every row except the last one. It is decided
    # per row instead of through a page-wide `div:last-child` rule, which
    # would also restyle unrelated elements elsewhere on the page.
    separator_style = (
        "margin-bottom: 1.2rem; "
        "padding-bottom: 1.2rem; "
        "border-bottom: 1px solid var(--border-color);"
    )

    rows = []
    last_index = len(parameters) - 1
    for index, (name, desc) in enumerate(parameters.items()):
        desc = "" if desc is None else str(desc)
        is_required = desc.startswith(required_marker)
        if is_required:
            desc = desc[len(required_marker):]

        param_style = "required" if is_required else "optional"
        badge_label = "Required" if is_required else "Optional"
        badge_background = (
            "rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"
        )
        row_style = "" if index == last_index else separator_style

        rows.append(
            f"""
        <div style="{row_style}">
            <div style="
                display: flex;
                align-items: center;
                justify-content: space-between;
                margin-bottom: 0.5rem;">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    {html.escape(str(name))}
                </div>
                <div style="
                    font-size: 0.8rem;
                    padding: 0.2rem 0.6rem;
                    border-radius: 12px;
                    background-color: {badge_background};
                    color: var(--{param_style}-color);
                    font-weight: 500;">
                    {badge_label}
                </div>
            </div>
            <div style="
                color: var(--text-color);
                line-height: 1.5;
                font-size: 0.95rem;
                opacity: 0.9;">
                {html.escape(desc)}
            </div>
        </div>
        """
        )

    return "".join(rows)
640
+
641
+
642
def format_metrics(score, rationale, explanation):
    """Render the score card with its rationale and explanation as HTML.

    Args:
        score: Numeric score. Values >= 0.7 are labelled "High", values
            >= 0.4 "Medium" and anything lower "Low". A value that cannot be
            converted to a number (or is NaN) is shown as "N/A".
        rationale: Text for the "Rationale" section (``None`` shows empty).
        explanation: Text for the "Explanation" section (``None`` shows empty).

    Returns:
        An HTML fragment (str); rationale and explanation are HTML-escaped.
    """
    try:
        score_value = float(score)
    except (TypeError, ValueError):
        score_value = None
    # NaN is the only float that differs from itself; treat it as missing.
    if score_value is not None and score_value != score_value:
        score_value = None

    if score_value is None:
        score_color = "var(--text-muted)"
        score_emoji = "⚪"
        score_text = "Unavailable"
        score_display = "N/A"
    else:
        score_display = f"{score_value:.2f}"
        if score_value >= 0.7:
            score_color = "var(--score-high)"
            score_emoji = "🟢"
            score_text = "High"
        elif score_value >= 0.4:
            score_color = "var(--score-med)"
            score_emoji = "🟠"
            score_text = "Medium"
        else:
            score_color = "var(--score-low)"
            score_emoji = "🔴"
            score_text = "Low"

    safe_rationale = html.escape("" if rationale is None else str(rationale))
    safe_explanation = html.escape("" if explanation is None else str(explanation))

    return f"""
    <div style="
        padding: 1.75rem;
        background-color: var(--surface-color);
        border-radius: 10px;
        border: 1px solid var(--border-color);
        box-shadow: 0 3px 8px var(--shadow-color);">
        <div style="
            display: flex;
            align-items: center;
            margin-bottom: 1.75rem;
            padding-bottom: 1.5rem;
            border-bottom: 1px solid var(--border-color-light);">
            <div style="flex: 1;">
                <h3 style="
                    color: var(--text-color);
                    font-size: 1.2rem;
                    margin-bottom: 0.25rem;
                    font-weight: 600;">TSQ Score</h3>
                <div style="
                    display: flex;
                    align-items: baseline;">
                    <div style="
                        font-size: 2.5rem;
                        font-weight: 700;
                        color: {score_color};">
                        {score_display}
                    </div>
                    <div style="
                        margin-left: 0.75rem;
                        font-size: 1rem;
                        color: {score_color};
                        font-weight: 500;
                        display: flex;
                        align-items: center;">
                        <span style="margin-right: 0.5rem;">{score_emoji}</span>{score_text}
                    </div>
                </div>
            </div>
        </div>
        <div style="margin-bottom: 1.75rem;">
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.75rem;
                font-weight: 600;
                display: flex;
                align-items: center;">
                <span style="
                    display: inline-block;
                    width: 18px;
                    height: 18px;
                    background-color: var(--primary-text-light);
                    border-radius: 4px;
                    margin-right: 0.5rem;"></span>
                Rationale
            </h3>
            <div style="
                color: var(--text-color);
                line-height: 1.6;
                padding-left: 1.5rem;
                border-left: 3px solid var(--primary-text-light);
                font-size: 0.95rem;">
                {safe_rationale}
            </div>
        </div>
        <div>
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.75rem;
                font-weight: 600;
                display: flex;
                align-items: center;">
                <span style="
                    display: inline-block;
                    width: 18px;
                    height: 18px;
                    background-color: var(--primary-text-light);
                    border-radius: 4px;
                    margin-right: 0.5rem;"></span>
                Explanation
            </h3>
            <div style="
                color: var(--text-color);
                line-height: 1.6;
                padding-left: 1.5rem;
                border-left: 3px solid var(--primary-text-light);
                font-size: 0.95rem;">
                {safe_explanation}
            </div>
        </div>
    </div>
    """
752
+
753
+
754
def format_metrics_display(row):
    """Render the "Evaluation Metrics" panel for one result row.

    ``row`` is indexed with the keys ``"score"``, ``"rationale"`` and
    ``"explanation"``. The score is shown with two decimals and labelled
    High (>= 0.7), Medium (>= 0.4) or Low. Any exception raised while
    reading the row or formatting the panel is caught and reported in an
    error card instead of propagating.
    """
    try:
        score, rationale, explanation = (
            row[field] for field in ("score", "rationale", "explanation")
        )

        # Default to the lowest band, then upgrade if a threshold is met.
        tone, marker, label = "var(--score-low)", "🔴", "Low"
        if score >= 0.7:
            tone, marker, label = "var(--score-high)", "🟢", "High"
        elif score >= 0.4:
            tone, marker, label = "var(--score-med)", "🟠", "Medium"

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">📊</span>Evaluation Metrics
                </div>
            </div>

            <div style="
                margin-bottom: 1.5rem;
                padding-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);">
                <div style="
                    display: flex;
                    align-items: center;
                    justify-content: space-between;">
                    <div>
                        <div style="
                            font-weight: 600;
                            margin-bottom: 0.25rem;
                            color: var(--text-color);">
                            TSQ Score
                        </div>
                        <div style="
                            font-size: 2.5rem;
                            font-weight: 700;
                            color: {tone};
                            display: flex;
                            align-items: center;">
                            {score:.2f}
                            <div style="
                                margin-left: 0.75rem;
                                font-size: 1rem;
                                display: flex;
                                align-items: center;">
                                {marker} <span style="margin-left: 0.25rem;">{label}</span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>

            <div style="margin-bottom: 1.5rem;">
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Rationale
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {rationale}
                </div>
            </div>

            <div>
                <div style="
                    font-weight: 600;
                    margin-bottom: 0.75rem;
                    color: var(--text-color);
                    display: flex;
                    align-items: center;">
                    <span style="
                        display: inline-block;
                        width: 12px;
                        height: 12px;
                        background-color: var(--primary-text-light);
                        border-radius: 2px;
                        margin-right: 0.5rem;"></span>
                    Explanation
                </div>
                <div style="
                    background-color: var(--surface-color-alt);
                    padding: 1rem;
                    border-radius: 8px;
                    border-left: 3px solid var(--primary-text-light);
                    line-height: 1.5;
                    color: var(--text-color);
                    font-size: 0.95rem;">
                    {explanation}
                </div>
            </div>
        </div>
        """
    except Exception as exc:
        # Best-effort UI: show the failure in place of the panel.
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Metrics</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{str(exc)}</div>
        </div>
        """
903
+
904
+
905
def _esc(value):
    """Return ``value`` as text that is safe to embed in HTML."""
    from html import escape  # local import keeps this block self-contained

    return escape(str(value))


def _tool_schema(tool):
    """Extract ``(name, description, parameters, required)`` from one tool dict.

    Two layouts are recognised: a nested one with a ``"function"`` entry
    holding ``name``/``description``/``parameters``, and a flat one with
    ``"properties"``/``"required"`` directly on the tool.
    """
    name = tool.get("title", tool.get("name", "Unnamed Tool"))
    description = tool.get("description", "No description available")
    parameters = {}
    required_params = []

    if "function" in tool:
        function_data = tool["function"]
        name = function_data.get("name", name)
        description = function_data.get("description", description)
        schema = function_data.get("parameters") or {}
        for param_name, param_data in (schema.get("properties") or {}).items():
            entry = {
                "description": param_data.get("description", "No description"),
                "type": param_data.get("type", "unknown"),
            }
            # Record a default only when the schema declares one, so the
            # "Default:" line is not shown for parameters that have none.
            if "default" in param_data:
                entry["default"] = param_data["default"]
            parameters[param_name] = entry
        required_params = schema.get("required") or []
    elif "properties" in tool:
        for param_name, param_data in tool["properties"].items():
            parameters[param_name] = {
                "description": param_data.get("description", "No description"),
                "type": param_data.get("type", "unknown"),
                "title": param_data.get("title", param_name),
            }
        required_params = tool.get("required", [])

    return name, description, parameters, required_params


def _param_row_html(param_name, param_data, is_required, is_last):
    """Render one parameter row; the last row carries no bottom separator."""
    if is_required:
        badge_bg, badge_fg, badge_label = (
            "rgba(234, 67, 53, 0.1)",
            "var(--score-low)",
            "Required",
        )
    else:
        badge_bg, badge_fg, badge_label = (
            "rgba(160, 160, 160, 0.1)",
            "var(--text-muted)",
            "Optional",
        )

    # The separator is dropped inline for the final row. The stylesheet rule
    # previously used for this could not match, because the <style> element
    # itself was the container's last child.
    row_style = (
        ""
        if is_last
        else (
            "margin-bottom: 1rem; "
            "padding-bottom: 1rem; "
            "border-bottom: 1px solid var(--border-color-light);"
        )
    )

    default_html = ""
    if "default" in param_data:
        default_html = (
            '<div><span style="font-weight: 500;">Default:</span> '
            f'{_esc(param_data["default"])}</div>'
        )

    return f"""
    <div style="{row_style}">
        <div style="
            display: flex;
            align-items: center;
            justify-content: space-between;
            margin-bottom: 0.5rem;">
            <div style="
                font-weight: 600;
                color: var(--primary-text);
                font-size: 0.95rem;">
                {_esc(param_name)}
            </div>
            <div style="
                font-size: 0.75rem;
                padding: 0.15rem 0.5rem;
                border-radius: 12px;
                background-color: {badge_bg};
                color: {badge_fg};
                font-weight: 500;">
                {badge_label}
            </div>
        </div>
        <div style="
            color: var(--text-muted);
            line-height: 1.5;
            font-size: 0.85rem;
            margin-bottom: 0.25rem;">
            {_esc(param_data.get("description", "No description"))}
        </div>
        <div style="
            display: flex;
            font-size: 0.8rem;
            color: var(--text-muted);">
            <div style="margin-right: 1rem;">
                <span style="font-weight: 500;">Type:</span> {_esc(param_data.get("type", "unknown"))}
            </div>
            {default_html}
        </div>
    </div>
    """


def format_tool_info(tools_data):
    """Format the tool information with improved styling.

    Args:
        tools_data: The tools available to a conversation. May be a JSON
            string, an already-parsed list of tool dicts, or a single tool
            dict. Empty values, the literal ``"[]"`` and strings that are
            not valid JSON all produce the "No Tool Information" card.

    Returns:
        An HTML string: the list of tool cards, the empty-state card, or an
        error card if anything unexpected is raised while formatting.
    """
    no_tools_html = """
    <div style="
        padding: 1.5rem;
        text-align: center;
        color: var(--text-muted);
        background-color: var(--surface-color);
        border-radius: 10px;
        border: 1px solid var(--border-color);
        box-shadow: 0 2px 6px var(--shadow-color);">
        <div style="font-size: 1.5rem; margin-bottom: 0.75rem;">🔍</div>
        <div style="font-weight: 500; margin-bottom: 0.5rem;">No Tool Information</div>
        <div style="font-size: 0.9rem; font-style: italic;">This conversation doesn't use any tools</div>
    </div>
    """

    try:
        if not tools_data or tools_data == "[]":
            return no_tools_html

        if isinstance(tools_data, str):
            try:
                tools = json.loads(tools_data)
            except json.JSONDecodeError:
                # Unparseable text is treated the same as "no tools".
                tools = []
        else:
            tools = tools_data

        # A single tool given as a bare dict would otherwise be iterated
        # key by key.
        if isinstance(tools, dict):
            tools = [tools]

        if not tools:
            return no_tools_html

        tool_cards = []
        for tool in tools:
            name, description, parameters, required_params = _tool_schema(tool)

            if parameters:
                last_index = len(parameters) - 1
                params_html = "".join(
                    _param_row_html(
                        param_name,
                        param_data,
                        param_name in required_params,
                        index == last_index,
                    )
                    for index, (param_name, param_data) in enumerate(
                        parameters.items()
                    )
                )
            else:
                params_html = """
                <div style="
                    color: var(--text-muted);
                    font-style: italic;
                    padding: 0.75rem;
                    text-align: center;
                    font-size: 0.9rem;">
                    No parameters
                </div>
                """

            tool_cards.append(
                f"""
            <div style="
                margin-bottom: 1.5rem;
                padding: 1.5rem;
                border-radius: 8px;
                background-color: var(--surface-color-alt);
                border: 1px solid var(--border-color);
                box-shadow: 0 1px 3px var(--shadow-color);">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    margin-bottom: 0.75rem;
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    <span style="margin-right: 8px;">⚙️</span> {_esc(name)}
                </div>
                <div style="
                    color: var(--text-color);
                    margin-bottom: 1.25rem;
                    line-height: 1.5;
                    font-size: 0.95rem;
                    padding-left: 0.5rem;
                    border-left: 3px solid var(--primary-text-light);">
                    {_esc(description)}
                </div>
                <div style="
                    font-weight: 600;
                    color: var(--text-color);
                    margin-bottom: 0.75rem;
                    font-size: 0.9rem;">
                    Parameters:
                </div>
                <div class="tool-params">
                    {params_html}
                </div>
            </div>
            """
            )

        tool_items = "".join(tool_cards)
        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">🛠️</span>Available Tools
                </div>
            </div>
            {tool_items}
        </div>
        """
    except Exception as e:
        # Best-effort UI: report the failure in place of the panel.
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Tool Info</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{_esc(e)}</div>
        </div>
        """
data_loader.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def load_data(path="leaderboard.csv"):
    """Load the leaderboard table and drop incomplete rows.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``"leaderboard.csv"``, resolved against the current working
            directory exactly as before.

    Returns:
        A DataFrame holding only the rows that have no missing values.
    """
    return pd.read_csv(path).dropna()
8
+
9
# Leaderboard table, read once when this module is imported.
df = load_data()
# Distinct values of the "Model" column with surrounding whitespace removed.
MODELS = [model_name.strip() for model_name in df["Model"].unique().tolist()]
11
+
12
+ COMMON = """
13
+ <style>
14
+ @media (prefers-color-scheme: dark) {
15
+ :root {
16
+ --bg-primary: #0B0B19;
17
+ --bg-secondary: rgba(19, 19, 37, 0.4);
18
+ --bg-hover: rgba(30, 30, 45, 0.95);
19
+ --text-primary: #ffffff;
20
+ --text-secondary: #e2e8f0;
21
+ --text-tertiary: #e2e8f0;
22
+ --card-bg: rgba(17, 17, 27, 0.4);
23
+ --border-color: rgba(31, 41, 55, 0.5);
24
+ --border-hover: rgba(79, 70, 229, 0.4);
25
+ --accent-color: #ffffff;
26
+ --accent-bg: rgba(79, 70, 229, 0.1);
27
+ --blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);
28
+ --orange-gradient: linear-gradient(45deg, #E05205, #FAD8D2);
29
+ --green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
30
+ --shadow-color: rgba(0, 0, 0, 0.2);
31
+ }
32
+ }
33
+
34
+ @media (prefers-color-scheme: light) {
35
+ :root {
36
+ --bg-primary: #ffffff;
37
+ --bg-secondary: rgba(243, 244, 246, 0.4);
38
+ --bg-hover: rgba(229, 231, 235, 0.95);
39
+ --text-primary: #1F2937;
40
+ --text-secondary: #4B5563;
41
+ --text-tertiary: #6B7280;
42
+ --card-bg: rgba(249, 250, 251, 0.4);
43
+ --border-color: rgba(209, 213, 219, 0.5);
44
+ --border-hover: rgba(79, 70, 229, 0.4);
45
+ --accent-color: #4F46E5;
46
+ --accent-bg: rgba(79, 70, 229, 0.1);
47
+ --blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);
48
+ --orange-gradient: linear-gradient(45deg, #E05205, #FF8340);
49
+ --green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
50
+ --shadow-color: rgba(0, 0, 0, 0.1);
51
+ }
52
+ }
53
+ </style>
54
+ """
55
+
56
+
57
+ # Define constants for the links
58
+ PAPER_LINK = "https://github.com/jkhouja/L2"
59
+ CODE_LINK = "https://github.com/jkhouja/L2"
60
+ BLOG_LINK = "https://github.com/jkhouja/L2"
61
+ DATASET_LINK = "https://huggingface.co/datasets/jkhouja/LingOly-TOO"
62
+ ADD_MODEL_LINK = (
63
+ "https://mail.google.com/mail/?view=cm&fs=1&[email protected]"
64
+ "&su=Get%20Model%20Added%20to%20Leaderboard&body=Hi%20there%2C%0A%0AI%20"
65
+ "would%20like%20to%20add%20my%20model%20to%20the%20Lingoly-TOO%20Leaderboard.%0A%0AModel%20Name%3A%0AModel%20URL%3A%0A%0ABest%20regards"
66
+ )
67
+
68
+ HEADER_CONTENT = (
69
+ COMMON
70
+ + f"""
71
+ <style>
72
+
73
+ .header-wrapper {{
74
+ position: relative;
75
+ background: var(--bg-primary);
76
+ padding: 4rem 2rem;
77
+ border-radius: 16px;
78
+ margin-bottom: 0;
79
+ transition: all 0.3s ease;
80
+ }}
81
+
82
+ .header-content {{
83
+ max-width: 72rem;
84
+ margin: 0 auto;
85
+ }}
86
+
87
+ .title-section {{
88
+ position: relative;
89
+ display: flex;
90
+ align-items: center;
91
+ justify-content: center;
92
+ margin-bottom: 3rem;
93
+ }}
94
+
95
+ .title-gradient {{
96
+ font-size: 5rem;
97
+ font-weight: 800;
98
+ line-height: 1.1;
99
+ background: var(--orange-gradient);
100
+ -webkit-background-clip: text;
101
+ -webkit-text-fill-color: transparent;
102
+ margin-bottom: 0.5rem;
103
+ }}
104
+
105
+ .title-image {{
106
+ position: absolute;
107
+ top: 30px;
108
+ left: 30px;
109
+ width: 100px;
110
+ height: 100px;
111
+ /* To make it look ok on dark mode */
112
+ background-color: #ffffffd0;
113
+ padding: 10px;
114
+ border-radius: 6px;
115
+ }}
116
+
117
+ .subtitle-white {{
118
+ font-size: 5rem;
119
+ font-weight: 800;
120
+ line-height: 1.1;
121
+ color: var(--text-primary);
122
+ margin-bottom: 3rem;
123
+ transition: color 0.3s ease;
124
+ }}
125
+
126
+ .description {{
127
+ color: var(--text-secondary);
128
+ font-size: 1.25rem;
129
+ line-height: 1.75;
130
+ max-width: 800px;
131
+ margin: 0 auto;
132
+ text-align: center;
133
+ transition: color 0.3s ease;
134
+ }}
135
+
136
+ .highlight-question {{
137
+ background: var(--blue-gradient);
138
+ -webkit-background-clip: text;
139
+ -webkit-text-fill-color: transparent;
140
+ display: block;
141
+ margin-top: 1rem;
142
+ font-size: 1.5rem;
143
+ font-weight: 500;
144
+ }}
145
+
146
+ .metrics-grid {{
147
+ display: grid;
148
+ grid-template-columns: repeat(3, 1fr);
149
+ gap: 1.5rem;
150
+ margin-top: 4rem;
151
+ }}
152
+
153
+ .metric-card {{
154
+ background: var(--bg-secondary);
155
+ border: 1px solid var(--border-color);
156
+ text-align: center;
157
+ border-radius: 1rem;
158
+ padding: 2rem;
159
+ transition: all 0.3s ease;
160
+ align-items: center;
161
+ }}
162
+
163
+ .metric-card:hover {{
164
+ transform: translateY(-5px);
165
+ border-color: var(--border-hover);
166
+ box-shadow: 0 4px 20px var(--shadow-color);
167
+ }}
168
+
169
+ .metric-number {{
170
+ font-size: 4rem;
171
+ font-weight: 800;
172
+ margin-bottom: 1rem;
173
+ }}
174
+
175
+ .metric-blue {{
176
+ background: var(--blue-gradient);
177
+ -webkit-background-clip: text;
178
+ -webkit-text-fill-color: transparent;
179
+ }}
180
+
181
+ .metric-purple {{
182
+ background: var(--orange-gradient);
183
+ -webkit-background-clip: text;
184
+ -webkit-text-fill-color: transparent;
185
+ }}
186
+
187
+ .metric-green {{
188
+ background: var(--green-gradient);
189
+ -webkit-background-clip: text;
190
+ -webkit-text-fill-color: transparent;
191
+ }}
192
+
193
+ .metric-label {{
194
+ color: var(--text-secondary);
195
+ font-size: 1.5rem;
196
+ margin-bottom: 1.5rem;
197
+ transition: color 0.3s ease;
198
+ }}
199
+
200
+ .metric-detail {{
201
+ font-size: 1.125rem;
202
+ line-height: 1.75;
203
+ margin-top: 0.5rem;
204
+ transition: color 0.3s ease;
205
+ }}
206
+
207
+ .metric-detail.primary {{
208
+ color: var(--accent-color);
209
+ }}
210
+
211
+ .metric-detail.secondary {{
212
+ color: var(--text-secondary);
213
+ }}
214
+
215
+ .actions {{
216
+ display: flex;
217
+ gap: 1rem;
218
+ justify-content: center;
219
+ margin-top: 3rem;
220
+ }}
221
+
222
+ .action-button {{
223
+ display: flex;
224
+ align-items: center;
225
+ gap: 0.5rem;
226
+ padding: 0.75rem 1.5rem;
227
+ background: var(--bg-secondary);
228
+ border: 1px solid var(--border-color);
229
+ border-radius: 100px;
230
+ color: var(--text-primary) !important;
231
+ text-decoration: none !important;
232
+ font-size: 0.95rem;
233
+ transition: all 0.3s ease;
234
+ }}
235
+
236
+ .action-button:hover {{
237
+ transform: translateY(-2px);
238
+ border-color: var(--accent-color);
239
+ background: var(--accent-bg);
240
+ }}
241
+
242
+ @media (max-width: 1024px) {{
243
+ .title-image {{
244
+ top: 20px;
245
+ left: 20px;
246
+ width: 80px;
247
+ height: 80px;
248
+ }}
249
+ .title-gradient, .subtitle-white {{
250
+ font-size: 3rem;
251
+ }}
252
+ }}
253
+
254
+ @media (max-width: 620px) {{
255
+ .title-image {{
256
+ position: relative;
257
+ margin-top: -30px !important;
258
+ margin-bottom: 20px !important;
259
+ top: 0;
260
+ left: 0;
261
+ }}
262
+ }}
263
+ </style>
264
+
265
+ <div class="header-wrapper">
266
+ <div class="header-content">
267
+ <div class="title-section">
268
+ <div class="title-gradient">LingOly-TOO</div>
269
+ </div>
270
+
271
+ <div class="description">
272
+ LingOly-TOO (L2) is a challenging reasoning benchmark designed to minimize the chance of answering by guessing.
273
+ It is developed by rewriting (obfuscating) parts of questions and answers so that the chance of leakage in training data is minimal.
274
+ <div class="highlight-question">
275
+ "How do top LLMs reason on unseen linguistic questions?"
276
+ </div>
277
+ </div>
278
+ </div>
279
+
280
+ <div class="actions">
281
+ <a href="{PAPER_LINK}" class="action-button">
282
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
283
+ <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
284
+ <line x1="8" y1="12" x2="16" y2="12"/>
285
+ </svg>
286
+ Paper
287
+ </a>
288
+ <a href="{CODE_LINK}" class="action-button">
289
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
290
+ <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
291
+ </svg>
292
+ Code
293
+ </a>
294
+ <a href="{BLOG_LINK}" class="action-button">
295
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
296
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
297
+ <polyline points="7 10 12 15 17 10"/>
298
+ <line x1="12" y1="15" x2="12" y2="3"/>
299
+ </svg>
300
+ Blog
301
+ </a>
302
+ <a href="{DATASET_LINK}" class="action-button">
303
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
304
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
305
+ <polyline points="7 10 12 15 17 10"/>
306
+ <line x1="12" y1="15" x2="12" y2="3"/>
307
+ </svg>
308
+ Dataset
309
+ </a>
310
+ <a href="{ADD_MODEL_LINK}" class="action-button" target="_blank" rel="noopener noreferrer">
311
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
312
+ <path d="M19 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2z"/>
313
+ <line x1="12" y1="8" x2="12" y2="16"/>
314
+ <line x1="8" y1="12" x2="16" y2="12"/>
315
+ </svg>
316
+ Add Your Model
317
+ </a>
318
+ </div>
319
+ </div>
320
+ """
321
+ )
322
+
323
+ CARDS = """ <div class="metrics-grid">
324
+ <div class="metric-card">
325
+ <div class="metric-number metric-blue">11</div>
326
+ <div class="metric-label">Total Models</div>
327
+ <div class="metric-detail primary">4 Reasoning Models</div>
328
+ <div class="metric-detail primary">4 Open Source Models</div>
329
+ </div>
330
+
331
+ <div class="metric-card">
332
+ <div class="metric-number metric-purple">82</div>
333
+ <div class="metric-label">Linguistics Problems</div>
334
+ <div class="metric-detail primary">6 Permutations per problem</div>
335
+ <div class="metric-detail primary">Problems from Low-resource Languages</div>
336
+ </div>
337
+
338
+ <div class="metric-card">
339
+ <div class="metric-number metric-green">1.2k</div>
340
+ <div class="metric-label">Total Questions</div>
341
+ <div class="metric-detail primary">Includes Match-Up, Multiple Choice and Completion</div>
342
+ </div>
343
+ </div>"""
344
+
345
+ METHODOLOGY = """
346
+ <style>
347
+ @media (prefers-color-scheme: dark) {
348
+ :root {
349
+ --bg-primary: #0B0B19;
350
+ --bg-secondary: rgba(19, 19, 37, 0.4);
351
+ --text-primary: #ffffff;
352
+ --text-secondary: #94A3B8;
353
+ --border-primary: rgba(31, 41, 55, 0.5);
354
+ --accent-blue: #60A5FA;
355
+ --accent-purple: #A78BFA;
356
+ --card-hover-bg: rgba(79, 70, 229, 0.1);
357
+ --shadow-color: rgba(79, 70, 229, 0.1);
358
+ }
359
+ }
360
+
361
+ @media (prefers-color-scheme: light) {
362
+ :root {
363
+ --bg-primary: #ffffff;
364
+ --bg-secondary: rgba(243, 244, 246, 0.4);
365
+ --text-primary: #111827;
366
+ --text-secondary: #4B5563;
367
+ --border-primary: rgba(209, 213, 219, 0.5);
368
+ --accent-blue: #3B82F6;
369
+ --accent-purple: #8B5CF6;
370
+ --card-hover-bg: rgba(243, 244, 246, 0.8);
371
+ --shadow-color: rgba(0, 0, 0, 0.1);
372
+ }
373
+ }
374
+
375
+ .dataset-table {
376
+ width: 100%;
377
+ border-collapse: separate;
378
+ border-spacing: 0;
379
+ margin: 2rem 0;
380
+ background: var(--bg-secondary);
381
+ border-radius: 1rem;
382
+ overflow: hidden;
383
+ box-shadow: 0 4px 20px var(--shadow-color);
384
+ }
385
+
386
+ .dataset-table thead {
387
+ background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
388
+ }
389
+
390
+ .dataset-table th {
391
+ padding: 1.25rem 1rem;
392
+ text-align: left;
393
+ color: white;
394
+ font-weight: 600;
395
+ font-size: 1rem;
396
+ }
397
+
398
+ .dataset-table td {
399
+ padding: 1rem;
400
+ border-bottom: 1px solid var(--border-primary);
401
+ color: var(--text-secondary);
402
+ transition: all 0.2s ease;
403
+ }
404
+
405
+ .dataset-table tbody tr:hover td {
406
+ background: var(--card-hover-bg);
407
+ color: var(--text-primary);
408
+ }
409
+
410
+ .methodology-content {
411
+ max-width: 1200px;
412
+ margin: 0 auto;
413
+ padding: 2rem;
414
+ color: var(--text-secondary);
415
+ line-height: 1.7;
416
+ font-size: 1rem;
417
+ }
418
+
419
+ .section-title {
420
+ font-size: 2.5rem;
421
+ font-weight: 700;
422
+ margin: 3rem 0 1.5rem;
423
+ color: var(--text-primary);
424
+ background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
425
+ -webkit-background-clip: text;
426
+ -webkit-text-fill-color: transparent;
427
+ letter-spacing: -0.02em;
428
+ }
429
+ </style>
430
+
431
+ <div class="section-divider"></div>
432
+ <h1 class="section-title">Citation</h1>
433
+ <div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">@article{lingoly-too2025,
434
+ author = {Khouja, Jude and Korgul, Karolina and Hellsten, Simeon and Yang, Lingyi
435
+ and Neacșu, Vlad A. and Mayne, Harry and Kearns, Ryan O. and Bean, Andrew M. and Mahdi, Adam},
436
+ title = {LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic Templatisation and Orthographic Obfuscation},
437
+ year = {2025},
438
+ primaryClass={cs.CL},
439
+ archivePrefix={arXiv},
440
+ }</div>
441
+
442
+ """
443
+
444
+ UNUSED = """
445
+ <!-- Insights Section -->
446
+ <h1 class="section-title">Key insights</h1>
447
+ <p>
448
+ We use orthographic templatisation on Linguistics Olympiad problems to create obfuscated variants
449
+ that maintain the same reasoning steps. Through extensive experiments, we show that obfuscation
450
+ reduces measurement bias from data exposure and provides reasoning estimates that correlate with
451
+ the ability to solve linguistic reasoning problems. Additionally, we find that state-of-the-art
452
+ models exhibit inconsistent reasoning abilities and that simple fine-tuning does not necessarily
453
+ equip models with context-free and robust problem-solving skills. This work establishes a reasoning
454
+ measure that is resilient to data exposure effects and supports ongoing efforts to fully understand
455
+ response generation in advanced models.
456
+ </p>
457
+ """
leaderboard.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Provider,Type,Baseline score,Obfuscated score
2
+ Aya 23 35B,Cohere,Open source,0.10654349746757057,0.05708180119638717
3
+ Claude 3.5 Sonnet,Anthropic,Closed source,0.48255271180599657,0.2810140963355337
4
+ Claude 3.7 Sonnet,Anthropic,Closed source,0.5994013309112796,0.4357505520191723
5
+ GPT 4.5,OpenAI,Closed source,0.4208265195574057,0.2545024812218498
6
+ GPT 4o,OpenAI,Closed source,0.31371291749661456,0.1563339989919302
7
+ Gemini 1.5 Pro,Google,Closed source,0.3690345167304693,0.20461522579355207
8
+ Llama 3.3 70B-Instruct,Meta,Open source,0.11452795751175084,0.08213118755937426
9
+ Phi4,Microsoft,Open source,0.1809802769595679,0.10996628714372364
10
+ DeepSeek R1,DeepSeek,Open source,0.3965527162895584,0.2649618642615188
11
+ o1-preview,OpenAI,Closed source,0.47730527712315257,0.3222020975619888
12
+ o3-mini,OpenAI,Closed source,0.42172257807447155,0.3059086523804619
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==5.18.0
2
+ pandas
3
+ matplotlib
4
+ plotly
results.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
3
+ gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
4
+ gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
5
+ gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
6
+ gpt-4.5-preview-2025-02-27,Private,Normal,OpenAI,75,150,0.900,0.93,0.87,0.85,0.91,0.92,0.97,0.92,0.99,0.67,0.85,0.98,0.85,1,0.98,0.8,0.915
7
+ gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
8
+ gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
9
+ o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
10
+ o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975
11
+ mistral-small-2501,Open source,Normal,Mistral,0.1,0.3,0.832,0.88,0.78,0.83,0.78,0.92,0.97,0.76,0.99,0.62,0.8,0.82,0.77,0.95,0.92,0.74,0.775
12
+ gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
13
+ qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
14
+ mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
15
+ claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
16
+ Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
17
+ claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
18
+ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
19
+ ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
20
+ Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
21
+ open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
22
+ Dataset Avg,,,,,,,0.84,0.81,0.82,0.81,0.79,0.89,0.82,0.96,0.64,0.82,0.84,0.83,0.93,0.86,0.76,0.82
tabs/leaderboard.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from data_loader import METHODOLOGY
3
+ from utils import (
4
+ get_rank_badge,
5
+ get_score_bar,
6
+ get_type_badge,
7
+ )
8
+
9
def filter_leaderboard(df, sort_by):
    """Render the leaderboard as a self-contained HTML fragment.

    Args:
        df: DataFrame with the columns "Model", "Provider", "Type",
            "Obfuscated score" and "Baseline score". It is not modified;
            sorting and ranking happen on a new frame.
        sort_by: "Score after obfuscation" sorts by "Obfuscated score";
            any other value sorts by "Baseline score".

    Returns:
        str: A ``<style>`` block followed by the complete table markup,
        one ``<tr>`` per model in descending score order.
    """
    sort_column = (
        "Obfuscated score"
        if sort_by == "Score after obfuscation"
        else "Baseline score"
    )
    # sort_values returns a new frame, so adding "Rank" leaves `df` untouched.
    ranked = df.sort_values(by=sort_column, ascending=False)
    ranked["Rank"] = range(1, len(ranked) + 1)

    # Static part: nothing is interpolated, so this is a plain string and the
    # CSS braces do not need f-string doubling.
    table_head = """
    <style>
        @media (prefers-color-scheme: dark) {
            :root {
                --bg-color: #1a1b1e;
                --text-color: #ffffff;
                --border-color: #2d2e32;
                --hover-bg: #2d2e32;
                --note-bg: #2d2e32;
                --note-text: #a1a1aa;
                --accent-blue: #60A5FA;
                --accent-purple: #A78BFA;
                --accent-pink: #F472B6;
                --score-bg: rgba(255, 255, 255, 0.1);
            }
        }

        @media (prefers-color-scheme: light) {
            :root {
                --bg-color: #ffffff;
                --text-color: #000000;
                --border-color: #e5e7eb;
                --hover-bg: #f3f4f6;
                --note-bg: #f3f4f6;
                --note-text: #4b5563;
                --accent-blue: #3B82F6;
                --accent-purple: #8B5CF6;
                --accent-pink: #EC4899;
                --score-bg: rgba(0, 0, 0, 0.1);
            }
        }

        .dark-table-container {
            background: var(--bg-color);
            border-radius: 12px;
            padding: 1px;
            margin: 20px 0;
        }

        .dark-styled-table {
            width: 100%;
            border-collapse: collapse;
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            background: var(--bg-color);
            color: var(--text-color);
        }

        .dark-styled-table thead {
            position: sticky;
            top: 0;
            background: var(--bg-color);
            z-index: 1;
        }

        .dark-styled-table th {
            padding: 16px;
            text-align: left;
            font-weight: 500;
            color: var(--text-color);
            border-bottom: 1px solid var(--border-color);
        }

        .dark-styled-table td {
            padding: 16px;
            border-bottom: 1px solid var(--border-color);
            color: var(--text-color);
        }

        .dark-styled-table tbody tr:hover {
            background: var(--hover-bg);
        }

        .model-cell {
            font-weight: 500;
        }

        .score-cell {
            font-weight: 500;
        }

        .note-box {
            margin-top: 20px;
            padding: 16px;
            background: var(--note-bg);
            border-radius: 8px;
            color: var(--note-text);
        }
    </style>

    <div class="dark-table-container">
        <table class="dark-styled-table">
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Provider</th>
                    <th>Type</th>
                    <th>Exact match score (obfuscated questions)</th>
                    <th>Exact match score (all questions)</th>
                </tr>
            </thead>
            <tbody>
    """

    # Collect the rows and join once instead of growing a string with +=.
    body_rows = [
        f"""
                <tr>
                    <td>{get_rank_badge(row['Rank'])}</td>
                    <td class="model-cell">{row['Model']}</td>
                    <td class="vendor-cell">{row['Provider']}</td>
                    <td>{get_type_badge(row['Type'])}</td>
                    <td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
                    <td class="score-cell">{get_score_bar(row['Baseline score'])}</td>
                </tr>
        """
        for _, row in ranked.iterrows()
    ]

    # Close every element opened in table_head so the fragment is balanced
    # and cannot swallow markup rendered after it.
    table_tail = """
            </tbody>
        </table>
    </div>
    """

    return table_head + "".join(body_rows) + table_tail
137
+
138
+
139
def create_leaderboard_tab(df, HEADER_CONTENT, CARDS):
    """Create the leaderboard tab's components and wire up re-sorting.

    Args:
        df: Leaderboard DataFrame handed to filter_leaderboard on every
            dropdown change.
        HEADER_CONTENT: HTML string shown at the top of the tab.
        CARDS: HTML string appended directly after HEADER_CONTENT.

    Returns:
        The gr.HTML component that receives the rendered table.
    """
    gr.HTML(HEADER_CONTENT + CARDS)

    # Filters row
    with gr.Row(equal_height=True), gr.Column(scale=0.4):
        sort_by = gr.Dropdown(
            choices=["Score after obfuscation", "Score on all"],
            value="Score after obfuscation",
            label="Sort by",
        )

    # Content. The component starts empty and is only filled by the change
    # event below; the first render is presumably triggered by the caller
    # through the returned component -- TODO confirm in app.py.
    output = gr.HTML()

    def _render_table(selected_sort):
        return filter_leaderboard(df, selected_sort)

    sort_by.change(fn=_render_table, inputs=[sort_by], outputs=[output])

    return output
utils.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_rank_badge(rank):
    """Return the HTML snippet used to display a leaderboard position.

    Ranks 1, 2 and 3 are drawn as gold, silver and bronze gradient badges
    labelled "1st", "2nd" and "3rd". Every other rank is shown as the bare
    value in muted grey.
    """
    # rank -> (label, background gradient, text colour)
    podium = {
        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
    }

    if rank not in podium:
        return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        justify-content: center;
        min-width: 28px;
        color: #a1a1aa;
        font-weight: 500;
    ">
        {rank}
    </div>
    """

    label, gradient, text_color = podium[rank]
    return f"""
        <div style="
            display: inline-flex;
            align-items: center;
            justify-content: center;
            min-width: 48px;
            padding: 4px 12px;
            background: {gradient};
            color: {text_color};
            border-radius: 6px;
            font-weight: 600;
            font-size: 0.9em;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
        ">
            {label}
        </div>
        """
40
+
41
def get_score_bar(score):
    """Return HTML for a horizontal gradient bar plus the score as a number.

    Args:
        score: Fraction to display; 1.0 corresponds to a full bar. It is
            multiplied by 100 for both the bar width and the label.

    Returns:
        str: A flex container holding the bar track and a label that shows
        the percentage with one decimal place.
    """
    percent = score * 100
    # Keep the fill inside the track: a negative width is invalid CSS and a
    # width above 100% only relies on overflow clipping. The label below
    # still shows the unclamped value.
    bar_width = min(max(percent, 0.0), 100.0)
    # The label colour uses --text-color, the variable the leaderboard
    # stylesheet defines for both colour schemes; white is only the fallback.
    return f"""
    <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
        <div style="
            flex-grow: 1;
            height: 8px;
            background: var(--score-bg, rgba(255, 255, 255, 0.1));
            border-radius: 4px;
            overflow: hidden;
            max-width: 200px;
        ">
            <div style="
                width: {bar_width}%;
                height: 100%;
                background: linear-gradient(90deg, var(--accent-blue, #60A5FA), var(--accent-orange, #E05205));
                border-radius: 4px;
                transition: width 0.3s ease;
            "></div>
        </div>
        <span style="
            font-family: 'SF Mono', monospace;
            font-weight: 600;
            color: var(--text-color, #ffffff);
            min-width: 60px;
        ">{percent:.1f}</span>
    </div>
    """
70
def get_chart_colors():
    """Return the colour palette used by the charts and the type badges.

    Only a single light palette is provided; no theme detection takes place.

    Returns:
        dict: Hex colours for the "Private" and "Open source" model types,
        the three "performance_bands" fills, the "text" and "background"
        colours, and "grid" as an RGBA tuple.
    """
    model_type_colors = {
        "Private": "#3B82F6",  # blue
        "Open source": "#60CC1C",  # green
    }
    chart_chrome = {
        "performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
        "text": "#111827",
        "background": "#FFFFFF",
        "grid": (0, 0, 0, 0.1),  # RGBA tuple for grid
    }
    return {**model_type_colors, **chart_chrome}
88
+
89
def get_type_badge(model_type):
    """Return HTML for a small coloured badge showing the model type.

    Args:
        model_type: Text shown inside the badge. "Private" (with
            "Closed source" kept as an alias) and "Open source" use the
            matching colours from get_chart_colors(); anything else falls
            back to indigo (#4F46E5).

    Returns:
        str: An inline-flex ``<div>`` with white text on the chosen colour.
    """
    palette = get_chart_colors()
    # The palette and the chart legends are keyed on "Private", so that
    # label must be recognised here as well; previously only
    # "Closed source" was, and "Private" got the fallback colour.
    badge_colors = {
        "Private": palette["Private"],
        "Closed source": palette["Private"],
        "Open source": palette["Open source"],
    }
    bg_color = badge_colors.get(model_type, "#4F46E5")
    return f"""
    <div style="
        display: inline-flex;
        align-items: center;
        padding: 4px 8px;
        background: {bg_color};
        color: white;
        border-radius: 4px;
        font-size: 0.85em;
        font-weight: 500;
    ">
        {model_type}
    </div>
    """
visualization.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import get_chart_colors
2
+ import matplotlib
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import plotly.graph_objects as go
6
+
7
+
8
def setup_matplotlib():
    """Select matplotlib's "Agg" backend and close all open pyplot figures."""
    backend_name = "Agg"
    matplotlib.use(backend_name)
    plt.close("all")
11
+
12
+
13
def get_performance_chart(df, category_name="Overall"):
    """Draw a horizontal bar chart of every model's "Category Score".

    Args:
        df: DataFrame with "Model", "Model Type" and "Category Score"
            columns. Each "Model Type" value must be a key of
            get_chart_colors(), since it is used to look up the bar colour.
        category_name: Text appended to the chart title.

    Returns:
        The matplotlib Figure. It has already been passed to plt.close()
        by the ``finally`` clause when the caller receives it.
    """
    plt.close("all")
    palette = get_chart_colors()
    score_column = "Category Score"
    ranked = df.sort_values(score_column, ascending=True)
    row_count = len(ranked)

    # At least 8 inches tall, growing by 0.8 inch per model.
    fig, ax = plt.subplots(figsize=(16, max(8, row_count * 0.8)))
    plt.rcParams.update({"font.size": 12})

    background = palette["background"]
    fig.patch.set_facecolor(background)
    ax.set_facecolor(background)

    try:
        bar_positions = np.arange(row_count)
        bar_colors = [palette[model_type] for model_type in ranked["Model Type"]]
        ax.barh(
            bar_positions,
            ranked[score_column],
            height=0.4,
            capstyle="round",
            color=bar_colors,
        )

        ax.set_title(
            f"Model Performance - {category_name}",
            pad=20,
            fontsize=20,
            fontweight="bold",
            color=palette["text"],
        )
        # NOTE(review): this label mentions "Tool Selection Quality" while the
        # leaderboard table reports exact-match scores -- confirm the wording.
        ax.set_xlabel(
            "Average Score (Tool Selection Quality)",
            fontsize=14,
            labelpad=10,
            color=palette["text"],
        )
        ax.set_xlim(0.0, 1.0)

        ax.set_yticks(bar_positions)
        ax.set_yticklabels(ranked["Model"], fontsize=12, color=palette["text"])

        plt.subplots_adjust(left=0.35)

        # Print each score just to the right of the end of its bar.
        for row_index, score in enumerate(ranked[score_column]):
            ax.text(
                score + 0.01,
                row_index,
                f"{score:.3f}",
                va="center",
                fontsize=12,
                fontweight="bold",
                color=palette["text"],
            )

        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=palette["grid"])
        ax.spines[["top", "right"]].set_visible(False)
        ax.spines[["bottom", "left"]].set_color(palette["grid"])
        ax.tick_params(colors=palette["text"])

        legend_handles = [
            plt.Rectangle((0, 0), 1, 1, facecolor=palette[kind], label=kind)
            for kind in ("Private", "Open source")
        ]
        ax.legend(
            handles=legend_handles,
            title="Model Type",
            loc="lower right",
            fontsize=12,
            title_fontsize=14,
            facecolor=palette["background"],
            labelcolor=palette["text"],
        )

        plt.tight_layout()
        return fig
    finally:
        plt.close(fig)
91
+
92
def create_radar_plot(df, model_names):
    """Build a Plotly radar chart comparing the given models.

    The radar axes are all columns of ``df`` from position 7 onwards, except
    "IO Cost". Each model's polygon is closed by repeating its first point.

    Args:
        df: DataFrame with a "Model" column; the first row matching each
            name supplies the values.
        model_names: Names to plot. Fill and line colours cycle through two
            presets, so a third model reuses the first model's colours.

    Returns:
        plotly.graph_objects.Figure with one Scatterpolar trace per model.

    Raises:
        IndexError: If a name has no matching row, or if no axis columns
            are found.
    """
    axis_names = [column for column in df.columns[7:] if column != "IO Cost"]
    fill_palette = ("rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)")
    stroke_palette = ("#4F46E5", "#16A34A")

    fig = go.Figure()

    for position, model_name in enumerate(model_names):
        record = df[df["Model"] == model_name].iloc[0]
        radii = [record[axis] for axis in axis_names]
        # Repeat the first point so the outline returns to its start.
        radii.append(radii[0])
        closed_axes = axis_names + [axis_names[0]]

        trace = go.Scatterpolar(
            r=radii,
            theta=closed_axes,
            fill="toself",
            fillcolor=fill_palette[position % len(fill_palette)],
            line={
                "color": stroke_palette[position % len(stroke_palette)],
                "width": 2,
            },
            name=model_name,
            text=[f"{val:.3f}" for val in radii],
            textposition="middle right",
            mode="lines+markers+text",
        )
        fig.add_trace(trace)

    polar_layout = {
        "radialaxis": {
            "visible": True,
            "range": [0, 1],
            "showline": False,
            "tickfont": {"size": 12},
        },
        "angularaxis": {
            "tickfont": {"size": 13, "family": "Arial"},
            "rotation": 90,
            "direction": "clockwise",
        },
    }
    legend_layout = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.2,
        "xanchor": "center",
        "x": 0.5,
        "font": {"size": 14},
    }
    title_layout = {
        "text": "Model Comparison",
        "x": 0.5,
        "y": 0.95,
        "font": {"size": 24, "family": "Arial", "color": "#1F2937"},
    }

    fig.update_layout(
        polar=polar_layout,
        showlegend=True,
        legend=legend_layout,
        title=title_layout,
        paper_bgcolor="white",
        plot_bgcolor="white",
        height=700,
        width=900,
        margin={"t": 100, "b": 100, "l": 80, "r": 80},
    )

    return fig
153
+
154
+
155
def get_performance_cost_chart(df, category_name="Overall"):
    """Draw a score-versus-cost scatter plot with one labelled point per model.

    The x axis is logarithmic and fixed to 0.08-40; the y axis shows the
    score as a percentage and is fixed to 60-100, so rows outside those
    ranges are not visible. Three shaded bands mark 60-75, 75-85 and 85-100.

    Args:
        df: DataFrame with "Model", "Model Type", "IO Cost" and
            "Category Score" columns. Each "Model Type" value must be a key
            of get_chart_colors().
        category_name: Text appended to the chart title.

    Returns:
        The matplotlib Figure. As in get_performance_chart, it has already
        been passed to plt.close() when the caller receives it, so repeated
        calls do not pile up figures in pyplot's registry.
    """
    colors = get_chart_colors()
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

    try:
        fig.patch.set_facecolor(colors["background"])
        ax.set_facecolor(colors["background"])
        ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])

        score_column = "Category Score"
        label_box = dict(
            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
        )

        for _, row in df.iterrows():
            # Fill and outline share the model-type colour.
            color = colors[row["Model Type"]]
            # Models scoring above 0.85 get a slightly larger marker.
            size = 100 if row[score_column] > 0.85 else 80
            point = (row["IO Cost"], row[score_column] * 100)

            ax.scatter(
                point[0],
                point[1],
                c=color,
                s=size,
                alpha=0.9,
                edgecolor=color,
                linewidth=1,
                zorder=5,
            )

            ax.annotate(
                f"{row['Model']}\n(${row['IO Cost']:.2f})",
                point,
                xytext=(5, 5),
                textcoords="offset points",
                fontsize=8,
                color=colors["text"],
                bbox=label_box,
                zorder=6,
            )

        ax.set_xscale("log")
        ax.set_xlim(0.08, 40)
        ax.set_ylim(60, 100)

        ax.set_xlabel(
            "I/O Cost per Million Tokens ($)",
            fontsize=10,
            labelpad=10,
            color=colors["text"],
        )
        ax.set_ylabel(
            "Model Performance Score", fontsize=10, labelpad=10, color=colors["text"]
        )

        # Empty scatters drawn on this axes serve as legend handles; using
        # `ax` instead of `plt` avoids depending on pyplot's current axes.
        legend_elements = [
            ax.scatter([], [], c=colors[label], label=label, s=80)
            for label in ["Private", "Open source"]
        ]
        ax.legend(
            handles=legend_elements,
            loc="upper right",
            frameon=True,
            facecolor=colors["background"],
            edgecolor="none",
            fontsize=9,
            labelcolor=colors["text"],
        )

        ax.set_title(
            f"Performance vs. Cost - {category_name}",
            fontsize=14,
            pad=15,
            fontweight="bold",
            color=colors["text"],
        )

        # Shade the three score bands from the top (85-100) down to 60-75.
        for y1, y2, color in zip(
            [85, 75, 60], [100, 85, 75], colors["performance_bands"]
        ):
            ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)

        ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
        ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
        ax.xaxis.set_minor_locator(
            plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1)
        )

        for spine in ax.spines.values():
            spine.set_color(colors["grid"])

        fig.tight_layout()
        return fig
    finally:
        # Unregister the figure from pyplot even when drawing fails.
        plt.close(fig)