송종윤/AI Productivity팀(SR)/삼성전자 committed on
Commit
8a254d6
·
0 Parent(s):

Initial commit

Browse files

Initial commit

minor

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
NOTICE ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Certain styling elements of this project are partially adapted from HuggingFace leaderboard code,
2
+ (https://huggingface.co/spaces/galileo-ai/agent-leaderboard),
3
+ licensed under the Apache License, Version 2.0.
4
+ Modifications have been made by Samsung Research.
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TRUEBench
3
+ emoji: 🔥
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 5.38.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ hf_oauth: true
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from apscheduler.schedulers.background import BackgroundScheduler
4
+ from huggingface_hub import snapshot_download
5
+ from src.data_utils import get_dataframe_category, get_dataframe_language
6
+ import src.config as configs
7
+ from utils import get_profile_and_organizations, download_with_restart
8
+ from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter
9
+
10
+ from src.about import (
11
+ CITATION_BUTTON_LABEL,
12
+ CITATION_BUTTON_TEXT,
13
+ EVALUATION_QUEUE_TEXT,
14
+ EVALUATION_QUEUE_TEXT_OPTION1,
15
+ INTRODUCTION_TEXT,
16
+ BANNER,
17
+ TITLE,
18
+ LINK,
19
+ )
20
+ from src.display.css_html_js import custom_css
21
+ from src.display.utils import (
22
+ Precision
23
+ )
24
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
25
+ from src.submission.submit import add_new_eval_option
26
+
27
+ from ui import create_leaderboard_tab
28
+
29
def restart_space():
    """Restart this Hugging Face Space through the Hub API.

    Used both as the scheduler job (periodic restart) and as the
    fallback handler when the initial dataset snapshot download fails.
    """
    API.restart_space(repo_id=REPO_ID)
31
+
32
+ ### Space initialisation
33
+ download_with_restart(
34
+ snapshot_download,
35
+ repo_id=QUEUE_REPO,
36
+ local_dir=EVAL_REQUESTS_PATH,
37
+ repo_type="dataset",
38
+ token=TOKEN,
39
+ restart_func=restart_space
40
+ )
41
+ download_with_restart(
42
+ snapshot_download,
43
+ repo_id=RESULTS_REPO,
44
+ local_dir=EVAL_RESULTS_PATH,
45
+ repo_type="dataset",
46
+ token=TOKEN,
47
+ restart_func=restart_space
48
+ )
49
+
50
+ theme = gr.themes.Default(
51
+ primary_hue="gray",
52
+ neutral_hue="gray"
53
+ )
54
+
55
+ demo = gr.Blocks(css=custom_css, theme=theme)
56
+ with demo:
57
+ gr.HTML(BANNER + TITLE + LINK)
58
+ user_state = gr.State()
59
+ organization_state = gr.State()
60
+
61
+ with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
62
+ with gr.TabItem("TRUEBench", elem_id="llm-benchmark-tab-table", id=2):
63
+ gr.HTML(INTRODUCTION_TEXT)
64
+
65
+ gr.HTML("""
66
+ <div class="dark-container" style="margin-bottom: 24px;">
67
+ <div class="section-header">
68
+ <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
69
+ Category Analysis
70
+ </h3>
71
+ </div>
72
+ <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">TRUEBench consists of 10 categories and 46 sub-categories which highly related to productivity assistants.</p>
73
+ """)
74
+ # --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
75
+ from src.about import CATEGORY_DESCRIPTIONS
76
+ gr.HTML(f"""
77
+ <style>
78
+ .category-box-grid {{
79
+ display: flex;
80
+ flex-direction: column;
81
+ gap: 18px;
82
+ margin: 18px 0;
83
+ }}
84
+ .category-box-row {{
85
+ display: flex;
86
+ gap: 18px;
87
+ }}
88
+ .category-box {{
89
+ background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
90
+ border-radius: 26px;
91
+ box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
92
+ color: #222 !important;
93
+ min-height: 140px;
94
+ flex: 1 1 0;
95
+ display: flex;
96
+ flex-direction: column;
97
+ align-items: flex-start;
98
+ padding: 18px 16px 12px 16px;
99
+ box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
100
+ font-size: 1.08rem;
101
+ color: #222 !important;
102
+ transition: box-shadow 0.2s;
103
+ position: relative;
104
+ overflow: hidden;
105
+ opacity: 1;
106
+ }}
107
+ .category-title {{
108
+ font-weight: 700;
109
+ font-size: 1.18rem;
110
+ margin-left: 8px;
111
+ vertical-align: middle;
112
+ color: #222 !important;
113
+ }}
114
+ .category-desc {{
115
+ margin-top: 12px;
116
+ font-size: 0.98rem;
117
+ color: #fff !important;
118
+ font-weight: 400;
119
+ min-height: 24px;
120
+ width: 100%;
121
+ line-height: 1.5;
122
+ letter-spacing: 0.01em;
123
+ }}
124
+ .category-box:hover {{
125
+ box-shadow: 0 0 24px #a5a1ff55, 0 4px 16px rgba(0,0,0,0.18);
126
+ }}
127
+ .category-title {{
128
+ font-weight: 700;
129
+ font-size: 1.18rem;
130
+ margin-left: 8px;
131
+ vertical-align: middle;
132
+ }}
133
+ .category-desc {{
134
+ margin-top: 12px;
135
+ font-size: 0.98rem;
136
+ color: #222 !important;
137
+ font-weight: 400;
138
+ min-height: 24px;
139
+ width: 100%;
140
+ line-height: 1.5;
141
+ letter-spacing: 0.01em;
142
+ }}
143
+ @media (prefers-color-scheme: dark) {{
144
+ .category-box .category-title {{
145
+ color: #f5f6f7 !important;
146
+ }}
147
+ }}
148
+ </style>
149
+ <div class='category-box-grid'>
150
+ <div class='category-box-row'>
151
+ <div class='category-box'><span class='category-title'>📝 Content Generation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Content Generation"]}</div></div>
152
+ <div class='category-box'><span class='category-title'>✂️ Editing</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Editing"]}</div></div>
153
+ <div class='category-box'><span class='category-title'>📊 Data Analysis</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Data Analysis"]}</div></div>
154
+ <div class='category-box'><span class='category-title'>🧠 Reasoning</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Reasoning"]}</div></div>
155
+ <div class='category-box'><span class='category-title'>🦄 Hallucination</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Hallucination"]}</div></div>
156
+ </div>
157
+ <div class='category-box-row'>
158
+ <div class='category-box'><span class='category-title'>🛡️ Safety</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Safety"]}</div></div>
159
+ <div class='category-box'><span class='category-title'>🔁 Repetition</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Repetition"]}</div></div>
160
+ <div class='category-box'><span class='category-title'>📝 Summarization</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Summarization"]}</div></div>
161
+ <div class='category-box'><span class='category-title'>🌐 Translation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Translation"]}</div></div>
162
+ <div class='category-box'><span class='category-title'>💬 Multi-Turn</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Multi-Turn"]}</div></div>
163
+ </div>
164
+ </div>
165
+ """)
166
+ df = get_dataframe_category()
167
+
168
+ gr.HTML("""
169
+ <style>
170
+ .leaderboard-container {
171
+ background: #fff;
172
+ }
173
+ @media (prefers-color-scheme: dark) {
174
+ .leaderboard-container {
175
+ background: #121212;
176
+ }
177
+ }
178
+ </style>
179
+ <div class="leaderboard-container">
180
+ """)
181
+ leaderboard_tab_cat = create_leaderboard_tab(
182
+ df,
183
+ "Category",
184
+ )
185
+ gr.HTML("</div>")
186
+
187
+
188
+ # --- Category Radar Chart Section ---
189
+ from vis_utils import load_leaderboard_data, create_domain_radar_chart
190
+ initial_df_cat = load_leaderboard_data()
191
+ # Top 5 models based on leaderboard (Average Accuracy)
192
+ if "Overall" in initial_df_cat.columns:
193
+ top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
194
+ else:
195
+ top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
196
+ gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center; width: 100%; max-width: 100%; margin: 0 auto; padding: 0;">')
197
+ # Radar chart model selector (up to 5)
198
+ from src.display.formatting import get_display_model_name
199
+ display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
200
+ original_names_cat = initial_df_cat['Model Name'].tolist()
201
+ display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
202
+ top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
203
+ model_selector_cat = gr.Dropdown(
204
+ choices=display_names_cat,
205
+ value=top5_display_names_cat,
206
+ multiselect=True,
207
+ label="🎯 Select Models for Radar Chart",
208
+ info="Choose up to 5 models to visualize",
209
+ elem_classes=["dropdown", "custom-dropdown"],
210
+ interactive=True,
211
+ filterable=True,
212
+ allow_custom_value=False
213
+ )
214
+ gr.HTML("""
215
+ <script>
216
+ document.querySelector('.custom-dropdown').addEventListener('change', function(e) {
217
+ if (this.value.length > 5) {
218
+ alert('You can select up to 5 models only');
219
+ this.value = this.value.slice(0, 5);
220
+ }
221
+ });
222
+ </script>
223
+ """)
224
+ radar_chart_cat = gr.Plot(
225
+ label="",
226
+ value=create_domain_radar_chart(
227
+ initial_df_cat,
228
+ "Average Accuracy",
229
+ top5_models_cat
230
+ ),
231
+ elem_classes=["radar-chart", "plot-container"]
232
+ )
233
+ gr.HTML('</div>')
234
+
235
+ # Update radar chart when model_selector_cat selection changes
236
def update_radar_chart_cat(selected_display_names):
    """Redraw the category radar chart for the models picked in the dropdown.

    Args:
        selected_display_names: display names chosen in the dropdown; may be
            None or empty when the user clears the selection.

    Returns:
        A Plotly figure from ``create_domain_radar_chart`` over fresh
        leaderboard data.

    Falls back to the current top-5 leaderboard models when nothing is
    selected, and silently drops names that have no mapping back to an
    original model name.
    """
    # `if not x` already covers both None and the empty list — the original
    # `or len(...) == 0` was redundant. Also avoid shadowing the outer `df`.
    if not selected_display_names:
        fallback_df = load_leaderboard_data()
        selected_display_names = [
            get_display_model_name(m) for m in fallback_df['Model Name'].tolist()[:5]
        ]
    selected_models = [
        display_to_original_cat[name]
        for name in selected_display_names
        if name in display_to_original_cat
    ]
    return create_domain_radar_chart(
        load_leaderboard_data(),
        "Average Accuracy",
        selected_models,
    )
247
+ model_selector_cat.change(
248
+ fn=update_radar_chart_cat,
249
+ inputs=model_selector_cat,
250
+ outputs=radar_chart_cat
251
+ )
252
+ # --- Med. Len. vs Overall Scatter Plot Section ---
253
+ from vis_utils import create_len_overall_scatter
254
+ import json
255
+ with open("src/data/length_data.json", "r") as f:
256
+ length_data = json.load(f)
257
+
258
+ # --- Create a Gradio State component to hold length_data ---
259
+ length_data_state = gr.State(value=length_data)
260
+ gr.HTML("""
261
+ <div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
262
+ <div class="section-header">
263
+ <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
264
+ Output Length vs. Category Score
265
+ </h3>
266
+ </div>
267
+ <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
268
+ Explore the relationship between median output length and model performance by category
269
+ </p>
270
+ """)
271
+
272
+
273
+ # Category selection buttons (HTML + Gradio Radio for event)
274
+ category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
275
+ # (cat-btn-radio related style block removed, now handled in custom_css)
276
+ category_selector = gr.Radio(
277
+ choices=category_columns,
278
+ value="Overall",
279
+ label="Select Category for Y-Axis",
280
+ elem_id="cat-btn-radio",
281
+ elem_classes=["cat-btn-radio"],
282
+ interactive=True,
283
+ show_label=False
284
+ )
285
+ x_axis_selector = gr.Radio(
286
+ choices=["Med. Len.", "Med. Resp. Len."],
287
+ value="Med. Len.",
288
+ label="Select X-Axis Data",
289
+ elem_id="x-axis-btn-radio",
290
+ elem_classes=["x-axis-btn-radio"],
291
+ interactive=True,
292
+ show_label=True
293
+ )
294
+ gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
295
+ scatter_plot_cat = gr.Plot(
296
+ label="",
297
+ value=create_len_overall_scatter(
298
+ load_leaderboard_data(),
299
+ y_col="Overall",
300
+ length_data=length_data,
301
+ x_axis_data_source=x_axis_selector.value
302
+ ),
303
+ elem_classes=["efficiency-chart", "plot-container"]
304
+ )
305
+ gr.HTML('</div>')
306
+ gr.HTML("</div>")
307
+
308
+ # Update plot when category or x-axis selection changes
309
def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
    """Rebuild the output-length vs. score scatter plot.

    Triggered whenever the category (y-axis) or length source (x-axis)
    radio selection changes; re-reads the leaderboard so the plot always
    reflects current data.
    """
    current_board = load_leaderboard_data()
    return create_len_overall_scatter(
        current_board,
        y_col=selected_category,
        length_data=current_length_data_state,
        x_axis_data_source=selected_x_source,
    )
316
+ category_selector.change(
317
+ fn=update_scatter_plot_cat,
318
+ inputs=[category_selector, x_axis_selector, length_data_state],
319
+ outputs=scatter_plot_cat
320
+ )
321
+ x_axis_selector.change(
322
+ fn=update_scatter_plot_cat,
323
+ inputs=[category_selector, x_axis_selector, length_data_state],
324
+ outputs=scatter_plot_cat
325
+ )
326
+
327
+ # When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to top-5
328
def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
    """Keep the category radar chart in sync with the leaderboard filters.

    Re-applies the leaderboard's unified filter and pushes its resulting
    top-5 models into both the model dropdown and the radar chart.
    """
    filtered = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
    top5 = filtered[2][:5]
    chart = create_domain_radar_chart(
        load_leaderboard_data(),
        "Average Accuracy",
        top5,
    )
    display_names = [get_display_model_name(m) for m in top5]
    return gr.update(value=display_names), chart
337
+
338
+ leaderboard_selectors_cat = [
339
+ leaderboard_tab_cat["type_selector"],
340
+ leaderboard_tab_cat["model_type_selector"],
341
+ leaderboard_tab_cat["think_selector"],
342
+ leaderboard_tab_cat["df_state"],
343
+ leaderboard_tab_cat["sort_col_dropdown"]
344
+ ]
345
+ for selector in leaderboard_selectors_cat:
346
+ selector.change(
347
+ fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
348
+ inputs=leaderboard_selectors_cat,
349
+ outputs=[model_selector_cat, radar_chart_cat]
350
+ )
351
+
352
+
353
+
354
+ gr.HTML("""
355
+ <div class="dark-container" style="margin-bottom: 24px;">
356
+ <div class="section-header">
357
+ <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
358
+ Language Analysis
359
+ </h3>
360
+ </div>
361
+ <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).</p>
362
+ """)
363
+ df = get_dataframe_language()
364
+
365
+ leaderboard_tab_lang = create_leaderboard_tab(
366
+ df,
367
+ "Language",
368
+ )
369
+
370
+ # --- Language Radar Chart Section ---
371
+
372
+ from vis_utils import load_leaderboard_language_data, create_language_radar_chart
373
+ initial_df_lang = load_leaderboard_language_data()
374
+
375
+ # Top 5 models based on leaderboard (Overall)
376
+ if "Overall" in initial_df_lang.columns:
377
+ top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
378
+ else:
379
+ top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
380
+
381
+ gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
382
+ # Add model selector
383
+ display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
384
+ original_names_lang = initial_df_lang['Model Name'].tolist()
385
+ display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
386
+ top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
387
+ model_selector_lang = gr.Dropdown(
388
+ choices=display_names_lang,
389
+ value=top5_display_names_lang,
390
+ multiselect=True,
391
+ label="🎯 Select Models for Radar Chart",
392
+ info="Choose up to 5 models to visualize",
393
+ elem_classes=["dropdown", "custom-dropdown"],
394
+ interactive=True,
395
+ filterable=True,
396
+ allow_custom_value=False
397
+ )
398
+ gr.HTML("""
399
+ <script>
400
+ document.querySelectorAll('.custom-dropdown')[1].addEventListener('change', function(e) {
401
+ if (this.value.length > 5) {
402
+ alert('You can select up to 5 models only');
403
+ this.value = this.value.slice(0, 5);
404
+ }
405
+ });
406
+ </script>
407
+ """)
408
+ radar_chart_lang = gr.Plot(
409
+ label="",
410
+ value=create_language_radar_chart(
411
+ initial_df_lang,
412
+ "Average Accuracy",
413
+ top5_models_lang
414
+ ),
415
+ elem_classes=["radar-chart", "plot-container"]
416
+ )
417
+ gr.HTML('</div>')
418
+
419
+ # Update radar chart when model_selector_lang selection changes
420
def update_radar_chart_lang(selected_display_names):
    """Redraw the language radar chart for the models picked in the dropdown.

    Args:
        selected_display_names: display names chosen in the dropdown; may be
            None or empty when the user clears the selection.

    Returns:
        A Plotly figure from ``create_language_radar_chart`` over fresh
        language-leaderboard data.

    Falls back to the current top-5 models when nothing is selected, and
    silently drops names with no known original-model mapping.
    """
    # `if not x` already covers both None and the empty list — the original
    # `or len(...) == 0` was redundant. Also avoid shadowing the outer `df`.
    if not selected_display_names:
        fallback_df = load_leaderboard_language_data()
        selected_display_names = [
            get_display_model_name(m) for m in fallback_df['Model Name'].tolist()[:5]
        ]
    selected_models = [
        display_to_original_lang[name]
        for name in selected_display_names
        if name in display_to_original_lang
    ]
    return create_language_radar_chart(
        load_leaderboard_language_data(),
        "Average Accuracy",
        selected_models,
    )
430
+ model_selector_lang.change(
431
+ fn=update_radar_chart_lang,
432
+ inputs=model_selector_lang,
433
+ outputs=radar_chart_lang
434
+ )
435
+
436
+ # When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to top-5
437
def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
    """Keep the language radar chart in sync with the leaderboard filters.

    Re-applies the leaderboard's unified filter and pushes its resulting
    top-5 models into both the model dropdown and the radar chart.
    """
    filtered = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
    top5 = filtered[2][:5]
    chart = create_language_radar_chart(
        load_leaderboard_language_data(),
        "Average Accuracy",
        top5,
    )
    display_names = [get_display_model_name(m) for m in top5]
    return gr.update(value=display_names), chart
445
+
446
+ leaderboard_selectors_lang = [
447
+ leaderboard_tab_lang["type_selector"],
448
+ leaderboard_tab_lang["model_type_selector"],
449
+ leaderboard_tab_lang["think_selector"],
450
+ leaderboard_tab_lang["df_state"],
451
+ leaderboard_tab_lang["sort_col_dropdown"]
452
+ ]
453
+
454
+ for selector in leaderboard_selectors_lang:
455
+ selector.change(
456
+ fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
457
+ inputs=leaderboard_selectors_lang,
458
+ outputs=[model_selector_lang, radar_chart_lang]
459
+ )
460
+
461
+
462
+
463
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
464
+ with gr.Column():
465
+ with gr.Row():
466
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
467
+
468
+ with gr.Row():
469
+ gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
470
+
471
+ with gr.Row():
472
+ gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
473
+
474
+ login_button = gr.LoginButton()
475
+
476
+ with gr.Row():
477
+ with gr.Column():
478
+ contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
479
+ model_name_textbox = gr.Textbox(label="Model Name")
480
+ model_type_dropdown = gr.Dropdown(
481
+ choices=["Instruct", "Think", "Hybrid"],
482
+ label="Model Type (Instruct, Think, or Hybrid)",
483
+ multiselect=False,
484
+ value="Instruct",
485
+ interactive=True,
486
+ )
487
+ think_type_dropdown = gr.Dropdown(
488
+ choices=["On", "Off"],
489
+ label="Think Mode (On/Off)",
490
+ multiselect=False,
491
+ value="Off",
492
+ interactive=False,
493
+ )
494
+ precision = gr.Dropdown(
495
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
496
+ label="Precision",
497
+ multiselect=False,
498
+ value="float16",
499
+ interactive=True,
500
+ )
501
+ # --- Dynamically control think_type based on model_type and connect event ---
502
def update_think_type(model_type_value):
    """Adjust the Think-mode dropdown to match the chosen model type.

    Instruct models never think (locked to "Off"), Think models always do
    (locked to "On"), and Hybrid models default to "On" but stay editable.
    """
    if model_type_value == "Instruct":
        return gr.update(value="Off", interactive=False)
    if model_type_value == "Think":
        return gr.update(value="On", interactive=False)
    # Hybrid: thinking on by default, but the user may toggle it.
    return gr.update(value="On", interactive=True)
509
+ model_type_dropdown.change(
510
+ fn=update_think_type,
511
+ inputs=model_type_dropdown,
512
+ outputs=think_type_dropdown
513
+ )
514
+ response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., </think>)")
515
+
516
+ with gr.Column():
517
+ yml_textbox_placeholder = """# vLLM serving parameters
518
+ # Reference: https://docs.vllm.ai/en/latest/cli/serve.html
519
+ llm_serve_args:
520
+ max_model_len:
521
+ tensor_parallel_size:
522
+ dtype:
523
+ ...
524
+ # OpenAI-compatible API (chat completion)
525
+ # Reference: https://platform.openai.com/docs/api-reference/chat
526
+ sampling_params:
527
+ top_p:
528
+ temperature:
529
+ presence_penalty:
530
+ ...
531
+ # vLLM sampling parameters
532
+ # Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
533
+ extra_body:
534
+ chat_template_kwargs:
535
+ enable_thinking:
536
+ ...
537
+ top_k:
538
+ repetition_penalty:
539
+ ..."""
540
+ yml_textbox = gr.Textbox(
541
+ label="Configuration (YAML format)",
542
+ elem_id="yml-textbox",
543
+ lines=7,
544
+ value=yml_textbox_placeholder
545
+ )
546
+ upbox = gr.File(
547
+ label="Upload configuration file as .yml or .yaml",
548
+ file_types=[".yml", ".yaml"],
549
+ type="filepath",
550
+ height=150
551
+ )
552
+ # Add Translate to JSON button below upbox
553
+ translate_button = gr.Button(
554
+ "Translate to JSON",
555
+ elem_id="translate-to-json-btn",
556
+ elem_classes=["translate-btn"],
557
+ scale=None
558
+ )
559
+ # Add custom style for the button
560
+ gr.HTML(
561
+ '''
562
+ <style>
563
+ #translate-to-json-btn, .translate-btn {
564
+ width: 100%;
565
+ min-height: 24px;
566
+ font-size: 1.1rem;
567
+ font-weight: 600;
568
+ background: linear-gradient(90deg, #6c63ff 60%, #a5a1ff 100%);
569
+ color: #fff;
570
+ border: none;
571
+ border-radius: 12px;
572
+ margin-top: 8px;
573
+ margin-bottom: 8px;
574
+ box-shadow: 0 2px 8px #6c63ff33;
575
+ transition: background 0.2s, box-shadow 0.2s;
576
+ }
577
+ #translate-to-json-btn:hover, .translate-btn:hover {
578
+ background: linear-gradient(90deg, #5a54d6 60%, #7e7bff 100%);
579
+ box-shadow: 0 4px 16px #6c63ff55;
580
+ }
581
+ </style>
582
+ '''
583
+ )
584
+ with gr.Column():
585
+ requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")
586
+
587
+ output_dict = gr.Code(label="Translated Python Dictionary", language="json")
588
+ submit_button = gr.Button("Submit Eval")
589
+ submission_result = gr.Markdown()
590
def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
    """Parse the submitted YAML configuration and render it as pretty JSON.

    Args:
        upbox_path: path of an uploaded .yml/.yaml file, or None/"" if absent.
        yml_textbox_value: raw YAML text typed into the textbox.

    Returns:
        A JSON string (indent=4, non-ASCII preserved) on success, a
        human-readable error message on failure (never raises, so the
        Gradio UI can display it), or "" when neither input is provided.

    The uploaded file takes precedence over the textbox.
    """
    import json

    def _render(raw_text, source_label, empty_message):
        # One shared parse/format path for both input sources.
        # yaml is imported lazily so the empty-input path has no dependency.
        import yaml
        try:
            data = yaml.safe_load(raw_text)
        except Exception as e:
            return f"Error parsing YAML {source_label}: {e}"
        if data is None:
            return empty_message
        return json.dumps(data, indent=4, ensure_ascii=False)

    if upbox_path:
        try:
            with open(upbox_path, "r", encoding="utf-8") as f:
                raw = f.read()
        except Exception as e:
            # Match the original message for any file-level failure.
            return f"Error parsing YAML file: {e}"
        return _render(raw, "file", "YAML file is empty.")
    if yml_textbox_value and yml_textbox_value.strip():
        return _render(yml_textbox_value, "textbox", "YAML textbox is empty or invalid.")
    return ""
611
+
612
+ event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
613
+ event.then(
614
+ add_new_eval_option,
615
+ [
616
+ contact_email,
617
+ model_name_textbox,
618
+ model_type_dropdown,
619
+ think_type_dropdown,
620
+ precision,
621
+ response_prefix_textbox,
622
+ requirements_textbox,
623
+ user_state,
624
+ organization_state,
625
+ yml_textbox,
626
+ upbox,
627
+ ],
628
+ submission_result,
629
+ ).then(
630
+ fn=parse_and_display_yaml_config,
631
+ inputs=[upbox, yml_textbox],
632
+ outputs=output_dict
633
+ )
634
+ translate_button.click(
635
+ fn=parse_and_display_yaml_config,
636
+ inputs=[upbox, yml_textbox],
637
+ outputs=output_dict
638
+ )
639
+
640
+ with gr.Row():
641
+ with gr.Accordion("📙 Citation", open=False):
642
+ citation_button = gr.Textbox(
643
+ value=CITATION_BUTTON_TEXT,
644
+ label=CITATION_BUTTON_LABEL,
645
+ lines=20,
646
+ elem_id="citation-button",
647
+ show_copy_button=True,
648
+ )
649
+
650
+
651
+ scheduler = BackgroundScheduler()
652
+ scheduler.add_job(restart_space, "interval", seconds=1800)
653
+ scheduler.start()
654
+ demo.queue(default_concurrency_limit=40).launch()
constants.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Leaderboard required columns (must always be included)
LEADERBOARD_REQUIRED_COLUMNS = [
    "Model Name", "Group", "Overall", "Med. Len.", "Med. Resp. Len.", "Type", "Model Type", "Think", "Rank"
]

# Length/size columns shared by both views — rendered as integers.
_MEASURE_COLS = ["Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"]

# Per-category score columns (category view only).
_CATEGORY_SCORE_COLS = [
    "Content Generation", "Editing", "Data Analysis", "Reasoning",
    "Hallucination", "Safety", "Repetition", "Summarization", "Translation", "Multi-Turn",
]

# Per-language score columns (language view only).
_LANGUAGE_SCORE_COLS = ["KO", "EN", "JA", "ZH", "PL", "DE", "PT", "ES", "FR", "IT", "RU", "VI"]

# Columns for number formatting (by category/language)
NUMERIC_COLS_CATEGORY = ["Overall", *_MEASURE_COLS, *_CATEGORY_SCORE_COLS]
NUMERIC_INT_COLS_CATEGORY = list(_MEASURE_COLS)
NUMERIC_FLOAT_COLS_CATEGORY = ["Overall", *_CATEGORY_SCORE_COLS]

NUMERIC_COLS_LANGUAGE = ["Overall", *_MEASURE_COLS, *_LANGUAGE_SCORE_COLS]
NUMERIC_INT_COLS_LANGUAGE = list(_MEASURE_COLS)
NUMERIC_FLOAT_COLS_LANGUAGE = ["Overall", *_LANGUAGE_SCORE_COLS]
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
APScheduler
black
datasets
gradio
gradio[oauth]
gradio_leaderboard==0.0.13
gradio_client
huggingface-hub>=0.18.0
matplotlib
numpy
pandas
python-dateutil
tqdm
transformers
tokenizers>=0.15.0
plotly
src/about.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static text and HTML fragments for the TRUEBench leaderboard UI (src/about.py).
# Fix: stripped git-diff artifacts ("+ " prefixes and interleaved line numbers) so
# the module is valid Python; also dropped the needless `f` prefix on
# LLM_BENCHMARKS_TEXT (f-string without placeholders, ruff F541) — value identical.
# Every runtime string is preserved byte-for-byte.

# Per-benchmark-category descriptions rendered as HTML snippets in the UI.
CATEGORY_DESCRIPTIONS = {
    "Content Generation": "<p>Evaluates the model's ability to produce diverse written outputs across professional and creative domains. This category measures adaptability to linguistic, stylistic, and formatting constraints, as well as the effectiveness of prompt engineering.</p> <b>🏷️Email 🏷️ReportDrafting</b>",
    "Editing": "<p>Evaluates refinement capabilities for optimizing given text. It focuses on queries related to rephrasing, revision, and correction, while preserving the rest of the content.</p> <b>🏷️QueryRephrase 🏷️DocumentRevision</b>",
    "Data Analysis": "<p>Measures proficiency in processing structured and unstructured data. This category includes tasks related to information extraction and data processing.</p> <b>🏷️JSONFormatted 🏷️TableQuery</b>",
    "Reasoning": "<p>Assesses logical problem-solving in coding, multiple-choice question answering, and mathematical operations. It also includes evaluation of rounding errors made by models in quantitative tasks.</p> <b>🏷️Logical 🏷️Mathematical</b>",
    "Hallucination": "<p>Detects limitations in generating plausible but inaccurate responses when faced with ambiguous queries, insufficient context, hypothetical scenarios, or challenges in document interpretation.</p> <b>🏷️InsufficientContext 🏷️FalseQueries</b>",
    "Safety": "<p>Verifies safeguards against harmful/inappropriate content. This category tests filtering of discriminatory, violent, or illegal material while upholding ethical standards.</p> <b>🏷️Illegal 🏷️Prejudice</b>",
    "Repetition": "<p>Evaluates consistency in producing iterative content variations while maintaining quality and relevance across outputs.</p> <b>🏷️Listing</b>",
    "Summarization": "<p>Measures ability to distill lengthy content into concise overviews preserving core concepts and eliminating redundancy. This category includes various constraints such as language, format, and output length.</p> <b>🏷️BulletPoints 🏷️N-lineSummary</b>",
    "Translation": "<p>Tests the ability to accurately translate diverse real-world contexts while adhering to target language and specified constraints. Our benchmark includes linguistic conditions in 12 languages, ensuring comprehensive multilingual evaluation.</p> <b>🏷️Document 🏷️Line-by-line</b>",
    "Multi-Turn": "<p>Assesses the model's ability to capture user intent in challenging scenarios where the context shifts or understanding of previous context is required.</p> <b>🏷️Consistency 🏷️Non-consistency</b>"
}

# Banner image hosted on the HF CDN, stretched to the container width.
banner_url = "https://cdn-uploads.huggingface.co/production/uploads/6805a7222cbcd604c2e89cab/GIEbCbyNn7PjWBFftEgNm.png"
BANNER = f'<div style="display: flex; justify-content: flex-start; width: 100%;"> <img src="{banner_url}" alt="Banner" style="width: 100%; height: auto; object-fit: contain;"> </div> '

# Right-aligned tagline shown at the top of the page.
TITLE = """<html>
<body>
<p style="margin: 0; text-align: right">Leaderboards by Samsung Research for LLM evaluation.</p>
</body>
</html>"""

# External links row (Samsung Research / GitHub / X / Discussion) + last-update date.
LINK = """
<h3 style="text-align: right; margin-top: 0;">
<span>✨</span>
<a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
<span>🌕</span>
<a href="https://github.com/samsung" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">GitHub</a> |
<span>🌎</span>
<a href="https://x.com/samsungresearch" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">X</a> |
<span>🌠</span>
<a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
<span>🔭</span> Updated: 2025-09-16
</h3>
"""

# Landing-page introduction (title, pitch, feature boxes, dataset link).
INTRODUCTION_TEXT = """
<div style="margin-bottom: 20px; text-align: center !important;">
<h2 style="padding-bottom: 5px !important; text-align: center !important; font-size: 2.6em !important; font-weight: 900 !important; margin-top: 0.2em !important; margin-bottom: 0.3em !important;">
🏆 TRUEBench: A Benchmark for Assessing LLMs as Human Job Productivity Assistants
</h2>
<p style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
TRUEBench (Trustworthy Real-world Usage Evaluation Benchmark) evaluates LLMs as productivity assistants. <br>
As LLMs become integral to tasks like report drafting and data analysis, existing benchmarks are suboptimal to capture real-world challenges. <br>
To address this gap, <strong>Samsung Research</strong> developed TRUEBench as a comprehensive evaluation framework for real-world LLM applications.
</p>
<p style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
TRUEBench is a benchmark designed to evaluate the instruction-following capabilities of LLMs, determining whether a response receives a Pass (1 point) or Fail (0 points) based on checklists. <br> This aligns with user satisfaction from the perspective of job productivity.
</p>
<h3 style="font-size: 2em; font-weight: 800; margin-top: 1.2em; margin-bottom: 0.5em; line-height: 1.3; letter-spacing: -0.01em;">
Main Features
</h3>
<div class="intro-feature-row">
<div class="intro-feature-box">
<div class="intro-feature-icon">📝</div>
<div class="intro-feature-title">2,400+ Productivity-Oriented User Inputs</div>
<div class="intro-feature-desc">A large-scale collection of complex, real-world user inputs designed to reflect productivity assistant scenarios.</div>
</div>
<div class="intro-feature-box">
<div class="intro-feature-icon">🌎</div>
<div class="intro-feature-title">Multilinguality in Real Tasks</div>
<div class="intro-feature-desc">Comprehensive 12-language coverage with intra-instance multilingual instructions.</div>
<div class="intro-feature-desc" style="font-style: italic; color: #888;">For multilingual aspects, it was created through local research institutes.</div>
</div>
<div class="intro-feature-box">
<div class="intro-feature-icon">🧩</div>
<div class="intro-feature-title">Beyond Explicit Constraints</div>
<div class="intro-feature-desc">Human-annotated implicit requirements validated by LLMs.</div>
</div>
<div class="intro-feature-box">
<div class="intro-feature-icon">🧭</div>
<div class="intro-feature-title">Dynamic Multi-Turn Contexts</div>
<div class="intro-feature-desc">Realistic dialogue flows with evolving constraints.</div>
</div>
</div>
<a class="intro-dataset-btn" href="https://huggingface.co/datasets/SamsungResearch/TRUEBench" target="_blank" rel="nofollow">
📂 Dataset Sample &rarr;
</a>
</div> """

# Alternate "Main Features" panel (checkmark list + dataset link).
MAIN_FEATURES_TEXT = """
<div style="padding: 10px; border-radius: 8px; margin-bottom: 20px;">
<h2 style="color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px;">✨ Main Features</h2>
<ul style="list-style-type: none; padding-left: 0;">
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
Input prompts across 12 languages
</li>
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
Intra-instance multilingual instructions
</li>
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
Rigorous evaluation criteria for explicit and implicit constraints
</li>
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
Complex multi-turn dialogue scenarios
</li>
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
LLM-validated constraints for reliable evaluation
</li>
</ul>
<div style="margin: 20px 0 10px 0;">
<a href="https://huggingface.co/datasets/SamsungResearch/TRUEBench"
style="color: #3498db;
text-decoration: underline;
font-size: 1.2em;
font-weight: bold;"
rel="nofollow"
target="_blank"
onmouseover="this.style.textDecoration='none'; this.style.color='#2c3e50'"
onmouseout="this.style.textDecoration='underline'; this.style.color='#3498db'">
📂 Dataset Sample →
</a>
</div>
</div>
"""

# Short methodology blurb. Was `f"""…"""` with no placeholders (ruff F541);
# a plain string literal is byte-identical at runtime.
LLM_BENCHMARKS_TEXT = """
## How it works
We utilize LLM Judge with human-crafted criteria to assess AI response.
"""

# Submission/evaluation policy markdown. Uses single-quoted triple string so the
# literal `{affiliation}/{model name}` braces stay as-is (not an f-string).
EVALUATION_QUEUE_TEXT = '''
<div style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">

## Submission Policy
- Submissions are limited to models that are registered on *HuggingFace Models*.
- Each model affiliation (individual or organization) may submit up to **3** times within **24** hours.
- The same model can only be submitted once per 24 hours.
- Duplicate submissions will be determined based on the full model name (i.e., {affiliation}/{model name}). Sampling parameters, dtype, etc. are not considered for duplicate checking.
- Submissions are only valid if the model's affiliation matches that of the submitter.
- If the same model is submitted multiple times, only the version with the highest overall score will be reflected on the leaderboard. (Note: A maximum of 3 submissions per model is allowed.)

**[NOTE]** Models with commercial licenses may be excluded from evaluation. We focus on evaluating non-commercial models, such as those under Apache-2.0 or MIT licenses. <br>
**[NOTE]** We use your user name (via **OAuthProfile**) and your list of registered organizations (via **OAuthToken**) solely to verify submission eligibility. **This information is never stored.**<br><br>

## Evaluation Environments
- Submitted models are run on our internal servers to generate inference outputs, which are then evaluated using an LLM judge.
- Models must be runnable on up to **32 H100 GPUs** to be eligible for submission.
- By default, we perform inference in the vLLM 0.10.1 environment. We recommend testing your model in this environment first. You may include additional requests in the requirements section in a free‑form manner, but please note that such requests could be rejected due to the constraints of inferencing environment.
- We serve the model based on vLLM and perform inference through the OpenAI-compatible API (chat completion).<br><br>

## Evaluation Rules
- It might take more than 1 week for submitted models' scores to appear on the leaderboard.
- The maximum generation length is limited to **64K** tokens.
- Please provide a valid contact email address in the submission form so we can send notifications related to evaluation.

**[CAUTION]** If inference fails or if inappropriate content is detected, the model might be excluded from evaluation.<br><br>

## Submission Rules
- For Think models, you must specify the sequence that separates the thinking process and the final response (e.g., &lt;/think&gt;) in the response_prefix field. We will use this prefix to extract the response for evaluation. (NOTE: Models that fail to provide a proper response prefix might be excluded from evaluation.)
- Referring to the configuration section of the submission form, provide the following in YAML format, either directly or via an uploaded `.yaml` file (if both are provided, the file takes priority):
- **Model serve arguments (llm_serve_args)**: vLLM-based model serving parameters ([Reference](https://docs.vllm.ai/en/latest/cli/serve.html))
- **Sampling parameters (sampling_params)**: Sampling parameters supported by the OpenAI API ([Reference](https://platform.openai.com/docs/api-reference/chat))
- **Extra body including chat template arguments (extra_body)**: `chat_template_kwargs` and sampling parameters supported by vLLM ([Reference](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters_1))
- Any additional specifications outside the configuration format should be written in the requirements section.

**[NOTE]** If you need to use two or more H100 GPUs, be sure to specify `tensor_parallel_size` within `llm_serve_args`.<br><br>
</div>
'''

# Step-by-step submission-form instructions (shown with the OAuth login flow).
EVALUATION_QUEUE_TEXT_OPTION1 = """
<div style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">

## Submission Form
1. Sign in using the log-in button below.
2. Fill the information including metadata, requirements, and configuration (fill the textbox or upload .yaml file).
3. Press "Submit Eval" button to submit.
"""

# Placeholder for an alternate submission flow (intentionally empty).
EVALUATION_QUEUE_TEXT_OPTION2 = """
"""


# Citation widget placeholders; BibTeX to be filled in later.
CITATION_BUTTON_LABEL = "To be updated"
CITATION_BUTTON_TEXT = r"""
"""
src/config.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Leaderboard view configuration for src/config.py.
# Fix: stripped git-diff artifacts ("+ " prefixes and interleaved line numbers)
# so the module is valid Python again; all values are unchanged.

# Columns shown by default in the per-language leaderboard view
# (model metadata followed by the 12 language score columns).
ON_LOAD_COLUMNS_LANG = [
    "Model Name",
    "Group",
    "Overall",
    "Med. Len.",
    "Med. Resp. Len.",
    "Parameter Size (B)",
    "Type",
    "Model Type",
    "Think",
    "KO",
    "EN",
    "JA",
    "ZH",
    "PL",
    "DE",
    "PT",
    "ES",
    "FR",
    "IT",
    "RU",
    "VI"
]

# Columns shown by default in the per-category leaderboard view
# (same metadata prefix, then the 10 benchmark category columns).
ON_LOAD_COLUMNS_CATEGORY = [
    "Model Name",
    "Group",
    "Overall",
    "Med. Len.",
    "Med. Resp. Len.",
    "Parameter Size (B)",
    "Type",
    "Model Type",
    "Think",
    "Content Generation",
    "Editing",
    "Data Analysis",
    "Reasoning",
    "Hallucination",
    "Safety",
    "Repetition",
    "Summarization",
    "Translation",
    "Multi-Turn"
]

# Metadata columns excluded when aggregating/plotting per-category scores.
CATEGORY_EXCLUDED_COLUMNS = [
    "Model Name",
    "Group",
    "Med. Len.",
    "Med. Resp. Len.",
    "Parameter Size (B)",
    "Type",
    "Model Type",
    "Think"
]

# Available column-grouping modes for the leaderboard UI.
COLUMN_GROUP_LIST = [
    "Category",
    "Language"
]
src/data/length_data.json ADDED
@@ -0,0 +1,1906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Claude 4.1 Opus (20250805) (think)": {
3
+ "Overall": {
4
+ "Min": -10,
5
+ "Max": -2,
6
+ "Med": -2.0,
7
+ "Med Resp": -1.0
8
+ },
9
+ "Content Generation": {
10
+ "Min": -2,
11
+ "Max": -2,
12
+ "Med": -2.0,
13
+ "Med Resp": -1.0
14
+ },
15
+ "Editing": {
16
+ "Min": -2,
17
+ "Max": -2,
18
+ "Med": -2.0,
19
+ "Med Resp": -1.0
20
+ },
21
+ "Data Analysis": {
22
+ "Min": -2,
23
+ "Max": -2,
24
+ "Med": -2.0,
25
+ "Med Resp": -1.0
26
+ },
27
+ "Reasoning": {
28
+ "Min": -2,
29
+ "Max": -2,
30
+ "Med": -2.0,
31
+ "Med Resp": -1.0
32
+ },
33
+ "Hallucination": {
34
+ "Min": -2,
35
+ "Max": -2,
36
+ "Med": -2.0,
37
+ "Med Resp": -1.0
38
+ },
39
+ "Safety": {
40
+ "Min": -2,
41
+ "Max": -2,
42
+ "Med": -2.0,
43
+ "Med Resp": -1.0
44
+ },
45
+ "Repetition": {
46
+ "Min": -2,
47
+ "Max": -2,
48
+ "Med": -2.0,
49
+ "Med Resp": -1.0
50
+ },
51
+ "Summarization": {
52
+ "Min": -2,
53
+ "Max": -2,
54
+ "Med": -2.0,
55
+ "Med Resp": -1.0
56
+ },
57
+ "Translation": {
58
+ "Min": -2,
59
+ "Max": -2,
60
+ "Med": -2.0,
61
+ "Med Resp": -1.0
62
+ },
63
+ "Multi-Turn": {
64
+ "Min": -10,
65
+ "Max": -4,
66
+ "Med": -6.0,
67
+ "Med Resp": -3.0
68
+ }
69
+ },
70
+ "EXAONE 4.0 32B (think)": {
71
+ "Overall": {
72
+ "Min": 37,
73
+ "Max": 142387,
74
+ "Med": 1274.5,
75
+ "Med Resp": 503.0
76
+ },
77
+ "Content Generation": {
78
+ "Min": 160,
79
+ "Max": 131068,
80
+ "Med": 1178.5,
81
+ "Med Resp": 559.0
82
+ },
83
+ "Editing": {
84
+ "Min": 37,
85
+ "Max": 10786,
86
+ "Med": 1041.0,
87
+ "Med Resp": 423.5
88
+ },
89
+ "Data Analysis": {
90
+ "Min": 229,
91
+ "Max": 131072,
92
+ "Med": 1412.0,
93
+ "Med Resp": 345.0
94
+ },
95
+ "Reasoning": {
96
+ "Min": 567,
97
+ "Max": 131076,
98
+ "Med": 3961.5,
99
+ "Med Resp": 585.5
100
+ },
101
+ "Hallucination": {
102
+ "Min": 298,
103
+ "Max": 65533,
104
+ "Med": 1247.5,
105
+ "Med Resp": 627.5
106
+ },
107
+ "Safety": {
108
+ "Min": 227,
109
+ "Max": 5093,
110
+ "Med": 1145.0,
111
+ "Med Resp": 589.0
112
+ },
113
+ "Repetition": {
114
+ "Min": 441,
115
+ "Max": 131072,
116
+ "Med": 1744.5,
117
+ "Med Resp": 579.5
118
+ },
119
+ "Summarization": {
120
+ "Min": 149,
121
+ "Max": 8423,
122
+ "Med": 693.5,
123
+ "Med Resp": 311.0
124
+ },
125
+ "Translation": {
126
+ "Min": 227,
127
+ "Max": 14234,
128
+ "Med": 915.0,
129
+ "Med Resp": 411.5
130
+ },
131
+ "Multi-Turn": {
132
+ "Min": 390,
133
+ "Max": 142387,
134
+ "Med": 3222.0,
135
+ "Med Resp": 1488.0
136
+ }
137
+ },
138
+ "DeepSeek V3.1 (think)": {
139
+ "Overall": {
140
+ "Min": 80,
141
+ "Max": 31147,
142
+ "Med": 710.5,
143
+ "Med Resp": 356.0
144
+ },
145
+ "Content Generation": {
146
+ "Min": 132,
147
+ "Max": 5354,
148
+ "Med": 776.5,
149
+ "Med Resp": 500.0
150
+ },
151
+ "Editing": {
152
+ "Min": 119,
153
+ "Max": 2063,
154
+ "Med": 571.0,
155
+ "Med Resp": 287.0
156
+ },
157
+ "Data Analysis": {
158
+ "Min": 119,
159
+ "Max": 13106,
160
+ "Med": 644.0,
161
+ "Med Resp": 218.0
162
+ },
163
+ "Reasoning": {
164
+ "Min": 259,
165
+ "Max": 31147,
166
+ "Med": 1340.5,
167
+ "Med Resp": 338.0
168
+ },
169
+ "Hallucination": {
170
+ "Min": 206,
171
+ "Max": 10356,
172
+ "Med": 1132.5,
173
+ "Med Resp": 667.0
174
+ },
175
+ "Safety": {
176
+ "Min": 80,
177
+ "Max": 3412,
178
+ "Med": 565.0,
179
+ "Med Resp": 206.0
180
+ },
181
+ "Repetition": {
182
+ "Min": 290,
183
+ "Max": 6553,
184
+ "Med": 826.5,
185
+ "Med Resp": 450.0
186
+ },
187
+ "Summarization": {
188
+ "Min": 148,
189
+ "Max": 1533,
190
+ "Med": 432.0,
191
+ "Med Resp": 211.5
192
+ },
193
+ "Translation": {
194
+ "Min": 147,
195
+ "Max": 7448,
196
+ "Med": 554.5,
197
+ "Med Resp": 320.0
198
+ },
199
+ "Multi-Turn": {
200
+ "Min": 324,
201
+ "Max": 7862,
202
+ "Med": 2558.5,
203
+ "Med Resp": 1545.0
204
+ }
205
+ },
206
+ "o4-mini": {
207
+ "Overall": {
208
+ "Min": -10,
209
+ "Max": -2,
210
+ "Med": -2.0,
211
+ "Med Resp": -1.0
212
+ },
213
+ "Content Generation": {
214
+ "Min": -2,
215
+ "Max": -2,
216
+ "Med": -2.0,
217
+ "Med Resp": -1.0
218
+ },
219
+ "Editing": {
220
+ "Min": -2,
221
+ "Max": -2,
222
+ "Med": -2.0,
223
+ "Med Resp": -1.0
224
+ },
225
+ "Data Analysis": {
226
+ "Min": -2,
227
+ "Max": -2,
228
+ "Med": -2.0,
229
+ "Med Resp": -1.0
230
+ },
231
+ "Reasoning": {
232
+ "Min": -2,
233
+ "Max": -2,
234
+ "Med": -2.0,
235
+ "Med Resp": -1.0
236
+ },
237
+ "Hallucination": {
238
+ "Min": -2,
239
+ "Max": -2,
240
+ "Med": -2.0,
241
+ "Med Resp": -1.0
242
+ },
243
+ "Safety": {
244
+ "Min": -2,
245
+ "Max": -2,
246
+ "Med": -2.0,
247
+ "Med Resp": -1.0
248
+ },
249
+ "Repetition": {
250
+ "Min": -2,
251
+ "Max": -2,
252
+ "Med": -2.0,
253
+ "Med Resp": -1.0
254
+ },
255
+ "Summarization": {
256
+ "Min": -2,
257
+ "Max": -2,
258
+ "Med": -2.0,
259
+ "Med Resp": -1.0
260
+ },
261
+ "Translation": {
262
+ "Min": -2,
263
+ "Max": -2,
264
+ "Med": -2.0,
265
+ "Med Resp": -1.0
266
+ },
267
+ "Multi-Turn": {
268
+ "Min": -10,
269
+ "Max": -4,
270
+ "Med": -6.0,
271
+ "Med Resp": -3.0
272
+ }
273
+ },
274
+ "Gemini 2.5 Flash": {
275
+ "Overall": {
276
+ "Min": -10,
277
+ "Max": -2,
278
+ "Med": -2.0,
279
+ "Med Resp": -1.0
280
+ },
281
+ "Content Generation": {
282
+ "Min": -2,
283
+ "Max": -2,
284
+ "Med": -2.0,
285
+ "Med Resp": -1.0
286
+ },
287
+ "Editing": {
288
+ "Min": -2,
289
+ "Max": -2,
290
+ "Med": -2.0,
291
+ "Med Resp": -1.0
292
+ },
293
+ "Data Analysis": {
294
+ "Min": -2,
295
+ "Max": -2,
296
+ "Med": -2.0,
297
+ "Med Resp": -1.0
298
+ },
299
+ "Reasoning": {
300
+ "Min": -2,
301
+ "Max": -2,
302
+ "Med": -2.0,
303
+ "Med Resp": -1.0
304
+ },
305
+ "Hallucination": {
306
+ "Min": -2,
307
+ "Max": -2,
308
+ "Med": -2.0,
309
+ "Med Resp": -1.0
310
+ },
311
+ "Safety": {
312
+ "Min": -2,
313
+ "Max": -2,
314
+ "Med": -2.0,
315
+ "Med Resp": -1.0
316
+ },
317
+ "Repetition": {
318
+ "Min": -2,
319
+ "Max": -2,
320
+ "Med": -2.0,
321
+ "Med Resp": -1.0
322
+ },
323
+ "Summarization": {
324
+ "Min": -2,
325
+ "Max": -2,
326
+ "Med": -2.0,
327
+ "Med Resp": -1.0
328
+ },
329
+ "Translation": {
330
+ "Min": -2,
331
+ "Max": -2,
332
+ "Med": -2.0,
333
+ "Med Resp": -1.0
334
+ },
335
+ "Multi-Turn": {
336
+ "Min": -10,
337
+ "Max": -4,
338
+ "Med": -6.0,
339
+ "Med Resp": -3.0
340
+ }
341
+ },
342
+ "Claude 4 Sonnet (20250514) (think)": {
343
+ "Overall": {
344
+ "Min": -10,
345
+ "Max": -2,
346
+ "Med": -2.0,
347
+ "Med Resp": -1.0
348
+ },
349
+ "Content Generation": {
350
+ "Min": -2,
351
+ "Max": -2,
352
+ "Med": -2.0,
353
+ "Med Resp": -1.0
354
+ },
355
+ "Editing": {
356
+ "Min": -2,
357
+ "Max": -2,
358
+ "Med": -2.0,
359
+ "Med Resp": -1.0
360
+ },
361
+ "Data Analysis": {
362
+ "Min": -2,
363
+ "Max": -2,
364
+ "Med": -2.0,
365
+ "Med Resp": -1.0
366
+ },
367
+ "Reasoning": {
368
+ "Min": -2,
369
+ "Max": -2,
370
+ "Med": -2.0,
371
+ "Med Resp": -1.0
372
+ },
373
+ "Hallucination": {
374
+ "Min": -2,
375
+ "Max": -2,
376
+ "Med": -2.0,
377
+ "Med Resp": -1.0
378
+ },
379
+ "Safety": {
380
+ "Min": -2,
381
+ "Max": -2,
382
+ "Med": -2.0,
383
+ "Med Resp": -1.0
384
+ },
385
+ "Repetition": {
386
+ "Min": -2,
387
+ "Max": -2,
388
+ "Med": -2.0,
389
+ "Med Resp": -1.0
390
+ },
391
+ "Summarization": {
392
+ "Min": -2,
393
+ "Max": -2,
394
+ "Med": -2.0,
395
+ "Med Resp": -1.0
396
+ },
397
+ "Translation": {
398
+ "Min": -2,
399
+ "Max": -2,
400
+ "Med": -2.0,
401
+ "Med Resp": -1.0
402
+ },
403
+ "Multi-Turn": {
404
+ "Min": -10,
405
+ "Max": -4,
406
+ "Med": -6.0,
407
+ "Med Resp": -3.0
408
+ }
409
+ },
410
+ "Solar Pro Preview (top_p:0.95, temp: 0.7)": {
411
+ "Overall": {
412
+ "Min": 1,
413
+ "Max": 4060,
414
+ "Med": 260.0,
415
+ "Med Resp": 260.0
416
+ },
417
+ "Content Generation": {
418
+ "Min": 15,
419
+ "Max": 3643,
420
+ "Med": 426.0,
421
+ "Med Resp": 426.0
422
+ },
423
+ "Editing": {
424
+ "Min": 14,
425
+ "Max": 3948,
426
+ "Med": 218.0,
427
+ "Med Resp": 218.0
428
+ },
429
+ "Data Analysis": {
430
+ "Min": 2,
431
+ "Max": 3500,
432
+ "Med": 89.0,
433
+ "Med Resp": 89.0
434
+ },
435
+ "Reasoning": {
436
+ "Min": 1,
437
+ "Max": 3338,
438
+ "Med": 190.5,
439
+ "Med Resp": 190.5
440
+ },
441
+ "Hallucination": {
442
+ "Min": 20,
443
+ "Max": 1093,
444
+ "Med": 128.5,
445
+ "Med Resp": 128.5
446
+ },
447
+ "Safety": {
448
+ "Min": 11,
449
+ "Max": 1507,
450
+ "Med": 92.0,
451
+ "Med Resp": 92.0
452
+ },
453
+ "Repetition": {
454
+ "Min": 34,
455
+ "Max": 4060,
456
+ "Med": 214.0,
457
+ "Med Resp": 214.0
458
+ },
459
+ "Summarization": {
460
+ "Min": 43,
461
+ "Max": 2478,
462
+ "Med": 218.0,
463
+ "Med Resp": 218.0
464
+ },
465
+ "Translation": {
466
+ "Min": 20,
467
+ "Max": 1711,
468
+ "Med": 360.0,
469
+ "Med Resp": 360.0
470
+ },
471
+ "Multi-Turn": {
472
+ "Min": 5,
473
+ "Max": 3353,
474
+ "Med": 530.0,
475
+ "Med Resp": 530.0
476
+ }
477
+ },
478
+ "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)": {
479
+ "Overall": {
480
+ "Min": 4,
481
+ "Max": 16917,
482
+ "Med": 1177.5,
483
+ "Med Resp": 554.0
484
+ },
485
+ "Content Generation": {
486
+ "Min": 389,
487
+ "Max": 7861,
488
+ "Med": 1261.5,
489
+ "Med Resp": 694.0
490
+ },
491
+ "Editing": {
492
+ "Min": 4,
493
+ "Max": 7611,
494
+ "Med": 1054.5,
495
+ "Med Resp": 517.5
496
+ },
497
+ "Data Analysis": {
498
+ "Min": 4,
499
+ "Max": 8191,
500
+ "Med": 1112.0,
501
+ "Med Resp": 355.0
502
+ },
503
+ "Reasoning": {
504
+ "Min": 4,
505
+ "Max": 12257,
506
+ "Med": 1913.0,
507
+ "Med Resp": 455.5
508
+ },
509
+ "Hallucination": {
510
+ "Min": 4,
511
+ "Max": 7390,
512
+ "Med": 1214.5,
513
+ "Med Resp": 682.0
514
+ },
515
+ "Safety": {
516
+ "Min": 227,
517
+ "Max": 6387,
518
+ "Med": 963.0,
519
+ "Med Resp": 568.0
520
+ },
521
+ "Repetition": {
522
+ "Min": 4,
523
+ "Max": 7787,
524
+ "Med": 1405.5,
525
+ "Med Resp": 646.5
526
+ },
527
+ "Summarization": {
528
+ "Min": 319,
529
+ "Max": 2613,
530
+ "Med": 711.5,
531
+ "Med Resp": 321.0
532
+ },
533
+ "Translation": {
534
+ "Min": 4,
535
+ "Max": 7687,
536
+ "Med": 1021.0,
537
+ "Med Resp": 561.5
538
+ },
539
+ "Multi-Turn": {
540
+ "Min": 448,
541
+ "Max": 16917,
542
+ "Med": 3418.5,
543
+ "Med Resp": 1874.0
544
+ }
545
+ },
546
+ "A.X 4.0": {
547
+ "Overall": {
548
+ "Min": 1,
549
+ "Max": 65581,
550
+ "Med": 412.5,
551
+ "Med Resp": 412.5
552
+ },
553
+ "Content Generation": {
554
+ "Min": 2,
555
+ "Max": 65581,
556
+ "Med": 543.0,
557
+ "Med Resp": 543.0
558
+ },
559
+ "Editing": {
560
+ "Min": 8,
561
+ "Max": 1791,
562
+ "Med": 250.0,
563
+ "Med Resp": 250.0
564
+ },
565
+ "Data Analysis": {
566
+ "Min": 1,
567
+ "Max": 65537,
568
+ "Med": 267.0,
569
+ "Med Resp": 267.0
570
+ },
571
+ "Reasoning": {
572
+ "Min": 2,
573
+ "Max": 2046,
574
+ "Med": 498.0,
575
+ "Med Resp": 498.0
576
+ },
577
+ "Hallucination": {
578
+ "Min": 12,
579
+ "Max": 2639,
580
+ "Med": 511.5,
581
+ "Med Resp": 511.5
582
+ },
583
+ "Safety": {
584
+ "Min": 4,
585
+ "Max": 2942,
586
+ "Med": 516.0,
587
+ "Med Resp": 516.0
588
+ },
589
+ "Repetition": {
590
+ "Min": 84,
591
+ "Max": 65536,
592
+ "Med": 341.5,
593
+ "Med Resp": 341.5
594
+ },
595
+ "Summarization": {
596
+ "Min": 26,
597
+ "Max": 2369,
598
+ "Med": 282.0,
599
+ "Med Resp": 282.0
600
+ },
601
+ "Translation": {
602
+ "Min": 7,
603
+ "Max": 35068,
604
+ "Med": 343.0,
605
+ "Med Resp": 343.0
606
+ },
607
+ "Multi-Turn": {
608
+ "Min": 3,
609
+ "Max": 9420,
610
+ "Med": 1455.0,
611
+ "Med Resp": 1455.0
612
+ }
613
+ },
614
+ "GPT-5 (Reasoning: medium)": {
615
+ "Overall": {
616
+ "Min": -10,
617
+ "Max": -2,
618
+ "Med": -2.0,
619
+ "Med Resp": -1.0
620
+ },
621
+ "Content Generation": {
622
+ "Min": -2,
623
+ "Max": -2,
624
+ "Med": -2.0,
625
+ "Med Resp": -1.0
626
+ },
627
+ "Editing": {
628
+ "Min": -2,
629
+ "Max": -2,
630
+ "Med": -2.0,
631
+ "Med Resp": -1.0
632
+ },
633
+ "Data Analysis": {
634
+ "Min": -2,
635
+ "Max": -2,
636
+ "Med": -2.0,
637
+ "Med Resp": -1.0
638
+ },
639
+ "Reasoning": {
640
+ "Min": -2,
641
+ "Max": -2,
642
+ "Med": -2.0,
643
+ "Med Resp": -1.0
644
+ },
645
+ "Hallucination": {
646
+ "Min": -2,
647
+ "Max": -2,
648
+ "Med": -2.0,
649
+ "Med Resp": -1.0
650
+ },
651
+ "Safety": {
652
+ "Min": -2,
653
+ "Max": -2,
654
+ "Med": -2.0,
655
+ "Med Resp": -1.0
656
+ },
657
+ "Repetition": {
658
+ "Min": -2,
659
+ "Max": -2,
660
+ "Med": -2.0,
661
+ "Med Resp": -1.0
662
+ },
663
+ "Summarization": {
664
+ "Min": -2,
665
+ "Max": -2,
666
+ "Med": -2.0,
667
+ "Med Resp": -1.0
668
+ },
669
+ "Translation": {
670
+ "Min": -2,
671
+ "Max": -2,
672
+ "Med": -2.0,
673
+ "Med Resp": -1.0
674
+ },
675
+ "Multi-Turn": {
676
+ "Min": -10,
677
+ "Max": -4,
678
+ "Med": -6.0,
679
+ "Med Resp": -3.0
680
+ }
681
+ },
682
+ "Kanana 1.5 15.7B A3B Instruct": {
683
+ "Overall": {
684
+ "Min": 1,
685
+ "Max": 34276,
686
+ "Med": 414.0,
687
+ "Med Resp": 414.0
688
+ },
689
+ "Content Generation": {
690
+ "Min": 10,
691
+ "Max": 22194,
692
+ "Med": 463.5,
693
+ "Med Resp": 463.5
694
+ },
695
+ "Editing": {
696
+ "Min": 5,
697
+ "Max": 1311,
698
+ "Med": 249.5,
699
+ "Med Resp": 249.5
700
+ },
701
+ "Data Analysis": {
702
+ "Min": 1,
703
+ "Max": 22211,
704
+ "Med": 396.0,
705
+ "Med Resp": 396.0
706
+ },
707
+ "Reasoning": {
708
+ "Min": 1,
709
+ "Max": 20275,
710
+ "Med": 581.0,
711
+ "Med Resp": 581.0
712
+ },
713
+ "Hallucination": {
714
+ "Min": 24,
715
+ "Max": 21645,
716
+ "Med": 441.5,
717
+ "Med Resp": 441.5
718
+ },
719
+ "Safety": {
720
+ "Min": 18,
721
+ "Max": 1531,
722
+ "Med": 414.0,
723
+ "Med Resp": 414.0
724
+ },
725
+ "Repetition": {
726
+ "Min": 76,
727
+ "Max": 1912,
728
+ "Med": 299.5,
729
+ "Med Resp": 299.5
730
+ },
731
+ "Summarization": {
732
+ "Min": 1,
733
+ "Max": 29578,
734
+ "Med": 275.5,
735
+ "Med Resp": 275.5
736
+ },
737
+ "Translation": {
738
+ "Min": 9,
739
+ "Max": 31839,
740
+ "Med": 308.5,
741
+ "Med Resp": 308.5
742
+ },
743
+ "Multi-Turn": {
744
+ "Min": 3,
745
+ "Max": 34276,
746
+ "Med": 1167.5,
747
+ "Med Resp": 1167.5
748
+ }
749
+ },
750
+ "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)": {
751
+ "Overall": {
752
+ "Min": 1,
753
+ "Max": 5178,
754
+ "Med": 408.0,
755
+ "Med Resp": 408.0
756
+ },
757
+ "Content Generation": {
758
+ "Min": 7,
759
+ "Max": 1974,
760
+ "Med": 439.5,
761
+ "Med Resp": 439.5
762
+ },
763
+ "Editing": {
764
+ "Min": 5,
765
+ "Max": 1192,
766
+ "Med": 293.0,
767
+ "Med Resp": 293.0
768
+ },
769
+ "Data Analysis": {
770
+ "Min": 1,
771
+ "Max": 3155,
772
+ "Med": 330.0,
773
+ "Med Resp": 330.0
774
+ },
775
+ "Reasoning": {
776
+ "Min": 63,
777
+ "Max": 5178,
778
+ "Med": 519.0,
779
+ "Med Resp": 519.0
780
+ },
781
+ "Hallucination": {
782
+ "Min": 57,
783
+ "Max": 1621,
784
+ "Med": 502.5,
785
+ "Med Resp": 502.5
786
+ },
787
+ "Safety": {
788
+ "Min": 12,
789
+ "Max": 1726,
790
+ "Med": 337.0,
791
+ "Med Resp": 337.0
792
+ },
793
+ "Repetition": {
794
+ "Min": 98,
795
+ "Max": 2754,
796
+ "Med": 406.5,
797
+ "Med Resp": 406.5
798
+ },
799
+ "Summarization": {
800
+ "Min": 32,
801
+ "Max": 959,
802
+ "Med": 251.0,
803
+ "Med Resp": 251.0
804
+ },
805
+ "Translation": {
806
+ "Min": 60,
807
+ "Max": 2197,
808
+ "Med": 351.5,
809
+ "Med Resp": 351.5
810
+ },
811
+ "Multi-Turn": {
812
+ "Min": 4,
813
+ "Max": 4959,
814
+ "Med": 1318.5,
815
+ "Med Resp": 1318.5
816
+ }
817
+ },
818
+ "GLM-4.5 FP8 (think)": {
819
+ "Overall": {
820
+ "Min": 75,
821
+ "Max": 65432,
822
+ "Med": 1442.0,
823
+ "Med Resp": 604.0
824
+ },
825
+ "Content Generation": {
826
+ "Min": 322,
827
+ "Max": 9320,
828
+ "Med": 1283.0,
829
+ "Med Resp": 655.5
830
+ },
831
+ "Editing": {
832
+ "Min": 232,
833
+ "Max": 10227,
834
+ "Med": 1163.5,
835
+ "Med Resp": 571.0
836
+ },
837
+ "Data Analysis": {
838
+ "Min": 318,
839
+ "Max": 15748,
840
+ "Med": 1328.0,
841
+ "Med Resp": 481.0
842
+ },
843
+ "Reasoning": {
844
+ "Min": 558,
845
+ "Max": 65432,
846
+ "Med": 3187.5,
847
+ "Med Resp": 653.0
848
+ },
849
+ "Hallucination": {
850
+ "Min": 75,
851
+ "Max": 10541,
852
+ "Med": 1546.5,
853
+ "Med Resp": 962.5
854
+ },
855
+ "Safety": {
856
+ "Min": 159,
857
+ "Max": 5552,
858
+ "Med": 1418.0,
859
+ "Med Resp": 808.0
860
+ },
861
+ "Repetition": {
862
+ "Min": 284,
863
+ "Max": 65409,
864
+ "Med": 1492.0,
865
+ "Med Resp": 729.5
866
+ },
867
+ "Summarization": {
868
+ "Min": 242,
869
+ "Max": 3610,
870
+ "Med": 688.5,
871
+ "Med Resp": 268.0
872
+ },
873
+ "Translation": {
874
+ "Min": 156,
875
+ "Max": 10043,
876
+ "Med": 1448.5,
877
+ "Med Resp": 414.0
878
+ },
879
+ "Multi-Turn": {
880
+ "Min": 630,
881
+ "Max": 15831,
882
+ "Med": 3977.5,
883
+ "Med Resp": 2277.5
884
+ }
885
+ },
886
+ "Gauss2.3 Hybrid": {
887
+ "Overall": {
888
+ "Min": 7,
889
+ "Max": 134423,
890
+ "Med": 546.0,
891
+ "Med Resp": 308.0
892
+ },
893
+ "Content Generation": {
894
+ "Min": 16,
895
+ "Max": 6706,
896
+ "Med": 470.0,
897
+ "Med Resp": 416.5
898
+ },
899
+ "Editing": {
900
+ "Min": 9,
901
+ "Max": 2943,
902
+ "Med": 219.0,
903
+ "Med Resp": 188.5
904
+ },
905
+ "Data Analysis": {
906
+ "Min": 23,
907
+ "Max": 131072,
908
+ "Med": 585.0,
909
+ "Med Resp": 192.0
910
+ },
911
+ "Reasoning": {
912
+ "Min": 329,
913
+ "Max": 131072,
914
+ "Med": 2091.0,
915
+ "Med Resp": 387.0
916
+ },
917
+ "Hallucination": {
918
+ "Min": 20,
919
+ "Max": 131072,
920
+ "Med": 972.5,
921
+ "Med Resp": 387.0
922
+ },
923
+ "Safety": {
924
+ "Min": 20,
925
+ "Max": 131072,
926
+ "Med": 603.0,
927
+ "Med Resp": 270.0
928
+ },
929
+ "Repetition": {
930
+ "Min": 60,
931
+ "Max": 131085,
932
+ "Med": 869.5,
933
+ "Med Resp": 392.0
934
+ },
935
+ "Summarization": {
936
+ "Min": 26,
937
+ "Max": 2114,
938
+ "Med": 320.0,
939
+ "Med Resp": 208.0
940
+ },
941
+ "Translation": {
942
+ "Min": 7,
943
+ "Max": 71270,
944
+ "Med": 322.0,
945
+ "Med Resp": 273.0
946
+ },
947
+ "Multi-Turn": {
948
+ "Min": 7,
949
+ "Max": 134423,
950
+ "Med": 2478.5,
951
+ "Med Resp": 1208.5
952
+ }
953
+ },
954
+ "gpt-oss-120B (Reasoning: medium)": {
955
+ "Overall": {
956
+ "Min": 43,
957
+ "Max": 18693,
958
+ "Med": 759.5,
959
+ "Med Resp": 370.5
960
+ },
961
+ "Content Generation": {
962
+ "Min": 126,
963
+ "Max": 6264,
964
+ "Med": 897.0,
965
+ "Med Resp": 613.5
966
+ },
967
+ "Editing": {
968
+ "Min": 61,
969
+ "Max": 4605,
970
+ "Med": 475.5,
971
+ "Med Resp": 248.5
972
+ },
973
+ "Data Analysis": {
974
+ "Min": 49,
975
+ "Max": 6975,
976
+ "Med": 596.0,
977
+ "Med Resp": 213.0
978
+ },
979
+ "Reasoning": {
980
+ "Min": 147,
981
+ "Max": 10387,
982
+ "Med": 1170.5,
983
+ "Med Resp": 635.0
984
+ },
985
+ "Hallucination": {
986
+ "Min": 88,
987
+ "Max": 5277,
988
+ "Med": 1317.0,
989
+ "Med Resp": 1106.5
990
+ },
991
+ "Safety": {
992
+ "Min": 43,
993
+ "Max": 3651,
994
+ "Med": 199.0,
995
+ "Med Resp": 12.0
996
+ },
997
+ "Repetition": {
998
+ "Min": 122,
999
+ "Max": 6986,
1000
+ "Med": 940.0,
1001
+ "Med Resp": 407.0
1002
+ },
1003
+ "Summarization": {
1004
+ "Min": 83,
1005
+ "Max": 15231,
1006
+ "Med": 378.0,
1007
+ "Med Resp": 246.0
1008
+ },
1009
+ "Translation": {
1010
+ "Min": 107,
1011
+ "Max": 3659,
1012
+ "Med": 737.0,
1013
+ "Med Resp": 299.5
1014
+ },
1015
+ "Multi-Turn": {
1016
+ "Min": 135,
1017
+ "Max": 18693,
1018
+ "Med": 2826.0,
1019
+ "Med Resp": 2150.0
1020
+ }
1021
+ },
1022
+ "Qwen3 32B (think)": {
1023
+ "Overall": {
1024
+ "Min": 164,
1025
+ "Max": 34272,
1026
+ "Med": 1113.0,
1027
+ "Med Resp": 390.0
1028
+ },
1029
+ "Content Generation": {
1030
+ "Min": 164,
1031
+ "Max": 32768,
1032
+ "Med": 1027.5,
1033
+ "Med Resp": 476.0
1034
+ },
1035
+ "Editing": {
1036
+ "Min": 285,
1037
+ "Max": 3646,
1038
+ "Med": 843.0,
1039
+ "Med Resp": 283.0
1040
+ },
1041
+ "Data Analysis": {
1042
+ "Min": 210,
1043
+ "Max": 18774,
1044
+ "Med": 968.0,
1045
+ "Med Resp": 278.0
1046
+ },
1047
+ "Reasoning": {
1048
+ "Min": 477,
1049
+ "Max": 18676,
1050
+ "Med": 1759.0,
1051
+ "Med Resp": 459.0
1052
+ },
1053
+ "Hallucination": {
1054
+ "Min": 170,
1055
+ "Max": 3776,
1056
+ "Med": 1617.0,
1057
+ "Med Resp": 646.0
1058
+ },
1059
+ "Safety": {
1060
+ "Min": 169,
1061
+ "Max": 4053,
1062
+ "Med": 940.0,
1063
+ "Med Resp": 429.0
1064
+ },
1065
+ "Repetition": {
1066
+ "Min": 608,
1067
+ "Max": 32768,
1068
+ "Med": 2316.5,
1069
+ "Med Resp": 537.5
1070
+ },
1071
+ "Summarization": {
1072
+ "Min": 192,
1073
+ "Max": 2255,
1074
+ "Med": 586.0,
1075
+ "Med Resp": 236.5
1076
+ },
1077
+ "Translation": {
1078
+ "Min": 374,
1079
+ "Max": 10683,
1080
+ "Med": 1113.5,
1081
+ "Med Resp": 307.0
1082
+ },
1083
+ "Multi-Turn": {
1084
+ "Min": 493,
1085
+ "Max": 34272,
1086
+ "Med": 3210.0,
1087
+ "Med Resp": 1481.0
1088
+ }
1089
+ },
1090
+ "Qwen3 235B A22B Instruct 2507": {
1091
+ "Overall": {
1092
+ "Min": 1,
1093
+ "Max": 65405,
1094
+ "Med": 433.0,
1095
+ "Med Resp": 433.0
1096
+ },
1097
+ "Content Generation": {
1098
+ "Min": 7,
1099
+ "Max": 4604,
1100
+ "Med": 492.5,
1101
+ "Med Resp": 492.5
1102
+ },
1103
+ "Editing": {
1104
+ "Min": 6,
1105
+ "Max": 2067,
1106
+ "Med": 248.5,
1107
+ "Med Resp": 248.5
1108
+ },
1109
+ "Data Analysis": {
1110
+ "Min": 1,
1111
+ "Max": 5119,
1112
+ "Med": 357.0,
1113
+ "Med Resp": 357.0
1114
+ },
1115
+ "Reasoning": {
1116
+ "Min": 1,
1117
+ "Max": 11933,
1118
+ "Med": 730.5,
1119
+ "Med Resp": 730.5
1120
+ },
1121
+ "Hallucination": {
1122
+ "Min": 38,
1123
+ "Max": 2395,
1124
+ "Med": 630.0,
1125
+ "Med Resp": 630.0
1126
+ },
1127
+ "Safety": {
1128
+ "Min": 12,
1129
+ "Max": 2497,
1130
+ "Med": 352.0,
1131
+ "Med Resp": 352.0
1132
+ },
1133
+ "Repetition": {
1134
+ "Min": 73,
1135
+ "Max": 65405,
1136
+ "Med": 468.5,
1137
+ "Med Resp": 468.5
1138
+ },
1139
+ "Summarization": {
1140
+ "Min": 24,
1141
+ "Max": 1899,
1142
+ "Med": 249.0,
1143
+ "Med Resp": 249.0
1144
+ },
1145
+ "Translation": {
1146
+ "Min": 10,
1147
+ "Max": 64183,
1148
+ "Med": 299.0,
1149
+ "Med Resp": 299.0
1150
+ },
1151
+ "Multi-Turn": {
1152
+ "Min": 3,
1153
+ "Max": 8009,
1154
+ "Med": 1728.5,
1155
+ "Med Resp": 1728.5
1156
+ }
1157
+ },
1158
+ "Claude 4 Opus (20250514) (think)": {
1159
+ "Overall": {
1160
+ "Min": -10,
1161
+ "Max": -2,
1162
+ "Med": -2.0,
1163
+ "Med Resp": -1.0
1164
+ },
1165
+ "Content Generation": {
1166
+ "Min": -2,
1167
+ "Max": -2,
1168
+ "Med": -2.0,
1169
+ "Med Resp": -1.0
1170
+ },
1171
+ "Editing": {
1172
+ "Min": -2,
1173
+ "Max": -2,
1174
+ "Med": -2.0,
1175
+ "Med Resp": -1.0
1176
+ },
1177
+ "Data Analysis": {
1178
+ "Min": -2,
1179
+ "Max": -2,
1180
+ "Med": -2.0,
1181
+ "Med Resp": -1.0
1182
+ },
1183
+ "Reasoning": {
1184
+ "Min": -2,
1185
+ "Max": -2,
1186
+ "Med": -2.0,
1187
+ "Med Resp": -1.0
1188
+ },
1189
+ "Hallucination": {
1190
+ "Min": -2,
1191
+ "Max": -2,
1192
+ "Med": -2.0,
1193
+ "Med Resp": -1.0
1194
+ },
1195
+ "Safety": {
1196
+ "Min": -2,
1197
+ "Max": -2,
1198
+ "Med": -2.0,
1199
+ "Med Resp": -1.0
1200
+ },
1201
+ "Repetition": {
1202
+ "Min": -2,
1203
+ "Max": -2,
1204
+ "Med": -2.0,
1205
+ "Med Resp": -1.0
1206
+ },
1207
+ "Summarization": {
1208
+ "Min": -2,
1209
+ "Max": -2,
1210
+ "Med": -2.0,
1211
+ "Med Resp": -1.0
1212
+ },
1213
+ "Translation": {
1214
+ "Min": -2,
1215
+ "Max": -2,
1216
+ "Med": -2.0,
1217
+ "Med Resp": -1.0
1218
+ },
1219
+ "Multi-Turn": {
1220
+ "Min": -10,
1221
+ "Max": -4,
1222
+ "Med": -6.0,
1223
+ "Med Resp": -3.0
1224
+ }
1225
+ },
1226
+ "Gemini 2.5 Pro": {
1227
+ "Overall": {
1228
+ "Min": -10,
1229
+ "Max": -2,
1230
+ "Med": -2.0,
1231
+ "Med Resp": -1.0
1232
+ },
1233
+ "Content Generation": {
1234
+ "Min": -2,
1235
+ "Max": -2,
1236
+ "Med": -2.0,
1237
+ "Med Resp": -1.0
1238
+ },
1239
+ "Editing": {
1240
+ "Min": -2,
1241
+ "Max": -2,
1242
+ "Med": -2.0,
1243
+ "Med Resp": -1.0
1244
+ },
1245
+ "Data Analysis": {
1246
+ "Min": -2,
1247
+ "Max": -2,
1248
+ "Med": -2.0,
1249
+ "Med Resp": -1.0
1250
+ },
1251
+ "Reasoning": {
1252
+ "Min": -2,
1253
+ "Max": -2,
1254
+ "Med": -2.0,
1255
+ "Med Resp": -1.0
1256
+ },
1257
+ "Hallucination": {
1258
+ "Min": -2,
1259
+ "Max": -2,
1260
+ "Med": -2.0,
1261
+ "Med Resp": -1.0
1262
+ },
1263
+ "Safety": {
1264
+ "Min": -2,
1265
+ "Max": -2,
1266
+ "Med": -2.0,
1267
+ "Med Resp": -1.0
1268
+ },
1269
+ "Repetition": {
1270
+ "Min": -2,
1271
+ "Max": -2,
1272
+ "Med": -2.0,
1273
+ "Med Resp": -1.0
1274
+ },
1275
+ "Summarization": {
1276
+ "Min": -2,
1277
+ "Max": -2,
1278
+ "Med": -2.0,
1279
+ "Med Resp": -1.0
1280
+ },
1281
+ "Translation": {
1282
+ "Min": -2,
1283
+ "Max": -2,
1284
+ "Med": -2.0,
1285
+ "Med Resp": -1.0
1286
+ },
1287
+ "Multi-Turn": {
1288
+ "Min": -10,
1289
+ "Max": -4,
1290
+ "Med": -6.0,
1291
+ "Med Resp": -3.0
1292
+ }
1293
+ },
1294
+ "GPT-5 mini (Reasoning: medium)": {
1295
+ "Overall": {
1296
+ "Min": -10,
1297
+ "Max": -2,
1298
+ "Med": -2.0,
1299
+ "Med Resp": -1.0
1300
+ },
1301
+ "Content Generation": {
1302
+ "Min": -2,
1303
+ "Max": -2,
1304
+ "Med": -2.0,
1305
+ "Med Resp": -1.0
1306
+ },
1307
+ "Editing": {
1308
+ "Min": -2,
1309
+ "Max": -2,
1310
+ "Med": -2.0,
1311
+ "Med Resp": -1.0
1312
+ },
1313
+ "Data Analysis": {
1314
+ "Min": -2,
1315
+ "Max": -2,
1316
+ "Med": -2.0,
1317
+ "Med Resp": -1.0
1318
+ },
1319
+ "Reasoning": {
1320
+ "Min": -2,
1321
+ "Max": -2,
1322
+ "Med": -2.0,
1323
+ "Med Resp": -1.0
1324
+ },
1325
+ "Hallucination": {
1326
+ "Min": -2,
1327
+ "Max": -2,
1328
+ "Med": -2.0,
1329
+ "Med Resp": -1.0
1330
+ },
1331
+ "Safety": {
1332
+ "Min": -2,
1333
+ "Max": -2,
1334
+ "Med": -2.0,
1335
+ "Med Resp": -1.0
1336
+ },
1337
+ "Repetition": {
1338
+ "Min": -2,
1339
+ "Max": -2,
1340
+ "Med": -2.0,
1341
+ "Med Resp": -1.0
1342
+ },
1343
+ "Summarization": {
1344
+ "Min": -2,
1345
+ "Max": -2,
1346
+ "Med": -2.0,
1347
+ "Med Resp": -1.0
1348
+ },
1349
+ "Translation": {
1350
+ "Min": -2,
1351
+ "Max": -2,
1352
+ "Med": -2.0,
1353
+ "Med Resp": -1.0
1354
+ },
1355
+ "Multi-Turn": {
1356
+ "Min": -10,
1357
+ "Max": -4,
1358
+ "Med": -6.0,
1359
+ "Med Resp": -3.0
1360
+ }
1361
+ },
1362
+ "GPT-5 nano (Reasoning: medium)": {
1363
+ "Overall": {
1364
+ "Min": -10,
1365
+ "Max": -2,
1366
+ "Med": -2.0,
1367
+ "Med Resp": -1.0
1368
+ },
1369
+ "Content Generation": {
1370
+ "Min": -2,
1371
+ "Max": -2,
1372
+ "Med": -2.0,
1373
+ "Med Resp": -1.0
1374
+ },
1375
+ "Editing": {
1376
+ "Min": -2,
1377
+ "Max": -2,
1378
+ "Med": -2.0,
1379
+ "Med Resp": -1.0
1380
+ },
1381
+ "Data Analysis": {
1382
+ "Min": -2,
1383
+ "Max": -2,
1384
+ "Med": -2.0,
1385
+ "Med Resp": -1.0
1386
+ },
1387
+ "Reasoning": {
1388
+ "Min": -2,
1389
+ "Max": -2,
1390
+ "Med": -2.0,
1391
+ "Med Resp": -1.0
1392
+ },
1393
+ "Hallucination": {
1394
+ "Min": -2,
1395
+ "Max": -2,
1396
+ "Med": -2.0,
1397
+ "Med Resp": -1.0
1398
+ },
1399
+ "Safety": {
1400
+ "Min": -2,
1401
+ "Max": -2,
1402
+ "Med": -2.0,
1403
+ "Med Resp": -1.0
1404
+ },
1405
+ "Repetition": {
1406
+ "Min": -2,
1407
+ "Max": -2,
1408
+ "Med": -2.0,
1409
+ "Med Resp": -1.0
1410
+ },
1411
+ "Summarization": {
1412
+ "Min": -2,
1413
+ "Max": -2,
1414
+ "Med": -2.0,
1415
+ "Med Resp": -1.0
1416
+ },
1417
+ "Translation": {
1418
+ "Min": -2,
1419
+ "Max": -2,
1420
+ "Med": -2.0,
1421
+ "Med Resp": -1.0
1422
+ },
1423
+ "Multi-Turn": {
1424
+ "Min": -10,
1425
+ "Max": -4,
1426
+ "Med": -6.0,
1427
+ "Med Resp": -3.0
1428
+ }
1429
+ },
1430
+ "gpt-oss-20B (Reasoning: medium)": {
1431
+ "Overall": {
1432
+ "Min": 32,
1433
+ "Max": 18763,
1434
+ "Med": 953.5,
1435
+ "Med Resp": 326.0
1436
+ },
1437
+ "Content Generation": {
1438
+ "Min": 126,
1439
+ "Max": 6343,
1440
+ "Med": 983.5,
1441
+ "Med Resp": 486.5
1442
+ },
1443
+ "Editing": {
1444
+ "Min": 107,
1445
+ "Max": 7213,
1446
+ "Med": 667.0,
1447
+ "Med Resp": 195.0
1448
+ },
1449
+ "Data Analysis": {
1450
+ "Min": 94,
1451
+ "Max": 14599,
1452
+ "Med": 750.0,
1453
+ "Med Resp": 192.0
1454
+ },
1455
+ "Reasoning": {
1456
+ "Min": 109,
1457
+ "Max": 18763,
1458
+ "Med": 1290.5,
1459
+ "Med Resp": 475.5
1460
+ },
1461
+ "Hallucination": {
1462
+ "Min": 132,
1463
+ "Max": 7937,
1464
+ "Med": 1493.5,
1465
+ "Med Resp": 620.5
1466
+ },
1467
+ "Safety": {
1468
+ "Min": 32,
1469
+ "Max": 6678,
1470
+ "Med": 268.0,
1471
+ "Med Resp": 12.0
1472
+ },
1473
+ "Repetition": {
1474
+ "Min": 258,
1475
+ "Max": 17217,
1476
+ "Med": 1847.0,
1477
+ "Med Resp": 332.5
1478
+ },
1479
+ "Summarization": {
1480
+ "Min": 99,
1481
+ "Max": 4060,
1482
+ "Med": 438.5,
1483
+ "Med Resp": 219.0
1484
+ },
1485
+ "Translation": {
1486
+ "Min": 133,
1487
+ "Max": 10446,
1488
+ "Med": 1028.5,
1489
+ "Med Resp": 290.0
1490
+ },
1491
+ "Multi-Turn": {
1492
+ "Min": 102,
1493
+ "Max": 14863,
1494
+ "Med": 2483.0,
1495
+ "Med Resp": 1514.0
1496
+ }
1497
+ },
1498
+ "o3-pro (Reasoning: medium)": {
1499
+ "Overall": {
1500
+ "Min": -10,
1501
+ "Max": -2,
1502
+ "Med": -2.0,
1503
+ "Med Resp": -1.0
1504
+ },
1505
+ "Content Generation": {
1506
+ "Min": -2,
1507
+ "Max": -2,
1508
+ "Med": -2.0,
1509
+ "Med Resp": -1.0
1510
+ },
1511
+ "Editing": {
1512
+ "Min": -2,
1513
+ "Max": -2,
1514
+ "Med": -2.0,
1515
+ "Med Resp": -1.0
1516
+ },
1517
+ "Data Analysis": {
1518
+ "Min": -2,
1519
+ "Max": -2,
1520
+ "Med": -2.0,
1521
+ "Med Resp": -1.0
1522
+ },
1523
+ "Reasoning": {
1524
+ "Min": -2,
1525
+ "Max": -2,
1526
+ "Med": -2.0,
1527
+ "Med Resp": -1.0
1528
+ },
1529
+ "Hallucination": {
1530
+ "Min": -2,
1531
+ "Max": -2,
1532
+ "Med": -2.0,
1533
+ "Med Resp": -1.0
1534
+ },
1535
+ "Safety": {
1536
+ "Min": -2,
1537
+ "Max": -2,
1538
+ "Med": -2.0,
1539
+ "Med Resp": -1.0
1540
+ },
1541
+ "Repetition": {
1542
+ "Min": -2,
1543
+ "Max": -2,
1544
+ "Med": -2.0,
1545
+ "Med Resp": -1.0
1546
+ },
1547
+ "Summarization": {
1548
+ "Min": -2,
1549
+ "Max": -2,
1550
+ "Med": -2.0,
1551
+ "Med Resp": -1.0
1552
+ },
1553
+ "Translation": {
1554
+ "Min": -2,
1555
+ "Max": -2,
1556
+ "Med": -2.0,
1557
+ "Med Resp": -1.0
1558
+ },
1559
+ "Multi-Turn": {
1560
+ "Min": -10,
1561
+ "Max": -4,
1562
+ "Med": -6.0,
1563
+ "Med Resp": -3.0
1564
+ }
1565
+ },
1566
+ "Grok-4": {
1567
+ "Overall": {
1568
+ "Min": -10,
1569
+ "Max": -2,
1570
+ "Med": -2.0,
1571
+ "Med Resp": -1.0
1572
+ },
1573
+ "Content Generation": {
1574
+ "Min": -2,
1575
+ "Max": -2,
1576
+ "Med": -2.0,
1577
+ "Med Resp": -1.0
1578
+ },
1579
+ "Editing": {
1580
+ "Min": -2,
1581
+ "Max": -2,
1582
+ "Med": -2.0,
1583
+ "Med Resp": -1.0
1584
+ },
1585
+ "Data Analysis": {
1586
+ "Min": -2,
1587
+ "Max": -2,
1588
+ "Med": -2.0,
1589
+ "Med Resp": -1.0
1590
+ },
1591
+ "Reasoning": {
1592
+ "Min": -2,
1593
+ "Max": -2,
1594
+ "Med": -2.0,
1595
+ "Med Resp": -1.0
1596
+ },
1597
+ "Hallucination": {
1598
+ "Min": -2,
1599
+ "Max": -2,
1600
+ "Med": -2.0,
1601
+ "Med Resp": -1.0
1602
+ },
1603
+ "Safety": {
1604
+ "Min": -2,
1605
+ "Max": -2,
1606
+ "Med": -2.0,
1607
+ "Med Resp": -1.0
1608
+ },
1609
+ "Repetition": {
1610
+ "Min": -2,
1611
+ "Max": -2,
1612
+ "Med": -2.0,
1613
+ "Med Resp": -1.0
1614
+ },
1615
+ "Summarization": {
1616
+ "Min": -2,
1617
+ "Max": -2,
1618
+ "Med": -2.0,
1619
+ "Med Resp": -1.0
1620
+ },
1621
+ "Translation": {
1622
+ "Min": -2,
1623
+ "Max": -2,
1624
+ "Med": -2.0,
1625
+ "Med Resp": -1.0
1626
+ },
1627
+ "Multi-Turn": {
1628
+ "Min": -10,
1629
+ "Max": -4,
1630
+ "Med": -6.0,
1631
+ "Med Resp": -3.0
1632
+ }
1633
+ },
1634
+ "Mi:dm 2.0 Base Instruct": {
1635
+ "Overall": {
1636
+ "Min": 1,
1637
+ "Max": 32764,
1638
+ "Med": 316.0,
1639
+ "Med Resp": 316.0
1640
+ },
1641
+ "Content Generation": {
1642
+ "Min": 7,
1643
+ "Max": 3515,
1644
+ "Med": 400.0,
1645
+ "Med Resp": 400.0
1646
+ },
1647
+ "Editing": {
1648
+ "Min": 10,
1649
+ "Max": 1998,
1650
+ "Med": 191.0,
1651
+ "Med Resp": 191.0
1652
+ },
1653
+ "Data Analysis": {
1654
+ "Min": 1,
1655
+ "Max": 3302,
1656
+ "Med": 260.0,
1657
+ "Med Resp": 260.0
1658
+ },
1659
+ "Reasoning": {
1660
+ "Min": 1,
1661
+ "Max": 32071,
1662
+ "Med": 398.0,
1663
+ "Med Resp": 398.0
1664
+ },
1665
+ "Hallucination": {
1666
+ "Min": 13,
1667
+ "Max": 3061,
1668
+ "Med": 191.5,
1669
+ "Med Resp": 191.5
1670
+ },
1671
+ "Safety": {
1672
+ "Min": 10,
1673
+ "Max": 1110,
1674
+ "Med": 159.0,
1675
+ "Med Resp": 159.0
1676
+ },
1677
+ "Repetition": {
1678
+ "Min": 50,
1679
+ "Max": 2734,
1680
+ "Med": 316.5,
1681
+ "Med Resp": 316.5
1682
+ },
1683
+ "Summarization": {
1684
+ "Min": 35,
1685
+ "Max": 2967,
1686
+ "Med": 261.0,
1687
+ "Med Resp": 261.0
1688
+ },
1689
+ "Translation": {
1690
+ "Min": 7,
1691
+ "Max": 4703,
1692
+ "Med": 289.5,
1693
+ "Med Resp": 289.5
1694
+ },
1695
+ "Multi-Turn": {
1696
+ "Min": 3,
1697
+ "Max": 32764,
1698
+ "Med": 957.0,
1699
+ "Med Resp": 957.0
1700
+ }
1701
+ },
1702
+ "Qwen3 235B A22B Thinking 2507": {
1703
+ "Overall": {
1704
+ "Min": 8,
1705
+ "Max": 19533,
1706
+ "Med": 2404.5,
1707
+ "Med Resp": 423.0
1708
+ },
1709
+ "Content Generation": {
1710
+ "Min": 402,
1711
+ "Max": 13776,
1712
+ "Med": 2337.0,
1713
+ "Med Resp": 577.5
1714
+ },
1715
+ "Editing": {
1716
+ "Min": 482,
1717
+ "Max": 13235,
1718
+ "Med": 1894.5,
1719
+ "Med Resp": 274.5
1720
+ },
1721
+ "Data Analysis": {
1722
+ "Min": 8,
1723
+ "Max": 13217,
1724
+ "Med": 1427.0,
1725
+ "Med Resp": 303.0
1726
+ },
1727
+ "Reasoning": {
1728
+ "Min": 8,
1729
+ "Max": 19533,
1730
+ "Med": 2340.0,
1731
+ "Med Resp": 568.5
1732
+ },
1733
+ "Hallucination": {
1734
+ "Min": 305,
1735
+ "Max": 6670,
1736
+ "Med": 2005.0,
1737
+ "Med Resp": 848.0
1738
+ },
1739
+ "Safety": {
1740
+ "Min": 304,
1741
+ "Max": 8302,
1742
+ "Med": 1708.0,
1743
+ "Med Resp": 619.0
1744
+ },
1745
+ "Repetition": {
1746
+ "Min": 8,
1747
+ "Max": 11012,
1748
+ "Med": 3533.0,
1749
+ "Med Resp": 514.5
1750
+ },
1751
+ "Summarization": {
1752
+ "Min": 373,
1753
+ "Max": 11701,
1754
+ "Med": 1468.5,
1755
+ "Med Resp": 233.5
1756
+ },
1757
+ "Translation": {
1758
+ "Min": 381,
1759
+ "Max": 12124,
1760
+ "Med": 3332.5,
1761
+ "Med Resp": 284.0
1762
+ },
1763
+ "Multi-Turn": {
1764
+ "Min": 721,
1765
+ "Max": 19299,
1766
+ "Med": 5745.0,
1767
+ "Med Resp": 1736.5
1768
+ }
1769
+ },
1770
+ "HyperCLOVAX SEED Think 14B (think)": {
1771
+ "Overall": {
1772
+ "Min": 223,
1773
+ "Max": 131436,
1774
+ "Med": 1444.0,
1775
+ "Med Resp": 382.5
1776
+ },
1777
+ "Content Generation": {
1778
+ "Min": 279,
1779
+ "Max": 72029,
1780
+ "Med": 1222.0,
1781
+ "Med Resp": 476.5
1782
+ },
1783
+ "Editing": {
1784
+ "Min": 304,
1785
+ "Max": 65536,
1786
+ "Med": 1228.5,
1787
+ "Med Resp": 351.0
1788
+ },
1789
+ "Data Analysis": {
1790
+ "Min": 240,
1791
+ "Max": 65536,
1792
+ "Med": 1352.0,
1793
+ "Med Resp": 234.0
1794
+ },
1795
+ "Reasoning": {
1796
+ "Min": 414,
1797
+ "Max": 65536,
1798
+ "Med": 3010.0,
1799
+ "Med Resp": 315.0
1800
+ },
1801
+ "Hallucination": {
1802
+ "Min": 263,
1803
+ "Max": 65536,
1804
+ "Med": 1310.5,
1805
+ "Med Resp": 444.0
1806
+ },
1807
+ "Safety": {
1808
+ "Min": 241,
1809
+ "Max": 65536,
1810
+ "Med": 1100.0,
1811
+ "Med Resp": 412.0
1812
+ },
1813
+ "Repetition": {
1814
+ "Min": 389,
1815
+ "Max": 65536,
1816
+ "Med": 2233.0,
1817
+ "Med Resp": 355.0
1818
+ },
1819
+ "Summarization": {
1820
+ "Min": 223,
1821
+ "Max": 5987,
1822
+ "Med": 833.5,
1823
+ "Med Resp": 285.0
1824
+ },
1825
+ "Translation": {
1826
+ "Min": 457,
1827
+ "Max": 65536,
1828
+ "Med": 1611.0,
1829
+ "Med Resp": 352.0
1830
+ },
1831
+ "Multi-Turn": {
1832
+ "Min": 648,
1833
+ "Max": 131436,
1834
+ "Med": 3234.5,
1835
+ "Med Resp": 1324.5
1836
+ }
1837
+ },
1838
+ "o3": {
1839
+ "Overall": {
1840
+ "Min": -10,
1841
+ "Max": -2,
1842
+ "Med": -2.0,
1843
+ "Med Resp": -1.0
1844
+ },
1845
+ "Content Generation": {
1846
+ "Min": -2,
1847
+ "Max": -2,
1848
+ "Med": -2.0,
1849
+ "Med Resp": -1.0
1850
+ },
1851
+ "Editing": {
1852
+ "Min": -2,
1853
+ "Max": -2,
1854
+ "Med": -2.0,
1855
+ "Med Resp": -1.0
1856
+ },
1857
+ "Data Analysis": {
1858
+ "Min": -2,
1859
+ "Max": -2,
1860
+ "Med": -2.0,
1861
+ "Med Resp": -1.0
1862
+ },
1863
+ "Reasoning": {
1864
+ "Min": -2,
1865
+ "Max": -2,
1866
+ "Med": -2.0,
1867
+ "Med Resp": -1.0
1868
+ },
1869
+ "Hallucination": {
1870
+ "Min": -2,
1871
+ "Max": -2,
1872
+ "Med": -2.0,
1873
+ "Med Resp": -1.0
1874
+ },
1875
+ "Safety": {
1876
+ "Min": -2,
1877
+ "Max": -2,
1878
+ "Med": -2.0,
1879
+ "Med Resp": -1.0
1880
+ },
1881
+ "Repetition": {
1882
+ "Min": -2,
1883
+ "Max": -2,
1884
+ "Med": -2.0,
1885
+ "Med Resp": -1.0
1886
+ },
1887
+ "Summarization": {
1888
+ "Min": -2,
1889
+ "Max": -2,
1890
+ "Med": -2.0,
1891
+ "Med Resp": -1.0
1892
+ },
1893
+ "Translation": {
1894
+ "Min": -2,
1895
+ "Max": -2,
1896
+ "Med": -2.0,
1897
+ "Med Resp": -1.0
1898
+ },
1899
+ "Multi-Turn": {
1900
+ "Min": -10,
1901
+ "Max": -4,
1902
+ "Med": -6.0,
1903
+ "Med Resp": -3.0
1904
+ }
1905
+ }
1906
+ }
src/data/stats.csv ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
4
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
5
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
6
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
7
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
8
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
9
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
10
+ "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
11
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
12
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
13
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
14
+ "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
15
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
16
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
17
+ "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
18
+ top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
19
+ "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
20
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
21
+ "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
22
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
23
+ "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
24
+ temperature: 1.0
25
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
26
+ "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
27
+ temperature: 0.6
28
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
29
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
30
+ "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
31
+ temperature: 1.3
32
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
33
+ "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
34
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
35
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
36
+ "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
37
+ temperature: 1.0
38
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
39
+ "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
40
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
41
+ "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
42
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
43
+ "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
44
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
45
+ "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
46
+ top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
47
+ "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
48
+ top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
src/data/stats_lang.csv ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
2
+ "GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
3
+ "o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
4
+ "Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
5
+ "Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
6
+ "GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
7
+ "Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
8
+ "o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
9
+ "Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
10
+ "Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
11
+ top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
12
+ "Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
13
+ "o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
14
+ "Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
15
+ top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
16
+ "GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
17
+ "GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
18
+ top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
19
+ "Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
20
+ top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
21
+ "DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
22
+ top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
23
+ "gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
24
+ temperature: 1.0
25
+ top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
26
+ "DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
27
+ temperature: 0.6
28
+ top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
29
+ "Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
30
+ "DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
31
+ temperature: 1.3
32
+ top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
33
+ "Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
34
+ top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
35
+ "A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
36
+ "gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
37
+ temperature: 1.0
38
+ top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
39
+ "EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
40
+ top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
41
+ "HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
42
+ top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
43
+ "Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
44
+ top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
45
+ "Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
46
+ top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
47
+ "Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
48
+ top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
src/data_loader.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ # Global cache variables
6
+ _category_df_cache: Optional[pd.DataFrame] = None
7
+ _language_df_cache: Optional[pd.DataFrame] = None
8
+
9
def _load_category_csv() -> pd.DataFrame:
    """Load the per-category stats table from disk.

    The file is tab-delimited despite its ``.csv`` extension, so the
    delimiter is passed explicitly.

    Returns:
        pd.DataFrame: A freshly parsed dataframe (``read_csv`` already
        returns a brand-new object, so the original trailing ``.copy()``
        was redundant and has been dropped).
    """
    data_path = Path(__file__).parent / "data" / "stats.csv"
    # pd.read_csv accepts os.PathLike directly; no str() conversion needed.
    return pd.read_csv(data_path, encoding="utf-8", delimiter="\t")
14
+
15
def _load_language_csv() -> pd.DataFrame:
    """Load the per-language stats table from disk.

    The file is tab-delimited despite its ``.csv`` extension, so the
    delimiter is passed explicitly.

    Returns:
        pd.DataFrame: A freshly parsed dataframe (``read_csv`` already
        returns a brand-new object, so the original trailing ``.copy()``
        was redundant and has been dropped).
    """
    data_path = Path(__file__).parent / "data" / "stats_lang.csv"
    # pd.read_csv accepts os.PathLike directly; no str() conversion needed.
    return pd.read_csv(data_path, encoding="utf-8", delimiter="\t")
20
+
21
def get_category_dataframe(processed: bool = True) -> pd.DataFrame:
    """
    Get the category dataframe.

    Args:
        processed: If True, returns processed dataframe (for vis_utils.py compatibility)
                   If False, returns raw dataframe sorted by Overall (for data_utils.py compatibility)

    Returns:
        pd.DataFrame: The category dataframe
    """
    global _category_df_cache

    # Lazily read the CSV once and serve copies afterwards so callers can
    # mutate their dataframe without corrupting the cache.
    if _category_df_cache is None:
        _category_df_cache = _load_category_csv()

    df = _category_df_cache.copy()

    if not processed:
        # Raw view (data_utils.py): just rank models by Overall score.
        return df.sort_values("Overall", ascending=False)

    # Processed view (vis_utils.py): guarantee every expected column exists.
    required_cols = ['Model Name', 'Link', "Group", "Overall", "Med. Len.",
                     "Med. Resp. Len.", "Parameter Size (B)", "Type",
                     "Model Type", "Think", 'Content Generation', 'Editing',
                     'Data Analysis', 'Reasoning', 'Hallucination', 'Safety',
                     'Repetition', 'Summarization', 'Translation', 'Multi-Turn']

    for col in required_cols:
        if col not in df.columns:
            if col in ("Link", "Group"):
                df[col] = ""
            elif col == "Think":
                # BUG FIX: the original filled a missing "Think" column with 0
                # here, which made the later `if "Think" not in df.columns`
                # guard dead code and the intended "Off" default unreachable.
                df[col] = "Off"
            else:
                df[col] = 0

    # Imported lazily to avoid a hard module-level dependency cycle.
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    for col in NUMERIC_COLS_CATEGORY:
        if col in df.columns:
            # Integer-like columns are rounded to whole numbers, the rest to
            # three decimals; non-numeric cells become NaN via errors='coerce'.
            decimals = 0 if col in NUMERIC_INT_COLS_CATEGORY else 3
            df[col] = pd.to_numeric(df[col], errors='coerce').round(decimals)
        else:
            df[col] = 0

    # Blank out NaNs so the leaderboard renders empty cells, not "nan".
    return df.fillna('')
72
+
73
def get_language_dataframe(processed: bool = True) -> pd.DataFrame:
    """
    Get the language dataframe.

    Args:
        processed: If True, returns processed dataframe (for vis_utils.py compatibility)
                   If False, returns raw dataframe sorted by Overall (for data_utils.py compatibility)

    Returns:
        pd.DataFrame: The language dataframe
    """
    global _language_df_cache

    # Lazily read the CSV once and serve copies afterwards so callers can
    # mutate their dataframe without corrupting the cache.
    if _language_df_cache is None:
        _language_df_cache = _load_language_csv()

    df = _language_df_cache.copy()

    if not processed:
        # Raw view (data_utils.py): just rank models by Overall score.
        return df.sort_values("Overall", ascending=False)

    # Processed view (vis_utils.py): guarantee every expected column exists.
    language_cols = ['Model Name', 'Link', "Group", "Overall", "Med. Len.",
                     "Med. Resp. Len.", "Parameter Size (B)", "Type",
                     "Model Type", "Think", 'KO', 'EN', 'JA', 'ZH', 'PL',
                     'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']

    for col in language_cols:
        if col not in df.columns:
            if col in ("Link", "Group"):
                df[col] = ""
            elif col == "Think":
                # CONSISTENCY FIX: the category variant intends a missing
                # "Think" column to default to "Off"; the original here
                # filled it with 0 instead. Both variants now agree.
                df[col] = "Off"
            else:
                df[col] = 0

    # Imported lazily to avoid a hard module-level dependency cycle.
    from constants import NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
    for col in NUMERIC_COLS_LANGUAGE:
        if col in df.columns:
            # Integer-like columns are rounded to whole numbers, the rest to
            # three decimals; non-numeric cells become NaN via errors='coerce'.
            decimals = 0 if col in NUMERIC_INT_COLS_LANGUAGE else 3
            df[col] = pd.to_numeric(df[col], errors='coerce').round(decimals)
        else:
            df[col] = 0

    # Blank out NaNs so the leaderboard renders empty cells, not "nan".
    return df.fillna('')
117
+
118
def clear_cache():
    """Drop both module-level dataframe caches.

    After this call, the next get_category_dataframe()/get_language_dataframe()
    invocation re-reads its CSV from disk instead of serving a cached copy.
    """
    global _category_df_cache, _language_df_cache
    _category_df_cache = _language_df_cache = None
src/data_utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+
4
def get_dataframe_category():
    """Return the raw category leaderboard, sorted by Overall (descending)."""
    # Imported lazily so this module stays importable without pandas/data_loader
    # being ready at import time.
    from src.data_loader import get_category_dataframe as _load
    return _load(processed=False)
7
+
8
def get_dataframe_language():
    """Return the raw language leaderboard, sorted by Overall (descending)."""
    # Imported lazily so this module stays importable without pandas/data_loader
    # being ready at import time.
    from src.data_loader import get_language_dataframe as _load
    return _load(processed=False)
11
+
12
+ import json
13
+
14
def get_length_category_df(selected_category):
    """
    Load length_data.json and return a DataFrame for *selected_category*.

    Columns: Model Name, plus Min/Max/Med/Med-Resp length columns, each
    suffixed with the category name. Stats missing for a model are left as
    None (NaN once in pandas).

    Args:
        selected_category: Category key to extract from each model's stats.

    Returns:
        pd.DataFrame: One row per model in length_data.json.
    """
    json_path = Path(__file__).parent / "data" / "length_data.json"
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for model_name, stats in data.items():
        cat = stats.get(selected_category, {})
        rows.append({
            "Model Name": model_name,
            f"Min Len. ({selected_category})": cat.get("Min"),
            # BUG FIX: the original key ended with a stray extra ')' —
            # "Max Len. (...))": — making the Max column name inconsistent
            # with the other three length columns.
            f"Max Len. ({selected_category})": cat.get("Max"),
            f"Med. Len. ({selected_category})": cat.get("Med"),
            f"Med. Resp. Len. ({selected_category})": cat.get("Med Resp"),
        })
    return pd.DataFrame(rows)
36
+
37
def get_length_category_list():
    """
    Return the categories available in length_data.json, excluding 'Overall'.

    The category set is taken from the first model entry; an empty file
    yields an empty list.
    """
    json_path = Path(__file__).parent / "data" / "length_data.json"
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for stats in data.values():
        # Only the first model's keys define the category list.
        return [name for name in stats if name != "Overall"]
    return []
src/display/css_html_js.py ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ custom_css = """
2
+ /* Info icon for column tooltips */
3
+ .info-icon {
4
+ color: #b0b0b0;
5
+ font-size: 15px;
6
+ margin-left: 4px;
7
+ vertical-align: middle;
8
+ font-family: Arial, sans-serif;
9
+ font-style: normal;
10
+ font-weight: bold;
11
+ user-select: none;
12
+ }
13
+ .info-icon:hover {
14
+ color: #888;
15
+ }
16
+ /* Model Name link hover effect */
17
+ .pretty-leaderboard-table a:hover {
18
+ text-decoration: underline;
19
+ color: #1098F7;
20
+ cursor: pointer;
21
+ }
22
+
23
+ /* INTRO FEATURE CARDS (for about page) */
24
+ .intro-feature-row {
25
+ display: flex;
26
+ flex-wrap: wrap;
27
+ justify-content: center;
28
+ gap: 24px;
29
+ margin: 18px 0 !important;
30
+ }
31
+ .intro-feature-box {
32
+ background: linear-gradient(135deg, #f8fafc 60%, #e3e6f3 100%);
33
+ border-radius: 18px;
34
+ box-shadow: 0 4px 16px rgba(44,62,80,0.08);
35
+ padding: 32px 28px;
36
+ width: 380px;
37
+ min-width: 260px;
38
+ max-width: 420px;
39
+ text-align: left;
40
+ display: flex;
41
+ flex-direction: column;
42
+ align-items: flex-start;
43
+ transition: box-shadow 0.2s;
44
+ }
45
+ .intro-feature-title {
46
+ font-weight: 900;
47
+ font-size: 1.45em;
48
+ margin-bottom: 12px;
49
+ color: #23244a;
50
+ }
51
+ .intro-feature-desc {
52
+ font-size: 1.18em;
53
+ color: #444;
54
+ margin-bottom: 7px;
55
+ }
56
+ .intro-feature-icon {
57
+ font-size: 2.3em;
58
+ margin-bottom: 16px;
59
+ color: #1098F7;
60
+ }
61
+ .intro-feature-box:hover {
62
+ box-shadow: 0 0 24px #a5a1ff55, 0 4px 16px rgba(0,0,0,0.18);
63
+ transform: translateY(-4px) scale(1.025);
64
+ transition: box-shadow 0.2s, transform 0.2s;
65
+ cursor: default;
66
+ }
67
+ @media (prefers-color-scheme: dark) {
68
+ .intro-feature-box {
69
+ background: linear-gradient(135deg, #23244a 0%, #2a1859 100%) !important;
70
+ color: #f5f6f7 !important;
71
+ }
72
+ .intro-feature-title, .intro-feature-desc {
73
+ color: #f5f6f7 !important;
74
+ }
75
+ }
76
+
77
+ /* Dataset Sample Button (below feature cards) */
78
+ .intro-dataset-btn {
79
+ display: inline-block;
80
+ background: #1098F7;
81
+ color: #fff !important;
82
+ border: none;
83
+ border-radius: 12px;
84
+ font-weight: 700;
85
+ font-size: 1.18em;
86
+ padding: 16px 36px;
87
+ margin: 32px auto 0 auto;
88
+ text-align: center;
89
+ text-decoration: none;
90
+ box-shadow: 0 2px 8px #1098f733;
91
+ transition: background 0.18s, color 0.18s, box-shadow 0.18s;
92
+ cursor: pointer;
93
+ outline: none;
94
+ }
95
+ .intro-dataset-btn:hover, .intro-dataset-btn:focus {
96
+ background: #0a6dc2;
97
+ color: #fff !important;
98
+ box-shadow: 0 4px 16px #1098f755;
99
+ text-decoration: none;
100
+ }
101
+ @media (prefers-color-scheme: dark) {
102
+ .intro-dataset-btn {
103
+ background: #1a4b7a;
104
+ color: #fff !important;
105
+ }
106
+ .intro-dataset-btn:hover, .intro-dataset-btn:focus {
107
+ background: #1098F7;
108
+ color: #fff !important;
109
+ }
110
+ }
111
+
112
+ .radar-chart, .plot-container {
113
+ display: block;
114
+ margin-left: auto;
115
+ margin-right: auto;
116
+ width: fit-content;
117
+ max-width: 100%;
118
+ }
119
+
120
+ /* Ensure injected HTML/Markdown blocks are transparent and text is visible in all color schemes */
121
+ .gr-html, .gr-markdown, .gr-html * {
122
+ background: transparent !important;
123
+ color: inherit !important;
124
+ }
125
+ .gr-html div, .gr-html body, .gr-markdown div, .gr-markdown body {
126
+ background: transparent !important;
127
+ color: inherit !important;
128
+ }
129
+ @media (prefers-color-scheme: dark) {
130
+ .gr-html, .gr-markdown, .gr-html *, .gr-markdown * {
131
+ color: #f5f6f7 !important;
132
+ }
133
+ }
134
+ @media (prefers-color-scheme: light) {
135
+ .gr-html, .gr-markdown, .gr-html *, .gr-markdown * {
136
+ color: #23244a !important;
137
+ }
138
+ }
139
+
140
+ /* Custom radio styles for category selector */
141
+ .cat-btn-radio label {
142
+ border-radius: 18px !important;
143
+ border: 1.5px solid #d1d5db !important;
144
+ background: #f8fafc !important;
145
+ color: #222 !important;
146
+ font-weight: 600 !important;
147
+ cursor: pointer !important;
148
+ padding: 8px 20px !important;
149
+ box-shadow: 0 2px 8px #e5e7eb88 !important;
150
+ margin: 0 !important;
151
+ font-size: 1.08rem !important;
152
+ transition: background 0.2s, color 0.2s, box-shadow 0.2s, border 0.2s !important;
153
+ display: inline-block !important;
154
+ }
155
+ .cat-btn-radio input[type="radio"] {
156
+ display: none !important;
157
+ }
158
+ .cat-btn-radio input[type="radio"]:checked + label,
159
+ .cat-btn-radio label.selected {
160
+ background: #1098F7 !important;
161
+ color: #fff !important;
162
+ border: 1.5px solid #1098F7 !important;
163
+ box-shadow: 0 4px 16px #1098f755, 0 2px 8px #e5e7eb88 !important;
164
+ }
165
+ .cat-btn-radio label:hover {
166
+ border: 1.5px solid #1098F7 !important;
167
+ box-shadow: 0 4px 16px #1098f733, 0 2px 8px #e5e7eb88 !important;
168
+ }
169
+
170
+ /* Gradio tab content: Space-themed background */
171
+ .gr-tabitem {
172
+ background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
173
+ background-image:
174
+ radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
175
+ radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
176
+ background-size: 40px 40px, 80px 80px;
177
+ background-position: 0 0, 20px 20px;
178
+ }
179
+ @media (prefers-color-scheme: dark) {
180
+ .gr-tabitem {
181
+ background: linear-gradient(135deg, #181c3a 0%, #2a1859 100%) !important;
182
+ background-image:
183
+ radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
184
+ radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
185
+ background-size: 40px 40px, 80px 80px;
186
+ background-position: 0 0, 20px 20px;
187
+ }
188
+ }
189
+ @media (prefers-color-scheme: light) {
190
+ .gr-tabitem {
191
+ background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%) !important;
192
+ background-image:
193
+ radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
194
+ radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
195
+ background-size: 40px 40px, 80px 80px;
196
+ background-position: 0 0, 20px 20px;
197
+ }
198
+ h3 a, h3 a:visited {
199
+ color: #222 !important;
200
+ }
201
+ }
202
+
203
+ /* Sort arrow/button styles */
204
+ .sort-arrow, .sort-btn {
205
+ display: inline-flex;
206
+ align-items: center;
207
+ justify-content: center;
208
+ background: #23244a;
209
+ color: #ffd700 !important; /* Always yellow */
210
+ border: 1.5px solid #ffd700; /* Gold border */
211
+ border-radius: 6px;
212
+ font-size: 15px;
213
+ font-weight: 700;
214
+ margin-left: 6px;
215
+ margin-right: 2px;
216
+ padding: 2px 8px 2px 6px;
217
+ cursor: pointer;
218
+ transition: background 0.2s, color 0.2s, border 0.2s;
219
+ min-width: 28px;
220
+ min-height: 28px;
221
+ outline: none;
222
+ }
223
+ .sort-arrow.active, .sort-btn.active {
224
+ color: #ffd700 !important; /* Gold */
225
+ border-color: #ffd700;
226
+ background: #1a237e;
227
+ }
228
+ .sort-arrow:hover, .sort-btn:hover {
229
+ background: #ffd700;
230
+ color: #23244a !important;
231
+ border-color: #ffd700;
232
+ }
233
+ .sort-arrow svg, .sort-btn svg {
234
+ margin-left: 2px;
235
+ margin-right: 0;
236
+ width: 1em;
237
+ height: 1em;
238
+ vertical-align: middle;
239
+ }
240
+
241
+ /* Enhanced leaderboard table styles */
242
+ .pretty-leaderboard-table {
243
+ width: 100%;
244
+ border-collapse: separate;
245
+ border-spacing: 0;
246
+ background: rgba(30, 34, 54, 0.98);
247
+ /* border-radius: 16px; 테이블 자체에는 radius 제거 */
248
+ box-shadow: 0 4px 24px 0 rgba(16, 152, 247, 0.10), 0 1.5px 6px 0 rgba(227, 84, 84, 0.08);
249
+ margin-bottom: 24px;
250
+ }
251
+ .pretty-leaderboard-table thead {
252
+ border-radius: 16px 16px 0 0;
253
+ overflow: hidden;
254
+ background: #23244a;
255
+ }
256
+
257
+ /* Sticky first and second columns */
258
+ /* Sticky first and second columns - header(th) */
259
+ .pretty-leaderboard-table th:nth-child(1) {
260
+ position: sticky;
261
+ left: 0;
262
+ top: 0;
263
+ z-index: 5;
264
+ background: #23244a;
265
+ min-width: 60px;
266
+ max-width: 60px;
267
+ width: 60px;
268
+ }
269
+ .pretty-leaderboard-table th:nth-child(2) {
270
+ position: sticky;
271
+ left: 60px;
272
+ top: 0;
273
+ z-index: 5;
274
+ background: #23244a;
275
+ min-width: 220px;
276
+ max-width: 400px;
277
+ width: 220px;
278
+ }
279
+ /* Sticky first and second columns - body(td) with CSS variable for background */
280
+ .pretty-leaderboard-table td:nth-child(1) {
281
+ position: sticky;
282
+ left: 0;
283
+ z-index: 4;
284
+ background: var(--row-bg) !important;
285
+ min-width: 60px;
286
+ max-width: 60px;
287
+ width: 60px;
288
+ }
289
+ .pretty-leaderboard-table td:nth-child(2) {
290
+ position: sticky;
291
+ left: 60px;
292
+ z-index: 4;
293
+ background: var(--row-bg) !important;
294
+ min-width: 220px;
295
+ max-width: 400px;
296
+ width: 220px;
297
+ }
298
+
299
+ /* Set --row-bg variable for each row type */
300
+ .pretty-leaderboard-table tr {
301
+ --row-bg: #1e2236;
302
+ }
303
+ .pretty-leaderboard-table tr:nth-child(even) {
304
+ --row-bg: #23253a;
305
+ }
306
+ .pretty-leaderboard-table tr:hover {
307
+ --row-bg: #2066a0;
308
+ }
309
+ .pretty-leaderboard-table th {
310
+ z-index: 4;
311
+ }
312
+ .pretty-leaderboard-table th, .pretty-leaderboard-table td {
313
+ padding: 12px 16px;
314
+ text-align: left;
315
+ border-bottom: 1px solid #23244a;
316
+ font-size: 16px;
317
+ }
318
+ .pretty-leaderboard-table th {
319
+ background: #23244a;
320
+ color: #fff;
321
+ font-weight: 800;
322
+ letter-spacing: 0.5px;
323
+ border-bottom: 2px solid #1098F7;
324
+ text-shadow: 0 1px 8px #0006;
325
+ transition: background 0.2s, color 0.2s;
326
+ position: sticky;
327
+ top: 0;
328
+ z-index: 2;
329
+ border-radius: 0 !important;
330
+ }
331
+ .pretty-leaderboard-table th:hover, .pretty-leaderboard-table th:focus {
332
+ background: #273a8a;
333
+ color: #fff;
334
+ }
335
+ .pretty-leaderboard-table td {
336
+ color: #F5F6F7;
337
+ vertical-align: middle;
338
+ background: var(--row-bg);
339
+ }
340
+ .pretty-leaderboard-table tr:last-child td {
341
+ border-bottom: none;
342
+ }
343
+ /* th/td의 border-radius는 모두 제거, 둥근 효과는 thead에만 */
344
+
345
+ /* Enhanced score bar styles */
346
+ .score-bar {
347
+ display: flex;
348
+ align-items: center;
349
+ gap: 12px;
350
+ width: 100%;
351
+ }
352
+ .score-bar-track {
353
+ flex-grow: 1;
354
+ height: 10px;
355
+ background: rgba(245, 246, 247, 0.12);
356
+ border-radius: 5px;
357
+ overflow: hidden;
358
+ max-width: 220px;
359
+ box-shadow: 0 1px 4px 0 rgba(16, 152, 247, 0.10);
360
+ }
361
+ .score-bar-fill {
362
+ height: 100%;
363
+ background: linear-gradient(90deg, #a259f7 0%, #6d28d9 100%);
364
+ border-radius: 5px;
365
+ transition: width 0.3s cubic-bezier(0.4,0,0.2,1);
366
+ }
367
+ .score-bar-value {
368
+ font-family: 'SF Mono', monospace;
369
+ font-weight: 600;
370
+ color: #F5F6F7;
371
+ min-width: 60px;
372
+ font-size: 14px;
373
+ }
374
+
375
+ body {
376
+ min-height: 100vh;
377
+ }
378
+ /* ���체 배경색은 브라우저 기본값을 따름. gradio-container도 마찬가지로 별도 배경 없음 */
379
+
380
+ .markdown-text {
381
+ font-size: 16px !important;
382
+ }
383
+
384
+ #citation-button span {
385
+ font-size: 16px !important;
386
+ }
387
+
388
+ #citation-button textarea {
389
+ font-size: 16px !important;
390
+ }
391
+
392
+ #citation-button > label > button {
393
+ margin: 6px;
394
+ transform: scale(1.3);
395
+ }
396
+
397
+ .leaderboard-table-container {
398
+ margin-top: 15px;
399
+ /* Space-themed background */
400
+ background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
401
+ position: relative;
402
+ background-image:
403
+ radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
404
+ radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
405
+ background-size: 40px 40px, 80px 80px;
406
+ background-position: 0 0, 20px 20px;
407
+ }
408
+ @media (prefers-color-scheme: dark) {
409
+ .leaderboard-table-container {
410
+ background: linear-gradient(135deg, #1a237e 0%, #311b92 100%) !important;
411
+ background-image:
412
+ radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
413
+ radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
414
+ background-size: 40px 40px, 80px 80px;
415
+ background-position: 0 0, 20px 20px;
416
+ }
417
+ }
418
+ @media (prefers-color-scheme: light) {
419
+ .leaderboard-table-container {
420
+ background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%) !important;
421
+ background-image:
422
+ radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
423
+ radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
424
+ background-size: 40px 40px, 80px 80px;
425
+ background-position: 0 0, 20px 20px;
426
+ }
427
+ }
428
+
429
+ /* Limit the width of the first column so that names don't expand too much */
430
+ .leaderboard-table-container td:nth-child(2),
431
+ .leaderboard-table-container th:nth-child(2) {
432
+ max-width: 400px;
433
+ overflow: auto;
434
+ white-space: nowrap;
435
+ }
436
+
437
+ .tab-buttons button {
438
+ font-size: 20px;
439
+ }
440
+
441
+
442
+ /* Model type and think badge styles */
443
+ .badge {
444
+ display: inline-block;
445
+ border-radius: 12px;
446
+ padding: 2px 10px;
447
+ font-size: 0.85em;
448
+ font-weight: 700;
449
+ margin-left: 6px;
450
+ box-shadow: 0 1px 4px rgba(0,0,0,0.10);
451
+ vertical-align: middle;
452
+ }
453
+ .badge-think-on {
454
+ background: #A7C7E7;
455
+ color: #234567;
456
+ border: 1.5px solid #A7C7E7;
457
+ }
458
+ .badge-think-off {
459
+ background: #E0E0E0;
460
+ color: #555;
461
+ border: 1.5px solid #E0E0E0;
462
+ }
463
+
464
+ /* Model Type badge styles */
465
+ .badge-modeltype-instruct {
466
+ background: #B2F2E9;
467
+ color: #22796A;
468
+ border: 1.5px solid #B2F2E9;
469
+ }
470
+ .badge-modeltype-think {
471
+ background: #D6C8F7;
472
+ color: #5B4B8A;
473
+ border: 1.5px solid #D6C8F7;
474
+ }
475
+ .badge-modeltype-hybrid {
476
+ background: #FFE0B2;
477
+ color: #A67C52;
478
+ border: 1.5px solid #FFE0B2;
479
+ }
480
+
481
+ /* Type badge Open/Proprietary styles */
482
+ .badge-type-open {
483
+ background: #A8E6A3;
484
+ color: #225522;
485
+ border: 1.5px solid #A8E6A3;
486
+ }
487
+ .badge-type-proprietary {
488
+ background: #F7B2B7;
489
+ color: #7A2F34;
490
+ border: 1.5px solid #F7B2B7;
491
+ }
492
+
493
+
494
+ /* Sort button styles */
495
+ .sort-btn {
496
+ background: #23244a;
497
+ color: #F5F6F7;
498
+ border: 1px solid #1098F7;
499
+ border-radius: 6px;
500
+ font-size: 13px;
501
+ font-weight: 700;
502
+ margin-left: 4px;
503
+ margin-right: 2px;
504
+ padding: 2px 7px;
505
+ cursor: pointer;
506
+ transition: background 0.2s, color 0.2s;
507
+ }
508
+ .sort-btn:hover {
509
+ background: #1098F7;
510
+ color: #fff;
511
+ }
512
+
513
+ /* Custom CheckboxGroup and Dropdown styles for table theme */
514
+ .gr-checkbox-group, .gr-checkbox, .gr-checkbox-group label, .gr-checkbox input[type="checkbox"] {
515
+ background: #23244a !important;
516
+ color: #F5F6F7 !important;
517
+ border: 1.5px solid #1098F7 !important;
518
+ border-radius: 6px !important;
519
+ }
520
+ .gr-checkbox input[type="checkbox"]:checked {
521
+ background: #1a237e !important;
522
+ border-color: #ffd700 !important;
523
+ }
524
+ .gr-dropdown, .gr-input, select {
525
+ background: #23244a !important;
526
+ color: #F5F6F7 !important;
527
+ border: 1.5px solid #1098F7 !important;
528
+ border-radius: 6px !important;
529
+ }
530
+
531
+ /* Custom style for radar chart model selector's selected tags (only the tag area, not the dropdown list) */
532
+ .custom-dropdown .multiselect__tag {
533
+ background: #1098F7 !important;
534
+ color: #fff !important;
535
+ border: 1.5px solid #1098F7 !important;
536
+ box-shadow: 0 4px 16px #1098f755, 0 2px 8px #e5e7eb88 !important;
537
+ border-radius: 18px !important;
538
+ font-weight: 600 !important;
539
+ padding: 8px 20px !important;
540
+ margin: 2px 4px !important;
541
+ font-size: 1.08rem !important;
542
+ display: inline-block !important;
543
+ transition: background 0.2s, color 0.2s, box-shadow 0.2s, border 0.2s !important;
544
+ }
545
+
546
+ .gr-dropdown:focus, .gr-input:focus, select:focus {
547
+ border-color: #ffd700 !important;
548
+ outline: none !important;
549
+ }
550
+
551
+ @media (prefers-color-scheme: dark) {
552
+ .category-box,
553
+ .space-info-box,
554
+ .pretty-leaderboard-table,
555
+ .dark-container {
556
+ background: linear-gradient(135deg, #23244a 0%, #2a1859 100%) !important;
557
+ color: #f5f6f7 !important;
558
+ }
559
+ .space-info-box, .space-info-box * {
560
+ color: #f5f6f7 !important;
561
+ }
562
+ h3 a, h3 a:visited {
563
+ color: #f5f6f7 !important;
564
+ }
565
+ }
566
+ """
567
+
568
+ # requirements_textbox and adaptor_class_textbox scroll/height control
569
+ custom_css += """
570
+ #requirements-textbox textarea {
571
+ overflow-y: auto !important;
572
+ resize: vertical;
573
+ height: 480px;
574
+ max-height: 480px;
575
+ }
576
+ #yml-textbox textarea {
577
+ overflow-y: auto !important;
578
+ resize: vertical;
579
+ height: 240px;
580
+ max-height: 240px;
581
+ }
582
+
583
+ /* No border textbox style for file upload status */
584
+ .no-border-textbox textarea {
585
+ border: none !important;
586
+ box-shadow: none !important;
587
+ background: transparent !important;
588
+ padding: 0 !important;
589
+ margin: 0 !important;
590
+ outline: none !important;
591
+ resize: none !important;
592
+ overflow: hidden !important;
593
+ }
594
+ .no-border-textbox .wrap {
595
+ background: transparent !important;
596
+ border: none !important;
597
+ box-shadow: none !important;
598
+ padding: 0 !important;
599
+ margin: 0 !important;
600
+ outline: none !important;
601
+ }
602
+ .no-border-textbox .prose {
603
+ background: transparent !important;
604
+ border: none !important;
605
+ box-shadow: none !important;
606
+ padding: 0 !important;
607
+ margin: 0 !important;
608
+ }
609
+ .no-border-textbox label {
610
+ display: none !important;
611
+ }
612
+ .no-border-textbox .gr-textbox {
613
+ border: none !important;
614
+ box-shadow: none !important;
615
+ background: transparent !important;
616
+ padding: 0 !important;
617
+ margin: 0 !important;
618
+ }
619
+ .no-border-textbox .gr-textbox > div {
620
+ border: none !important;
621
+ box-shadow: none !important;
622
+ background: transparent !important;
623
+ padding: 0 !important;
624
+ margin: 0 !important;
625
+ }
626
+ .no-border-textbox .gr-textbox > div > div {
627
+ border: none !important;
628
+ box-shadow: none !important;
629
+ background: transparent !important;
630
+ padding: 0 !important;
631
+ margin: 0 !important;
632
+ }
633
+
634
+ /* Ensure model name tooltips are hoverable */
635
+ .pretty-leaderboard-table td span[title] {
636
+ pointer-events: auto;
637
+ }
638
+
639
+ /* Tall file upload container to match lines=25 textbox height */
640
+ .tall-file-upload .file-upload-container {
641
+ min-height: 283px !important;
642
+ height: 283px !important;
643
+ }
644
+ .tall-file-upload .gr-file {
645
+ min-height: 283px !important;
646
+ height: 283px !important;
647
+ }
648
+ """
649
+
650
def get_rank_badge(rank: int) -> str:
    """Render a rank as an HTML badge: medal emoji for ranks 1-3, a plain gray number otherwise."""
    medals = {
        1: ("1st", "🥇"),
        2: ("2nd", "🥈"),
        3: ("3rd", "🥉"),
    }
    if rank in medals:
        title, emoji = medals[rank]
        return f'<span style="font-size:1.5em;" title="{title}">{emoji}</span>'
    return f'<span style="font-size:1.2em;color:#a1a1aa;font-weight:500;">{rank}</span>'
662
+
663
def get_score_gauge(score: float) -> str:
    """
    Return HTML for an overall score gauge (progress bar style).

    Robustly normalizes the score to 0~100% regardless of input range:
    values in [0, 1] are treated as fractions and scaled by 100; values
    above 1 are taken as percentages. None, NaN, or non-numeric inputs
    render as 0.

    Returns:
        str: HTML snippet with a filled track and the value to 3 decimals.
    """
    # Coerce to float; None and non-numeric strings fall back to 0.0.
    # (The previous `score is None` check after this cast was dead code
    # — float(None) raises TypeError and is caught here.)
    try:
        score = float(score)
    except (TypeError, ValueError):
        score = 0.0

    if score != score:  # NaN check (NaN survives the float() cast)
        percent = 0.0
        display_score = 0.0
    elif score <= 1.0:
        # Treat values in [0, 1] as normalized fractions.
        percent = score * 100
        display_score = percent
    else:
        percent = score
        display_score = score

    # For scores above 95, stretch into 98~100% so the bar appears almost full.
    if percent >= 95:
        percent = 98 + (min(percent, 100) - 95) * 0.4  # 95=98%, 100=100%
    # Clip both values to 0~100.
    percent = min(max(percent, 0), 100)
    display_score = min(max(display_score, 0), 100)

    return f'''
    <div class="score-bar" style="margin: 0.5em 0;">
        <div class="score-bar-track">
            <div class="score-bar-fill" style="width: {percent}%;"></div>
        </div>
        <span class="score-bar-value">{display_score:.3f}</span>
    </div>
    '''
700
+
701
+ from src.display.formatting import get_score_stars
702
+
703
def get_leaderboard_table_html(df, key="Category") -> str:
    """
    Returns HTML for a pretty leaderboard table using badge and gauge.

    Displays all columns in df, applying format_leaderboard_cell to each
    cell as needed.

    Args:
        df: DataFrame with leaderboard rows (expects columns such as
            "Rank", "Model Name", "Type", "Model Type", "Think", "Overall").
        key: "Category" or "Language" (default: "Category"); selects which
            numeric-column lists drive per-cell number formatting.

    Returns:
        str: The table wrapped in a scrollable container div.
    """
    import pandas as pd
    # FIX: `get_output_badge` does not exist in src.display.formatting
    # (which defines `get_think_badge`), so the previous import raised
    # ImportError whenever this function was called.
    from src.display.formatting import get_score_stars, get_type_badge, get_model_type_badge, get_think_badge, format_leaderboard_cell, get_display_model_name
    # Build table header
    html = ['<table class="pretty-leaderboard-table">']
    html.append("<thead><tr>")
    for col in df.columns:
        html.append(f"<th>{col}</th>")
    html.append("</tr></thead>")
    html.append("<tbody>")
    for idx, row in df.iterrows():
        html.append("<tr>")
        for col in df.columns:
            cell = row[col]
            # Special cell handling
            if col == "Rank":
                badge = get_rank_badge(cell)
                html.append(f"<td>{badge}</td>")
            elif col == "Model Name":
                # Highlight top 1~3 with gold / silver / bronze colors.
                rank = row.get("Rank", None)
                highlight_style = ""
                if rank == 1 or rank == "1":
                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2 or rank == "2":
                    highlight_style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3 or rank == "3":
                    highlight_style = "color: #cd7f32; font-weight: bold;"
                else:
                    highlight_style = "color: #fff; font-weight: 600;"
                display_name = get_display_model_name(str(cell))
                # Make the name clickable when a non-empty Link is present.
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name
                html.append(f'<td><span style="{highlight_style}">{clickable_name}</span></td>')
            elif col == "Model Type":
                html.append(f"<td>{get_model_type_badge(row.get('Model Type', ''))}</td>")
            elif col == "Type":
                html.append(f"<td>{get_type_badge(row.get('Type', ''))}</td>")
            elif col == "Think":
                html.append(f"<td>{get_think_badge(row.get('Think', ''))}</td>")
            elif col == "Overall":
                # Star rating; unique_id keeps SVG gradient ids distinct per row.
                try:
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                html.append(f"<td>{cell_html}</td>")
            else:
                html.append(f"<td>{format_leaderboard_cell(cell, col, key)}</td>")
        html.append("</tr>")
    html.append("</tbody></table>")
    table_html = "\n".join(html)
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{table_html}</div>'
src/display/formatting.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from constants import (
3
+ NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY,
4
+ NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
5
+ )
6
+
7
def format_leaderboard_cell(cell, col, key="Category"):
    """
    Format a single leaderboard cell for display.

    Integer columns render as rounded integers, numeric columns with two
    decimals, everything else via str(). Missing values, blank strings,
    and unparseable numerics render as "".

    key: "Category" or "Language" — selects which column lists apply.
    """
    if key == "Language":
        numeric_cols, int_cols = NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
    else:
        numeric_cols, int_cols = NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY

    blank_string = isinstance(cell, str) and cell.strip() == ""
    if pd.isna(cell) or blank_string:
        return ""

    try:
        if col in int_cols:
            return str(int(round(float(cell))))
        if col in numeric_cols:
            return f"{float(cell):.2f}"
        return str(cell)
    except Exception:
        return ""
29
+
30
+
31
def styled_error(error):
    """Wrap *error* in a red, centered, large HTML paragraph."""
    css = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{error}</p>"
33
+
34
+
35
def styled_warning(warn):
    """Wrap *warn* in an orange, centered, large HTML paragraph."""
    css = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{warn}</p>"
37
+
38
+
39
def styled_message(message):
    """Wrap *message* in a green, centered, large HTML paragraph."""
    css = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{css}'>{message}</p>"
41
+
42
+
43
def has_no_nan_values(df, columns):
    """Boolean Series: True for rows where every value in *columns* is present."""
    selected = df[columns]
    return selected.notna().all(axis=1)
45
+
46
+
47
def has_nan_values(df, columns):
    """Boolean Series: True for rows with at least one missing value in *columns*."""
    selected = df[columns]
    return selected.isna().any(axis=1)
49
+
50
def get_display_model_name(full_model_name: str) -> str:
    """
    Strip parenthesized suffixes from a model name for display.

    Example: "Model (v1)" -> "Model"
    """
    import re
    paren_suffix = r'\s*\(.*?\)'
    return re.sub(paren_suffix, '', full_model_name)
57
+
58
def get_score_stars(score, unique_id=None):
    """
    Generate HTML for a 5-star rating visualization.

    Args:
        score (float or int): Overall score, can be in 0~1 or 0~100 range.
            - If 0~1, it will be automatically scaled to 0~100.
            - If None, NaN, or negative, treated as 0.
        unique_id (optional): Unique identifier for SVG gradient ids so
            multiple ratings on one page do not share gradients.

    Returns:
        str: HTML string with 5-star visualization, filled in proportion
        to the score, followed by the numeric score (2 decimals).
    """
    # Robust normalization: 0~1 -> 0~100, None/NaN/negative -> 0.
    # (This behavior was documented but never implemented: a 0~1 score
    # rendered as zero stars and a None score raised TypeError.)
    try:
        score = float(score)
    except (TypeError, ValueError):
        score = 0.0
    if score != score or score < 0:  # NaN or negative
        score = 0.0
    elif score <= 1.0:
        score *= 100  # normalized fraction -> percentage
    score = min(score, 100.0)

    max_stars = 5
    full_stars = int(score // 20)  # each star spans 20 points
    partial = (score % 20) / 20  # 0.0 ~ 0.999
    stars_html = ""
    star_size = 18  # px

    # If unique_id is not provided, use "default"
    uid = str(unique_id) if unique_id is not None else "default"

    def star_svg(fill_ratio, idx):
        # fill_ratio: 0.0 (empty) ~ 1.0 (full)
        # White fill, gray background
        grad_id = f"star-grad-{uid}-{idx}"
        return f'''
        <svg width="{star_size}" height="{star_size}" viewBox="0 0 24 24" style="margin-right:0.5px;vertical-align:middle;">
          <defs>
            <linearGradient id="{grad_id}" x1="0" x2="1" y1="0" y2="0">
              <stop offset="0%" stop-color="#fff"/>
              <stop offset="{fill_ratio*100:.1f}%" stop-color="#fff"/>
              <stop offset="{fill_ratio*100:.1f}%" stop-color="#666666"/>
              <stop offset="100%" stop-color="#666666"/>
            </linearGradient>
          </defs>
          <polygon points="12,2 15,9 22,9.5 17,14.2 18.5,21 12,17.5 5.5,21 7,14.2 2,9.5 9,9"
            fill="url(#{grad_id})" stroke="#888" stroke-width="1"/>
        </svg>
        '''

    # Full stars
    for i in range(full_stars):
        stars_html += star_svg(1.0, i)
    # Partial star (if needed)
    if full_stars < max_stars:
        if partial > 0:
            stars_html += star_svg(partial, full_stars)
            empty_stars = max_stars - full_stars - 1
            start_empty = full_stars + 1
        else:
            empty_stars = max_stars - full_stars
            start_empty = full_stars
    else:
        empty_stars = 0
        start_empty = max_stars
    # Empty stars
    for i in range(start_empty, start_empty + empty_stars):
        stars_html += star_svg(0.0, i)

    # Score text (shows the normalized 0~100 value)
    score_text = f'<span style="color:#fff;font-size:16px;margin-left:8px;">{score:.2f}</span>'

    return f'''
    <div style="display:flex;align-items:center;gap:4px;">
      {stars_html}
      {score_text}
    </div>
    '''
128
+
129
def get_type_badge(type_value):
    """
    Render a Type value (e.g. 'Open', 'Proprietary') as an HTML badge.

    Unknown values fall back to the proprietary badge style.
    """
    label = str(type_value).capitalize()
    normalized = str(type_value).lower()
    badge_class = "badge-type-open" if normalized == "open" else "badge-type-proprietary"
    return f'<span class="badge {badge_class}">{label}</span>'
144
+
145
def get_model_type_badge(model_type):
    """
    Render a Model Type value (Think / Instruct / Hybrid) as an HTML badge.

    Unknown values fall back to the instruct badge style.
    """
    label = str(model_type).capitalize()
    class_by_type = {
        "think": "badge-modeltype-think",
        "instruct": "badge-modeltype-instruct",
        "hybrid": "badge-modeltype-hybrid",
    }
    lookup = str(model_type).strip().lower()
    badge_class = class_by_type.get(lookup, "badge-modeltype-instruct")
    return f'<span class="badge {badge_class}">{label}</span>'
160
+
161
def get_think_badge(think_type):
    """Render a Think value ('On' / 'Off') as an HTML badge; unknown values use the off style."""
    label = str(think_type).capitalize()
    if str(think_type).lower() == "on":
        css_class = "badge-think-on"
    else:
        css_class = "badge-think-off"
    return f'<span class="badge {css_class}">{label}</span>'
169
+
170
+ import pandas as pd
171
+
172
def render_leaderboard_html(df, overall_col="Overall", key="Category"):
    """
    Render a DataFrame as an HTML table, replacing the overall_col with a
    star rating visualization.

    Args:
        df: Leaderboard DataFrame. "Comment" and "Link" columns (if present)
            are consumed for tooltips/links but hidden from the rendered table.
        overall_col: Name of the score column rendered as stars.
        key: "Category" or "Language" — passed through to
            format_leaderboard_cell for numeric formatting.

    Returns:
        str: Full HTML table wrapped in a scrollable container div.
    """

    # Force column order
    desired_order = ["Rank", "Model Name", "Link", "Type", "Model Type", "Think", "Overall"]
    cols = list(df.columns)
    # Remaining columns
    rest = [c for c in cols if c not in desired_order]
    new_cols = []
    for c in desired_order:
        if c in cols:
            new_cols.append(c)
    new_cols += rest
    df = df[new_cols]

    # Columns to hide
    hidden_cols = ["Comment", "Link"]

    # Build table header
    def get_sort_arrow():
        # Arrow buttons removed as requested
        return ""

    # Extract sort state (from State or use default)
    # NOTE(review): sort_col / sort_asc are computed but never used below —
    # presumably left over from the removed sort-arrow feature.
    sort_col = getattr(df, "_sort_col", None) or (df.columns[0] if len(df.columns) > 0 else None)
    sort_asc = getattr(df, "_sort_asc", None)
    if sort_asc is None:
        sort_asc = True

    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
    for col in df.columns:
        if col in hidden_cols:
            continue
        # Info icon for Model Name, Med. Len. and Med. Resp. Len.
        if col == "Model Name":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == "Med. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of think and response for the model.">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == "Med. Resp. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of the model\'s responses (excluding think).">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == overall_col:
            html += f'<th style="min-width: 120px; max-width: 300px; width: 150px;">{col}{get_sort_arrow()}</th>'
        else:
            html += f'<th>{col}{get_sort_arrow()}</th>'
    html += '</tr></thead>\n<tbody>\n'

    # Build table rows
    for _, row in df.iterrows():
        html += '<tr>'
        for col in df.columns:
            if col in hidden_cols:
                continue
            cell = row[col]
            if col == overall_col:
                try:
                    # Use "Model Name" of the row as unique_id
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                html += f'<td style="min-width: 120px; max-width: 300px; width: 150px;">{cell_html}</td>'
            elif col == "Rank":
                # For 1st, 2nd, and 3rd place, emphasize with medal emoji and color
                medal = ""
                style = "color: #fff; font-weight: 600;"
                if cell == 1 or cell == "1":
                    medal = "🥇"
                    style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif cell == 2 or cell == "2":
                    medal = "🥈"
                    style = "color: #b0b0b0; font-weight: bold;"
                elif cell == 3 or cell == "3":
                    medal = "🥉"
                    style = "color: #cd7f32; font-weight: bold;"
                html += f'<td><span style="{style}">{medal if medal else cell}</span></td>'
            elif col in ["Model Name"]:
                # Only highlight top 1~3, do not apply badge
                rank = row.get("Rank", None)
                highlight_style = ""
                if rank == 1 or rank == "1":
                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2 or rank == "2":
                    highlight_style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3 or rank == "3":
                    highlight_style = "color: #cd7f32; font-weight: bold;"
                else:
                    highlight_style = "color: #fff; font-weight: 600;"
                display_name = get_display_model_name(str(cell))

                # --- Start of new logic for tooltip ---
                comment_value = ""
                # Check if 'Comment' column exists and the value is not NaN/empty
                if "Comment" in row and pd.notna(row["Comment"]) and str(row["Comment"]).strip() != "":
                    comment_value = str(row["Comment"]).strip()

                title_attribute = f' title="{comment_value}"' if comment_value else ""
                # --- End of new logic for tooltip ---

                # Link logic
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name

                html += f'<td><span style="{highlight_style}"{title_attribute}>{clickable_name}</span></td>'
            elif col == "Type":
                html += f'<td>{get_type_badge(row.get("Type", ""))}</td>'
            elif col == "Model Type":
                html += f'<td>{get_model_type_badge(row.get("Model Type", ""))}</td>'
            elif col == "Think":
                html += f'<td>{get_think_badge(row.get("Think", ""))}</td>'
            else:
                html += f'<td>{format_leaderboard_cell(cell, col, key)}</td>'
        html += '</tr>\n'
    html += '</tbody></table>'
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{html}</div>'
src/display/utils.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
def fields(raw_class):
    """Return the values of all non-dunder attributes declared directly on *raw_class*."""
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if not (attr_name.startswith("__") or attr_name.endswith("__")):
            collected.append(attr_value)
    return collected
6
+
7
## All the model information that we might need
@dataclass
class ModelDetails:
    """Display metadata for a model."""
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class Precision(Enum):
    """Known weight precisions; `Unknown` covers anything unrecognized."""
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    fp8 = ModelDetails("fp8")
    int4 = ModelDetails("int4")
    Unknown = ModelDetails("?")

    # FIX: previously a plain function in the Enum body; as a classmethod it
    # also works when called on an instance, and the call site
    # Precision.from_str(...) is unchanged.
    @classmethod
    def from_str(cls, precision: str) -> "Precision":
        """Map a precision string (e.g. "torch.float16") to a member; unmatched -> Unknown."""
        if precision in ("torch.float16", "float16"):
            return cls.float16
        if precision in ("torch.bfloat16", "bfloat16"):
            return cls.bfloat16
        if precision == "fp8":
            return cls.fp8
        if precision == "int4":
            return cls.int4
        return cls.Unknown
src/envs.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
# NOTE: TOKEN is None when HF_TOKEN is unset; HfApi then falls back to
# anonymous access and writes will fail.

OWNER = "SamsungResearch"
# ----------------------------------

# Hub repo ids: the Space itself, submission queues, and result storage.
REPO_ID = f"{OWNER}/TRUEBench"
QUEUE_REPO = f"{OWNER}/SR_Leaderboard_Requests"
FAILED_QUEUE_REPO = f"{OWNER}/SR_Leaderboard_Failed_Requests"
RESULTS_REPO = f"{OWNER}/SR_Leaderboard_Results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches
# The *_BACKEND variants are scratch copies used by the evaluation backend.
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
FAILED_EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "failed-eval-queue")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
FAILED_EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "failed-eval-queue-bk")

# Shared authenticated Hub client used across the app.
API = HfApi(token=TOKEN)
src/submission/check_validity.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Check that the repo has a model card with license metadata and a real description.

    Returns (ok, message); message is empty on success.
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata: either `license`, or a name/link pair.
    license_pair_present = "license_name" in card.data and "license_link" in card.data
    if card.data.license is None and not license_pair_present:
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # Enforce card content: require a minimally substantive description.
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
33
+
34
def get_model_size(model_info: ModelInfo, precision: str):
    """Model size in billions of parameters, scaled x8 for GPTQ-quantized models.

    Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py.
    """
    try:
        raw_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        return 0
    gptq_model = precision == "GPTQ" or "gptq" in model_info.modelId.lower()
    size_factor = 8 if gptq_model else 1
    return size_factor * raw_size
44
+
45
def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    # NOTE(review): assumes model_info.config is dict-like with an
    # "architectures" key — confirm against the huggingface_hub version in
    # use; some versions expose this under model_info.config["architectures"]
    # only when full metadata is fetched.
    return model_info.config.get("architectures", "Unknown")
48
+
49
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict]:
    """Gather already-submitted models to avoid duplicates.

    Walks `requested_models_dir` exactly two levels deep
    (<org>/<benchmark>_<model>/) and reads each request JSON found there.

    Args:
        requested_models_dir: Root of the local eval-request cache.

    Returns:
        tuple:
            - set of "<benchmark>_<model>" identifiers already submitted
            - dict mapping organisation -> list of submission records
              ({"benchmark", "model", "submitted_time"})

    Note: the annotation previously claimed `set[str]` although the function
    has always returned this 2-tuple.
    """
    depth = 2
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth != depth:
            continue
        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r") as f:
                info = json.load(f)

            file_names.append(f"{info['benchmark']}_{info['model']}")

            # Track per-organisation submission dates: only "org/model" ids
            # with a recorded timestamp contribute.
            if info["model"].count("/") == 0 or "submitted_time" not in info:
                continue
            organisation, _ = info["model"].split("/")
            users_to_submission_dates[organisation].append(
                {"benchmark": info["benchmark"], "model": info["model"], "submitted_time": info["submitted_time"]}
            )

    return set(file_names), users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import yaml
3
+ import os
4
+ import re
5
+ from datetime import datetime, timezone, timedelta
6
+ from typing import Optional
7
+ from src.display.formatting import styled_error, styled_message, styled_warning
8
+ from src.envs import API, EVAL_REQUESTS_PATH, FAILED_EVAL_REQUESTS_PATH, TOKEN, FAILED_QUEUE_REPO, QUEUE_REPO, REPO_ID
9
+ from src.submission.check_validity import (
10
+ already_submitted_models,
11
+ check_model_card,
12
+ get_model_size
13
+ )
14
+ import gradio as gr
15
+ from utils import download_with_restart
16
+ from huggingface_hub import snapshot_download
17
+
18
+ REQUESTED_MODELS = None
19
+ USERS_TO_SUBMISSION_DATES = None
20
+
21
def restart_space():
    # Restart the hosting Space; passed as `restart_func` to
    # download_with_restart so a failed/partial snapshot sync can recover.
    API.restart_space(repo_id=REPO_ID)
23
+
24
def add_new_eval_option(
    contact_email: str,
    model: str,
    model_type: str,
    think_type: str,
    precision: str,
    response_prefix: str,
    requirements: str,
    user_state: str,
    organization_list: list,
    yml_textbox: str,
    upbox,
):
    """
    Validate a leaderboard submission and enqueue it for evaluation.

    Runs every validation check (email format, submitter rights, rate
    limits, model-card/license presence, think prefix, YAML config) while
    recording only the FIRST failure in ERROR_MESSAGE. A request JSON is
    always written and uploaded: to QUEUE_REPO on success, or to
    FAILED_QUEUE_REPO (with the error message) on failure.

    Args:
        contact_email: Submitter email; must match a basic address regex.
        model: Hub id, ideally "org/model"; the org part gates permissions.
        model_type: e.g. "Think" / "Instruct" / "Hybrid".
        think_type: "On" or "Off"; "On" requires response_prefix.
        precision: Precision label; only the token before the first space is kept.
        response_prefix: Prefix marking the response when think mode is on.
        requirements: Free-text pip requirements recorded in the request.
        user_state: Authenticated username of the submitter.
        organization_list: Orgs the submitter belongs to.
        yml_textbox: Inline YAML config (used when no file is uploaded).
        upbox: Optional uploaded YAML file (Gradio file object with .name).

    Returns:
        str: styled_message HTML on success, styled_error HTML on failure.
    """

    ERROR_MESSAGE = None

    # Validate email format
    email_regex = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
    if not re.match(email_regex, contact_email):
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "Please provide a valid email address."

    # Synchronize: Just before submission, copy the latest QUEUE_REPO to EVAL_REQUESTS_PATH
    download_with_restart(
        snapshot_download,
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )

    # Synchronize: Just before submission, copy the latest FAILED_QUEUE_REPO to FAILED_EVAL_REQUESTS_PATH
    download_with_restart(
        snapshot_download,
        repo_id=FAILED_QUEUE_REPO,
        local_dir=FAILED_EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )

    # Local assignment intentionally shadows the module-level globals.
    REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]
    # Timestamps are recorded in Korea Standard Time (UTC+9).
    KST = timezone(timedelta(hours=9))
    current_time = datetime.now(KST).strftime("%Y-%m-%dT%H:%M:%S %z")

    # Remove space in benchmark name
    benchmark = "TRUEBench"

    # Check submitter qualification

    if user_name != user_state and user_name not in organization_list:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The submitter does not have submission rights for this model."

    # Does the organization submit more than three times in a day?
    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
    submission_cnt = 0
    for i in range(len(submission_times)):
        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
        if hours_diff <= 24:
            submission_cnt += 1
    if submission_cnt >= 3:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The organization already submitted three times for this benchmark today."

    # Does the model actually exist?
    revision = "main"

    # Is the model info correctly filled?
    model_info = None
    model_size = "Unknown"
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
        model_size = get_model_size(model_info=model_info, precision=precision)
    except Exception:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "Could not get your model information. Please fill it up properly."

    # Were the model card and license filled?
    license = "Unknown"
    if model_info is not None:
        try:
            license = model_info.cardData["license"]
        except Exception:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please select a license for your model."

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = error_msg

    # Response prefix check
    if think_type == "On":
        if response_prefix == "":
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "It is required to fill in the response prefix when 'Think' is 'On'."
    else:
        response_prefix = ""

    # Handle YAML config input (file or textbox)
    config_dict = None

    # Case 1: File uploaded
    if upbox is not None and getattr(upbox, "name", ""):
        file_name = upbox.name
        if not file_name.lower().endswith(".yaml") and not file_name.lower().endswith(".yml"):
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please submit a .yaml or .yml file."
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                config_dict = yaml.safe_load(f)
        except yaml.YAMLError:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "The file is not a valid YAML format."
        except Exception as e:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = f"An error occurred while reading the file. {e}"
        if config_dict is None:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "The YAML file is empty or invalid."
    else:
        # Case 2: No file uploaded
        if not yml_textbox or not yml_textbox.strip():
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please fill in the configuration box or submit a YAML file."
        try:
            config_dict = yaml.safe_load(yml_textbox)
        except yaml.YAMLError:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please provide a valid configuration."
        if config_dict is None:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please provide a valid configuration."

    # Restrict config keys
    allowed_keys = {"llm_serve_args", "sampling_params", "extra_body"}
    if not isinstance(config_dict, dict):
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The configuration must be a YAML dictionary at the top level."
    # NOTE(review): when config_dict is not a dict (e.g. None from an empty
    # textbox, or a YAML list), the next line raises AttributeError instead
    # of returning the styled error recorded above — confirm and guard.
    extra_keys = set(config_dict.keys()) - allowed_keys
    if extra_keys:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = f"Only the following keys are allowed in the configuration: llm_serve_args, sampling_params, extra_body. Found invalid keys: {', '.join(sorted(extra_keys))}."

    configs = json.dumps(config_dict, indent=4, ensure_ascii=False)

    # Check for duplicate submission
    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
    submission_cnt = 0
    submission_total_cnt = 0
    for i in range(len(submission_times)):
        submission_total_cnt += 1
        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
        if hours_diff <= 24:
            submission_cnt += 1
    if submission_cnt >= 1:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "This model has been already submitted within 24 hours."
    if submission_total_cnt >= 3:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "This model has been already submitted three times for this benchmark."

    print("Creating eval file")
    # Route the request file to the success queue or the failed queue.
    if ERROR_MESSAGE is None:
        OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}/{benchmark}_{model_path}"
    else:
        OUT_DIR = f"{FAILED_EVAL_REQUESTS_PATH}/{user_name}/{benchmark}_{model_path}"
    os.makedirs(OUT_DIR, exist_ok=True)
    current_time_replaced = current_time.replace("-", "").replace(":", "").replace("T", "_").split()[0]
    out_path = f"{OUT_DIR}/{current_time_replaced}.json"

    # Seems good, creating the eval
    print("Adding new eval")

    if ERROR_MESSAGE is None:
        eval_entry = {
            "benchmark": benchmark,
            "contact_email": contact_email,
            "model": model,
            "type": "open",
            "model_type": model_type,
            "think_type": think_type,
            "precision": precision,
            "response_prefix": response_prefix,
            "requirements": requirements,
            "status": "PENDING",
            "submitted_time": current_time,
            "likes": getattr(model_info, "likes", -1),
            "params": model_size,
            "license": license,
            "private": False,
            "configs": configs
        }
    else:
        eval_entry = {
            "benchmark": benchmark,
            "contact_email": contact_email,
            "model": model,
            "type": "open",
            "model_type": model_type,
            "think_type": think_type,
            "precision": precision,
            "response_prefix": response_prefix,
            "requirements": requirements,
            "status": "Failed",
            "submitted_time": current_time,
            "likes": getattr(model_info, "likes", -1),
            "params": model_size,
            "license": license,
            "private": False,
            "configs": configs,
            "error_message": ERROR_MESSAGE
        }

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    if ERROR_MESSAGE is None:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    else:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("failed-eval-queue/")[1],
            repo_id=FAILED_QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to failed eval queue",
        )

    # Remove the local file
    os.remove(out_path)

    if ERROR_MESSAGE is None:
        return styled_message(
            "Your request has been submitted to the evaluation queue!"
        )
    else:
        return styled_error(
            ERROR_MESSAGE
        )
ui.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.display.formatting import render_leaderboard_html, get_display_model_name
3
+ from src.data_utils import get_length_category_list, get_length_category_df
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
def render_length_category_html(df, med_len_map=None):
    """Render the length-category leaderboard as an HTML table string.

    Model Name cells are colored by rank (gold/silver/bronze for the top
    three) instead of showing a Rank column, and carry Type / Model Type /
    Think badges. The "Overall" column always follows "Model Name"; a
    "Med. Len." column is inserted when ``med_len_map`` is provided.

    Args:
        df: Leaderboard DataFrame; must contain "Model Name" and "Overall".
        med_len_map: Optional mapping of model name -> median length value.

    Returns:
        An HTML string (a scrollable ``<div>`` wrapping a ``<table>``), or a
        plain "No data" ``<div>`` when ``df`` is None or empty.
    """
    if df is None or df.empty:
        return "<div>No data available.</div>"

    # Compute Rank based on Overall (descending); work on a copy so the
    # caller's DataFrame is never mutated.
    df = df.copy()
    # 1. Sort so that empty strings come to the top first
    df = df.sort_values("Overall", key=lambda x: (x == "").astype(int))
    # 2. Then sort the actual values in descending order (empty strings are
    #    already at the top; mergesort is stable, so they stay there)
    df = df.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
    # method="min" gives tied models the same (best) rank.
    df["Rank_Internal"] = df["Overall"].rank(method="min", ascending=False).astype(int)

    # Ensure Think and Model Type columns exist for badge rendering.
    if "Type" not in df.columns:
        df["Type"] = "unknown"
    if "Model Type" not in df.columns:
        df["Model Type"] = "unknown"
    if "Think" not in df.columns:
        df["Think"] = "unknown"

    # Optionally add Med. Len. column from the provided mapping.
    if med_len_map is not None:
        df["Med. Len."] = df["Model Name"].map(med_len_map)

    # Determine display columns: Model Name, Overall, Med. Len., {Category},
    # then the rest — excluding internal/meta columns.
    base_cols = [col for col in df.columns if col not in ["Rank_Internal", "Comment", "Group", "Link"]]

    # Find the dynamic category column (e.g., "Short", "Long", etc.).
    from src.data_utils import get_length_category_list
    category_cols = [col for col in get_length_category_list() if col in base_cols]
    category_col = category_cols[0] if category_cols else None

    # Build display_cols in the fixed order described above.
    display_cols = []
    if "Model Name" in base_cols:
        display_cols.append("Model Name")
    if "Overall" in base_cols:
        display_cols.append("Overall")
    if "Med. Len." in base_cols:
        display_cols.append("Med. Len.")
    if "Med. Resp. Len." in base_cols:
        display_cols.append("Med. Resp. Len.")
    if category_col:
        display_cols.append(category_col)
    for col in base_cols:
        if col not in display_cols:
            display_cols.append(col)

    # Build HTML table header; selected columns get an info icon tooltip.
    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
    for col in display_cols:
        if col == "Model Name":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.">ⓘ</span>'
                '</th>'
            )
        elif col == "Med. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of think and response for the model.">ⓘ</span>'
                '</th>'
            )
        elif col == "Med. Resp. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of the model\'s responses (excluding think).">ⓘ</span>'
                '</th>'
            )
        else:
            html += f'<th>{col}</th>'
    html += '</tr></thead>\n<tbody>\n'

    # --- Define number formatting function ---
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    def format_leaderboard_cell(cell, col):
        # NaN/empty strings pass through unchanged.
        if pd.isna(cell) or (isinstance(cell, str) and cell.strip() == ""):
            return cell
        try:
            if col in NUMERIC_INT_COLS_CATEGORY:
                # Integer (rounded)
                return str(int(round(float(cell))))
            elif col in NUMERIC_COLS_CATEGORY:
                # Two decimal places
                return "{:.2f}".format(float(cell))
            else:
                return str(cell)
        except Exception:
            # Non-numeric content in a numeric column: show as-is.
            return str(cell)

    for idx, row in df.iterrows():
        html += '<tr>'
        for col in display_cols:
            cell = row[col]
            if col == "Model Name":
                # Gold/Silver/Bronze text color for ranks 1/2/3.
                rank = row["Rank_Internal"]
                if rank == 1:
                    style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2:
                    style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3:
                    style = "color: #cd7f32; font-weight: bold;"
                else:
                    style = "color: #fff; font-weight: 600;"

                # Badge HTML: Type + Model Type + Think, concatenated in order.
                model_type = row["Model Type"] if "Model Type" in row else "unknown"
                think_type = row["Think"] if "Think" in row else "unknown"
                type_value = row["Type"] if "Type" in row else "unknown"
                from src.display.formatting import get_type_badge, get_think_badge, get_model_type_badge
                badge_html = (
                    get_type_badge(type_value)
                    + get_model_type_badge(model_type)
                    + get_think_badge(think_type)
                )

                display_name = get_display_model_name(str(cell))

                # Tooltip: show the (excluded-from-display) Comment column, if any.
                # NOTE(review): comment_value is interpolated into the title
                # attribute without HTML escaping — assumes comments contain
                # no quotes; confirm against the data source.
                comment_value = ""
                if "Comment" in row and pd.notna(row["Comment"]) and str(row["Comment"]).strip() != "":
                    comment_value = str(row["Comment"]).strip()
                title_attribute = f' title="{comment_value}"' if comment_value else ""

                # Wrap the name in a link when a non-empty Link value exists.
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name

                html += f'<td><span style="{style}"{title_attribute}>{clickable_name}</span>{badge_html}</td>'
            elif col == "Overall":
                # Show the score as stars; unique_id sanitizes the model name
                # into an identifier-safe string for the star widget.
                from src.display.formatting import get_score_stars
                try:
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    # Fall back to plain text for missing names / non-numeric scores.
                    cell_html = str(cell)
                html += f'<td>{cell_html}</td>'
            else:
                html += f'<td>{format_leaderboard_cell(cell, col)}</td>'
        html += '</tr>\n'
    html += '</tbody></table>'
    # Wrap in a scrollable div so the header can stay sticky via CSS.
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{html}</div>'
166
+
167
def render_length_category_table(leaderboard_df=None):
    """Build the Gradio UI for the per-category length table.

    Renders a Category dropdown plus an HTML table showing length stats for
    the selected category. "Overall" (and per-category) scores are merged in
    from ``leaderboard_df`` so ranking, coloring, and stars match the main
    leaderboard.

    Args:
        leaderboard_df: Main leaderboard DataFrame used as the source of
            Overall/Type/Model Type/Think values. May be None.

    Returns:
        Dict with the created components: ``category_selector`` (gr.Dropdown)
        and ``table_html`` (gr.HTML).
    """
    import gradio as gr

    categories = get_length_category_list()
    default_category = categories[0] if categories else ""

    # Merge Overall (and friends) from leaderboard_df into the category df.
    def get_merged_df(selected_category):
        df_cat = get_length_category_df(selected_category) if selected_category else None
        if leaderboard_df is not None and df_cat is not None:
            df_merged = df_cat.copy()
            # Use Overall and {Category} scores from leaderboard_df, keyed by model name.
            overall_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Overall"]))
            category_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df[selected_category]))
            df_merged["Overall"] = df_merged["Model Name"].map(overall_map)
            df_merged[selected_category] = df_merged["Model Name"].map(category_map)
            # Also map Type, Model Type and Think for badge rendering.
            if "Type" in leaderboard_df.columns:
                type_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Type"]))
                df_merged["Type"] = df_merged["Model Name"].map(type_map)
            if "Model Type" in leaderboard_df.columns:
                model_type_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Model Type"]))
                df_merged["Model Type"] = df_merged["Model Name"].map(model_type_map)
            if "Think" in leaderboard_df.columns:
                think_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Think"]))
                df_merged["Think"] = df_merged["Model Name"].map(think_map)
            # Drop models missing either the Overall or the category score.
            df_merged = df_merged[df_merged["Overall"].notna() & df_merged[selected_category].notna()]
            return df_merged
        # No leaderboard to merge with: return the raw category df (may be None).
        return df_cat

    df = get_merged_df(default_category)

    # Prepare med_len_map (model name -> median length) if available.
    med_len_map = None
    if leaderboard_df is not None and "Med. Len." in leaderboard_df.columns:
        med_len_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Med. Len."]))

    with gr.Column():
        category_selector = gr.Dropdown(
            choices=categories,
            value=default_category,
            label="Select Category for Length Table",
            interactive=True,
        )

        table_html = gr.HTML(
            value=render_length_category_html(df, med_len_map=med_len_map) if df is not None else "<div>No data available.</div>",
            elem_id="length-category-table"
        )

    # Re-render the table whenever the category selection changes.
    def update_table(selected_category):
        df = get_merged_df(selected_category)
        html = render_length_category_html(df, med_len_map=med_len_map)
        return html

    category_selector.change(
        fn=update_table,
        inputs=[category_selector],
        outputs=[table_html]
    )

    return {
        "category_selector": category_selector,
        "table_html": table_html,
    }
236
+
237
def create_leaderboard_tab(df, key):
    """Build one leaderboard tab (filters, sortable HTML table, CSV download).

    Args:
        df: DataFrame to display; must contain an "Overall" column.
        key: "Category" or "Language" — controls which sort columns are offered
            and the name of the exported CSV.

    Returns:
        Dict of the created Gradio components plus the ``unified_filter``
        callback (exposed for direct external calls), or None when ``df``
        lacks an "Overall" column.
    """
    # Always ensure "Overall" exists; bail out early otherwise.
    if "Overall" not in df.columns:
        return  # Or handle error appropriately

    df_state = gr.State(df)

    # Create DataFrame including badge information (for upper table).
    df_badge = df.copy()
    # If Overall values are in the range 0~1, convert to 0~100.
    if "Overall" in df_badge.columns and df_badge["Overall"].max() <= 1.0:
        df_badge["Overall"] = df_badge["Overall"] * 100
    # Remove Group column (display only).
    for col_to_drop in ["Group"]:
        if col_to_drop in df_badge.columns:
            df_badge = df_badge.drop(columns=[col_to_drop])
    # Handle error if "Overall" column does not exist after preprocessing.
    if "Overall" not in df_badge.columns:
        return  # Or handle error appropriately
    # Always sort by "Overall":
    # 1. Sort so that empty strings come to the top first.
    df_badge = df_badge.sort_values("Overall", key=lambda x: (x == "").astype(int))
    # 2. Then sort the actual values in descending order (stable mergesort
    #    keeps the empty strings at the top).
    df_badge = df_badge.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
    df_badge["Rank"] = df_badge.index + 1
    # Reorder "Rank" column to be right after "Model Name".
    cols = df_badge.columns.tolist()
    if "Model Name" in cols and "Rank" in cols:
        model_name_idx = cols.index("Model Name")
        cols.remove("Rank")
        cols.insert(model_name_idx + 1, "Rank")
        df_badge = df_badge[cols]

    with gr.Row():
        # Type Selector (Open/Proprietary)
        type_choices = ["Open", "Proprietary"]
        type_selector = gr.CheckboxGroup(
            choices=type_choices,
            value=type_choices,
            label="Select Type (Open/Proprietary)"
        )

        # Model Type Selector (Instruct/Think/Hybrid)
        model_type_choices = ["Instruct", "Think", "Hybrid"]
        model_type_selector = gr.CheckboxGroup(
            choices=model_type_choices,
            value=model_type_choices,
            label="Select Model Type (Instruct/Think/Hybrid)"
        )
        # Think Selector (On/Off)
        think_choices = ["On", "Off"]
        think_selector = gr.CheckboxGroup(
            choices=think_choices,
            value=think_choices,
            label="Select Think Mode (On/Off)"
        )
        # Sort criteria dropdown (always sorts descending).
        # For the language leaderboard, dynamically extract two-letter
        # language columns in addition to the fixed metrics.
        if key == "Language":
            import re
            language_columns = [col for col in df_badge.columns if re.fullmatch(r"[A-Z]{2}", col) or col == "VI"]
            available_sort_columns = ["Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"] + language_columns
        else:
            category_columns = [
                "Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)", "Content Generation", "Editing", "Data Analysis", "Reasoning",
                "Hallucination", "Safety", "Repetition", "Summarization", "Translation", "Multi-Turn"
            ]
            available_sort_columns = [col for col in category_columns if col in df_badge.columns]

        sort_col_dropdown = gr.Dropdown(
            choices=available_sort_columns,
            value="Overall",
            label="Sort by",
            interactive=True,
        )

    # Initial (unfiltered) render.
    leaderboard_html = render_leaderboard_html(df_badge.round(3), overall_col="Overall", key=key)
    leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")

    # Filtering + sorting callback shared by all selectors.
    def unified_filter(types, model_types, thinks, df, sort_col):
        """Filter df by the three selectors, sort by sort_col (descending),
        and return (html, sort_col, top5_models)."""
        filtered = df.copy()
        # Empty selections mean "no restriction": fall back to all values.
        if "Type" in filtered.columns and (not types or len(types) == 0):
            types = filtered["Type"].unique().tolist()
        if "Model Type" in filtered.columns and (not model_types or len(model_types) == 0):
            model_types = filtered["Model Type"].unique().tolist()
        if "Think" in filtered.columns and (not thinks or len(thinks) == 0):
            thinks = filtered["Think"].unique().tolist()
        # Case/whitespace-insensitive membership filtering per column.
        if "Type" in filtered.columns:
            filtered["Type"] = filtered["Type"].fillna("").astype(str)
            types_norm = [v.lower().strip() for v in types]
            filtered = filtered[filtered["Type"].str.lower().str.strip().isin(types_norm)]
        if "Model Type" in filtered.columns:
            filtered["Model Type"] = filtered["Model Type"].fillna("").astype(str)
            model_types_norm = [v.lower().strip() for v in model_types]
            filtered = filtered[filtered["Model Type"].str.lower().str.strip().isin(model_types_norm)]
        if "Think" in filtered.columns:
            filtered["Think"] = filtered["Think"].fillna("").astype(str)
            thinks_norm = [v.lower().strip() for v in thinks]
            filtered = filtered[filtered["Think"].str.lower().str.strip().isin(thinks_norm)]
        # Defensive: always ensure "Overall" exists.
        if "Overall" not in filtered.columns:
            html = "<div style='color:red'>No 'Overall' column found in data. Please check your input data.</div>"
            # BUGFIX: the event wiring declares three outputs, so the error
            # path must also return three values (empty top-5 list).
            return html, sort_col, []
        # Always sort in descending order. Replacing empty strings with
        # np.inf pushes them to the top of a descending sort.
        sort_col_for_sort = filtered[sort_col].replace('', np.inf).astype(float)
        filtered = filtered.assign(sort_col_tmp=sort_col_for_sort)
        filtered = filtered.sort_values('sort_col_tmp', ascending=False, kind="mergesort").reset_index(drop=True)
        filtered = filtered.drop(columns=['sort_col_tmp'])
        # Add "Rank" and reorder it to sit right after "Model Name".
        filtered["Rank"] = filtered.index + 1
        cols = filtered.columns.tolist()
        if "Model Name" in cols and "Rank" in cols:
            model_name_idx = cols.index("Model Name")
            cols.remove("Rank")
            cols.insert(model_name_idx + 1, "Rank")
            filtered = filtered[cols]
        # Always remove Group column from the display.
        for col_to_drop in ["Group"]:
            if col_to_drop in filtered.columns:
                filtered = filtered.drop(columns=[col_to_drop])
        filtered._sort_col = sort_col
        # Extract top-5 models under the current sort.
        top5_models = []
        if sort_col in filtered.columns and "Model Name" in filtered.columns:
            sort_col_for_sort = filtered[sort_col].replace('', np.inf).astype(float)
            filtered_df_sorted = filtered.assign(sort_col_tmp=sort_col_for_sort)
            filtered_df_sorted = filtered_df_sorted.sort_values('sort_col_tmp', ascending=False, kind="mergesort").reset_index(drop=True)
            top5_models = filtered_df_sorted["Model Name"].tolist()[:5]
        return render_leaderboard_html(filtered, overall_col="Overall", key=key), sort_col, top5_models

    # CSV export callback for the download button.
    def dataframe_to_csv(data):
        import pandas as pd
        # Convert if data is not a DataFrame.
        if isinstance(data, pd.DataFrame):
            df = data.copy()  # Copy so the in-memory DataFrame stays untouched.
        else:
            df = pd.DataFrame(data)

        # Export display names rather than raw identifiers.
        if "Model Name" in df.columns:
            df["Model Name"] = df["Model Name"].apply(get_display_model_name)

        csv_path = f"truebench_{key}.csv"
        df.to_csv(csv_path, index=False)
        return csv_path

    # DownloadButton, right-aligned via an empty expanding column.
    with gr.Row():
        with gr.Column(scale=1):
            pass  # Empty space
        with gr.Column(scale=0):
            download_btn = gr.DownloadButton(
                label="📥 Download to CSV",
                value=dataframe_to_csv,
                inputs=[df_state],
                visible=True,
                elem_classes=["custom-download-btn"]
            )

    # Custom CSS for the download button.
    custom_css = """
    <style>
    .custom-download-btn >>> a {
        background: #e3e6f3 !important;
        color: #222 !important;
        border: 1px solid rgba(0, 0, 0, 0.1) !important;
        border-radius: 6px !important;
        padding: 1px 1px !important;
        font-size: 13px !important;
        font-weight: bold !important;
        text-shadow: 0 1px 1px rgba(0,0,0,0.1) !important;
        margin: 0 3px 3px 0 !important;
    }
    .custom-download-btn:hover {
        background: #f5f6fa !important;
        box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1) !important;
    }
    </style>
    """
    gr.HTML(custom_css)

    # Every selector re-runs the same unified filter.
    sort_col_dropdown.change(
        fn=unified_filter,
        inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
        outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]  # third output receives top5_models
    )
    type_selector.change(
        fn=unified_filter,
        inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
        outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
    )
    model_type_selector.change(
        fn=unified_filter,
        inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
        outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
    )
    think_selector.change(
        fn=unified_filter,
        inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
        outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
    )

    return {
        "type_selector": type_selector,
        "model_type_selector": model_type_selector,
        "think_selector": think_selector,
        "leaderboard_html_comp": leaderboard_html_comp,
        "sort_col_dropdown": sort_col_dropdown,
        "df_state": df_state,
        "unified_filter": unified_filter  # Exposed for direct external call
    }
utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import yaml
3
+ import json
4
+ import gradio as gr
5
+ from huggingface_hub import whoami
6
+
7
def get_profile(profile: gr.OAuthProfile | None) -> str:
    """Return the logged-in user's username, or "Anonymous" when no OAuth profile exists."""
    return "Anonymous" if profile is None else profile.username
11
+
12
def get_organizations(oauth_token: gr.OAuthToken | None) -> list[str] | str:
    """Return the names of the organizations the token's user belongs to.

    Args:
        oauth_token: Gradio OAuth token, or None when the user is logged out.

    Returns:
        A list of organization names on success, or the sentinel string
        "No Organization" when no token is available.
        (Annotation fixed: the original declared ``-> str`` but the success
        path returns a list.)
    """
    if oauth_token is None:
        return "No Organization"
    # whoami() queries the Hugging Face Hub with the user's access token.
    org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
    return org_names
17
+
18
def get_profile_and_organizations(
    profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None
) -> tuple[str, list[str] | str]:
    """Return the user's display name and organization names in one call.

    Args:
        profile: Gradio OAuth profile, or None when the user is logged out.
        oauth_token: Gradio OAuth token, or None when the user is logged out.

    Returns:
        Tuple of (username or "Anonymous", list of org names or
        "No Organization"). (Annotation fixed: the original declared
        ``tuple[str, str]`` but the second element is a list on success.)
    """
    if profile is None:
        output_profile = "Anonymous"
    else:
        output_profile = profile.username

    if oauth_token is None:
        output_org = "No Organization"
    else:
        # whoami() queries the Hugging Face Hub with the user's access token.
        output_org = [org["name"] for org in whoami(oauth_token.token)["orgs"]]

    return output_profile, output_org
30
+
31
def download_with_restart(snapshot_download_func, repo_id, local_dir, repo_type, token, restart_func):
    """Download a Hub snapshot; trigger a restart if the download fails.

    Args:
        snapshot_download_func: Callable with huggingface_hub's
            ``snapshot_download`` signature.
        repo_id: Repository to download.
        local_dir: Destination directory.
        repo_type: Hub repo type (e.g. "dataset").
        token: Access token passed through to the download call.
        restart_func: Zero-argument callable invoked on any failure.
    """
    download_kwargs = dict(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type=repo_type,
        tqdm_class=None,
        etag_timeout=30,
        token=token,
    )
    try:
        snapshot_download_func(**download_kwargs)
    except Exception:
        # Any failure (network, auth, timeout, ...) restarts the app.
        restart_func()
vis_utils.py ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import plotly.graph_objects as go
4
+ from plotly.graph_objs._figure import Figure
5
+ from typing import Optional, List, Dict, Any
6
+ from src.display.formatting import get_display_model_name
7
+
8
# Maps human-readable sort labels (as shown in the UI) to the underlying
# DataFrame column names used when sorting.
SORT_COLUMN_MAP = {
    "Average Accuracy": "Avg AC",
    "Tool Selection Quality": "Avg TSQ",
    "Session Cost": "Avg Total Cost"
}
13
+
14
def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
    """Return the plot color palette for the given theme.

    Args:
        theme: "dark" selects the dark palette; any other value (including
            the default "light") selects the light palette.

    Returns:
        Dict with keys: paper_bg, plot_bg, legend_font_color, legend_bg,
        annotation_color.
    """
    dark_palette = {
        "paper_bg": "#181c3a",  # darker blue-gray
        "plot_bg": "#181c3a",
        "legend_font_color": "#F5F6F7",
        "legend_bg": 'rgba(35,36,74,0.92)',  # slightly lighter than bg, but still dark
        "annotation_color": '#F5F6F7'
    }
    light_palette = {
        "paper_bg": "#23244a",  # deep blue-gray
        "plot_bg": "#23244a",
        "legend_font_color": "#F5F6F7",
        "legend_bg": 'rgba(35,36,74,0.92)',  # match bg for harmony
        "annotation_color": '#F5F6F7'
    }
    return dark_palette if theme == "dark" else light_palette
32
+
33
def create_empty_radar_chart(message: str) -> Figure:
    """Build a placeholder "Domain Performance Chart" showing *message* instead of data.

    Args:
        message: Text displayed (prefixed with a chart emoji) in the center
            of the otherwise empty figure.

    Returns:
        A fully styled but data-free plotly Figure.
    """
    fig = go.Figure()

    # Centered message box in paper coordinates.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="Verdana, sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )

    # Static title and bottom-right watermark shared by all empty charts.
    chart_title = dict(
        text="<b>Domain Performance Chart</b>",
        x=0.5,
        y=0.97,
        font=dict(
            size=22,
            family="Verdana, sans-serif",
            color="#F5F6F7",
            weight=700
        ),
    )
    watermark = dict(
        text="TRUEBench",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=800,
        margin=dict(t=100, b=80, l=80, r=80),
        title=chart_title,
        annotations=[watermark]
    )
    return fig
81
+
82
+ def create_len_overall_scatter(
83
+ df: pd.DataFrame,
84
+ selected_models: Optional[List[str]] = None,
85
+ max_models: int = 30,
86
+ y_col: str = "Overall",
87
+ length_data: Optional[dict] = None,
88
+ theme: str = "light",
89
+ x_axis_data_source: str = "Med. Len."
90
+ ) -> Figure:
91
+ """
92
+ Create scatter plot showing Med. Len. vs selected y_col for up to 10 selected models.
93
+ Each dot is colored by Think (normal/reasoning), and the legend is by Think.
94
+ DataFrame must include an 'Think' column.
95
+ length_data: JSON data containing model length information by category
96
+ theme: "light" or "dark" (default: "light")
97
+ """
98
+ import plotly.express as px
99
+ import json
100
+
101
+ # Defensive: check required columns
102
+ required_cols = ['Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col]
103
+ for col in required_cols:
104
+ if col not in df.columns:
105
+ return create_empty_radar_chart(f"Column '{col}' not found in data")
106
+ # Think column check
107
+ think_col = None
108
+ for candidate in ['Think']:
109
+ if candidate in df.columns:
110
+ think_col = candidate
111
+ break
112
+ if think_col is None:
113
+ return create_empty_radar_chart("Column 'Think' not found in data")
114
+ # Filter by selected_models
115
+ if selected_models is not None and len(selected_models) > 0:
116
+ df_filtered = df[df['Model Name'].isin(selected_models)].copy()
117
+ else:
118
+ # Default: top-N by Overall
119
+ df_filtered = df.copy()
120
+ df_filtered = df_filtered.sort_values('Overall', ascending=False).head(max_models)
121
+ if df_filtered.empty:
122
+ return create_empty_radar_chart(f"No data available for {x_axis_data_source} vs {y_col} analysis")
123
+
124
+ # Determine x-axis data based on x_axis_data_source
125
+ x_axis_col_name = x_axis_data_source # Use this for the DataFrame column
126
+ length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'
127
+
128
+ if y_col == "Overall":
129
+ # For 'Overall' category, prefer direct DataFrame column reading
130
+ df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')
131
+ elif length_data:
132
+ # For other categories, use length_data if available
133
+ df_filtered[x_axis_col_name] = df_filtered['Model Name'].apply(
134
+ lambda x: length_data.get(x, {}).get(y_col, {}).get(length_data_key, 0)
135
+ )
136
+ else:
137
+ # Fallback if no length_data and not 'Overall' (though this case should ideally be handled by required_cols)
138
+ df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')
139
+
140
+ df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')
141
+ if 'Type' in df_filtered.columns:
142
+ df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary']
143
+ if 'Parameter Size (B)' in df_filtered.columns:
144
+ df_filtered['Parameter Size (B)'] = pd.to_numeric(df_filtered['Parameter Size (B)'], errors='coerce')
145
+ min_size = 20
146
+ max_size = 80
147
+ param_sizes = df_filtered['Parameter Size (B)'].fillna(5)
148
+ log_sizes = np.log10(param_sizes)
149
+ log_min = np.log10(5)
150
+ log_max = np.log10(param_sizes.max())
151
+ marker_sizes = min_size + ((log_sizes - log_min) / (log_max - log_min)) * (max_size - min_size)
152
+ else:
153
+ marker_sizes = [30] * len(df_filtered)
154
+
155
+ legend_name_map = {
156
+ 'On': 'Thinking',
157
+ 'Off': 'Non-Thinking'
158
+ }
159
+ color_palette = {
160
+ "Thinking": "#FCE39B",
161
+ "Non-Thinking": "#FF9185"
162
+ }
163
+ df_filtered['MarkerType'] = df_filtered['Parameter Size (B)'].apply(
164
+ lambda x: 'circle' if pd.notna(x) else 'star'
165
+ )
166
+ df_filtered['ThinkDisplay'] = df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])
167
+ prefix_map = {
168
+ 'circle': 'Open',
169
+ 'star': 'Proprietary'
170
+ }
171
+ combinations = df_filtered[['ThinkDisplay', 'MarkerType']].drop_duplicates()
172
+ marker_order = {'circle': 0, 'star': 1}
173
+ think_order = {'Thinking': 0, 'Non-Thinking': 1}
174
+ combinations['sort_key'] = combinations.apply(
175
+ lambda row: (marker_order.get(row['MarkerType'], 99), think_order.get(row['ThinkDisplay'], 99)),
176
+ axis=1
177
+ )
178
+ combinations = combinations.sort_values('sort_key')
179
+
180
+ fig = go.Figure()
181
+ legend_shown = set()
182
+ median_x = df_filtered[x_axis_col_name].median()
183
+ median_y = df_filtered[y_col].median()
184
+
185
+ x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")
186
+
187
+ fig.add_vline(
188
+ x=median_x,
189
+ line_dash="dash",
190
+ line_color="#64748B",
191
+ opacity=0.6,
192
+ line_width=1.5,
193
+ annotation_text=f"{x_axis_display_name}",
194
+ annotation_position="top right",
195
+ annotation_font=dict(size=10, color="#64748B")
196
+ )
197
+ fig.add_hline(
198
+ y=median_y,
199
+ line_dash="dash",
200
+ line_color="#64748B",
201
+ opacity=0.6,
202
+ line_width=1.5,
203
+ annotation_text=f"Median {y_col}",
204
+ annotation_position="bottom right",
205
+ annotation_font=dict(size=10, color="#64748B")
206
+ )
207
+
208
+ for _, row in combinations.iterrows():
209
+ think = row['ThinkDisplay']
210
+ marker_type = row['MarkerType']
211
+ prefix = prefix_map.get(marker_type, '')
212
+ legend_name = f"{prefix} {think}"
213
+ sub_df = df_filtered[
214
+ (df_filtered['ThinkDisplay'] == think) &
215
+ (df_filtered['MarkerType'] == marker_type)
216
+ ]
217
+ color = color_palette.get(think, "#1098F7")
218
+ sub_marker_sizes = (
219
+ marker_sizes[sub_df.index]
220
+ if 'Parameter Size (B)' in df_filtered.columns and marker_type == 'circle'
221
+ else [30] * len(sub_df)
222
+ )
223
+ show_legend = legend_name not in legend_shown
224
+ legend_shown.add(legend_name)
225
+ fig.add_trace(go.Scatter(
226
+ x=sub_df[x_axis_col_name],
227
+ y=sub_df[y_col],
228
+ mode='markers+text',
229
+ name=legend_name,
230
+ legendgroup=legend_name,
231
+ showlegend=show_legend,
232
+ marker_symbol=marker_type,
233
+ marker=dict(
234
+ size=sub_marker_sizes,
235
+ color=color,
236
+ opacity=0.85,
237
+ line=dict(width=2, color='#01091A')
238
+ ),
239
+ text=sub_df['Model Name'].apply(get_display_model_name),
240
+ textposition="top center",
241
+ textfont=dict(size=10, color='#94A3B8'),
242
+ hovertemplate="<b>%{text}</b><br>" +
243
+ f"{x_axis_display_name}: "+"%{x:.2f}<br>" +
244
+ f"{y_col}: "+"%{y:.2f}<br>" +
245
+ f"Think: {legend_name}<br>" +
246
+ ("Parameter Size: %{customdata}B<br>" if marker_type == 'circle' else "") +
247
+ "<extra></extra>",
248
+ customdata=sub_df['Parameter Size (B)'].values if marker_type == 'circle' else None
249
+ ))
250
+
251
+ # Theme colors
252
+ theme_colors = get_theme_colors(theme)
253
+ fig.update_layout(
254
+ title=dict(
255
+ text=f"<b>{y_col} {x_axis_display_name} vs Category Score</b>",
256
+ x=0.5,
257
+ y=0.97,
258
+ font=dict(size=22, family="Verdana, sans-serif", color=theme_colors["legend_font_color"], weight=700)
259
+ ),
260
+ xaxis=dict(
261
+ title=dict(
262
+ text=f"<b>{y_col} {x_axis_display_name}</b>",
263
+ font=dict(size=16, color=theme_colors["legend_font_color"])
264
+ ),
265
+ tickfont=dict(size=12, color="#94A3B8"),
266
+ gridcolor="rgba(245, 246, 247, 0.1)",
267
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
268
+ ),
269
+ yaxis=dict(
270
+ title=dict(
271
+ text=f"<b>{y_col} Score</b>",
272
+ font=dict(size=16, color=theme_colors["legend_font_color"])
273
+ ),
274
+ tickfont=dict(size=12, color="#94A3B8"),
275
+ gridcolor="rgba(245, 246, 247, 0.1)",
276
+ zerolinecolor="rgba(245, 246, 247, 0.2)"
277
+ ),
278
+ paper_bgcolor=theme_colors["paper_bg"],
279
+ plot_bgcolor=theme_colors["plot_bg"],
280
+ height=900,
281
+ width=1450,
282
+ showlegend=True,
283
+ legend=dict(
284
+ orientation="h",
285
+ yanchor="bottom",
286
+ y=1,
287
+ xanchor="center",
288
+ x=0.5,
289
+ font=dict(size=12, family="Verdana, sans-serif", color=theme_colors["legend_font_color"]),
290
+ bgcolor=theme_colors["legend_bg"],
291
+ bordercolor='rgba(245, 246, 247, 0.2)',
292
+ borderwidth=1
293
+ ),
294
+ margin=dict(t=100, b=80, l=80, r=80)
295
+ )
296
+ return fig
297
+
298
def create_language_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across languages for the selected models.

    Args:
        df: Leaderboard data containing a 'Model Name' column and one numeric
            column per language code listed in ``language_domains``.
        metric_type: Display metric name; mapped through SORT_COLUMN_MAP to the
            column used to pick the top models when none are explicitly selected.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by the metric are used instead.
        max_models: Upper bound on the number of models drawn.
        theme: "light" or "dark" (default: "light").

    Returns:
        A plotly Figure with one closed Scatterpolar trace per model.
    """
    language_domains = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']

    # Fall back to the top-N models by the requested metric when the caller
    # did not select any models explicitly.
    if selected_models is None or len(selected_models) == 0:
        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        # Coerce each language score to float; missing/NaN/empty become 0.
        values = []
        for lang in language_domains:
            val = model_row[lang] if lang in model_row else 0
            if pd.isna(val) or val == '':
                val = 0
            else:
                val = float(val)
            values.append(val)

        # Repeat the first point so the polygon closes.
        values_plot = values + [values[0]]
        domains_plot = language_domains + [language_domains[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                             "<span style='color: #94A3B8'>%{theta}</span><br>" +
                             "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                             "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            domain=dict(x=[0, 1], y=[0, 1]),
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            # BUG FIX: the original set angularaxis.ticktext to ten *category*
            # labels ("📝 Content Gen", …) copy-pasted from the domain chart,
            # while this chart's theta axis has twelve *language* codes.
            # Dropping the bogus ticktext lets the language codes label the axis.
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Language Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig
478
+
479
def load_leaderboard_data() -> pd.DataFrame:
    """Load and prepare the leaderboard data (Category)."""
    # Import kept local to the function, mirroring the original (presumably
    # to defer loading src.data_loader until first use — TODO confirm).
    from src import data_loader
    return data_loader.get_category_dataframe(processed=True)
483
+
484
def load_leaderboard_language_data() -> pd.DataFrame:
    """Load and prepare the leaderboard data (Language)."""
    # Import kept local to the function, mirroring the original (presumably
    # to defer loading src.data_loader until first use — TODO confirm).
    from src import data_loader
    return data_loader.get_language_dataframe(processed=True)
488
+
489
def create_domain_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across domains for the selected metric.

    Args:
        df: Leaderboard data containing a 'Model Name' column and one numeric
            column per category domain listed in ``domains``.
        metric_type: Display metric name; mapped through SORT_COLUMN_MAP. Only
            the metrics in ``supported_metrics`` have a domain breakdown; any
            other metric yields an empty placeholder chart.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` rows by the metric are used instead.
        max_models: Upper bound on the number of models drawn.
        theme: "light" or "dark" (default: "light").

    Returns:
        A plotly Figure with one closed Scatterpolar trace per model, or an
        empty radar chart when the metric has no domain breakdown.
    """
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    # The same ten category domains are plotted for every supported metric.
    # (The original kept five near-identical dicts whose per-metric display
    # labels were never used — `zip(domains, domain_columns)` discarded them.)
    domains = [
        'Content Generation', 'Editing', 'Data Analysis', 'Reasoning',
        'Hallucination', 'Safety', 'Repetition', 'Summarization',
        'Translation', 'Multi-Turn'
    ]
    supported_metrics = {'Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    if actual_metric_type not in supported_metrics:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    # Fall back to the top-N models by the requested metric when the caller
    # did not select any models explicitly.
    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        # Coerce each domain score to float; missing columns/NaN/empty become 0.
        values = []
        for domain in domains:
            if domain in df.columns and domain in model_row:
                val = model_row[domain]
                if pd.isna(val) or val == '':
                    val = 0
                else:
                    val = float(val)
                values.append(val)
            else:
                values.append(0)

        # Repeat the first point so the polygon closes.
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                             "<span style='color: #94A3B8'>%{theta}</span><br>" +
                             "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                             "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Category Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig