Spaces:
Running
Running
송종윤/AI Productivity팀(SR)/삼성전자 commited on
Commit ·
8a254d6
0
Parent(s):
Initial commit
Browse filesInitial commit
minor
- .gitattributes +35 -0
- .gitignore +13 -0
- .pre-commit-config.yaml +53 -0
- Makefile +13 -0
- NOTICE +4 -0
- README.md +14 -0
- app.py +654 -0
- constants.py +20 -0
- pyproject.toml +13 -0
- requirements.txt +16 -0
- src/about.py +181 -0
- src/config.py +61 -0
- src/data/length_data.json +1906 -0
- src/data/stats.csv +48 -0
- src/data/stats_lang.csv +48 -0
- src/data_loader.py +122 -0
- src/data_utils.py +50 -0
- src/display/css_html_js.py +766 -0
- src/display/formatting.py +305 -0
- src/display/utils.py +31 -0
- src/envs.py +28 -0
- src/submission/check_validity.py +72 -0
- src/submission/submit.py +279 -0
- ui.py +462 -0
- utils.py +42 -0
- vis_utils.py +723 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
auto_evals/
|
| 2 |
+
venv/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.env
|
| 5 |
+
.ipynb_checkpoints
|
| 6 |
+
*ipynb
|
| 7 |
+
.vscode/
|
| 8 |
+
|
| 9 |
+
eval-queue/
|
| 10 |
+
eval-results/
|
| 11 |
+
eval-queue-bk/
|
| 12 |
+
eval-results-bk/
|
| 13 |
+
logs/
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
default_language_version:
|
| 16 |
+
python: python3
|
| 17 |
+
|
| 18 |
+
ci:
|
| 19 |
+
autofix_prs: true
|
| 20 |
+
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
|
| 21 |
+
autoupdate_schedule: quarterly
|
| 22 |
+
|
| 23 |
+
repos:
|
| 24 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 25 |
+
rev: v4.3.0
|
| 26 |
+
hooks:
|
| 27 |
+
- id: check-yaml
|
| 28 |
+
- id: check-case-conflict
|
| 29 |
+
- id: detect-private-key
|
| 30 |
+
- id: check-added-large-files
|
| 31 |
+
args: ['--maxkb=1000']
|
| 32 |
+
- id: requirements-txt-fixer
|
| 33 |
+
- id: end-of-file-fixer
|
| 34 |
+
- id: trailing-whitespace
|
| 35 |
+
|
| 36 |
+
- repo: https://github.com/PyCQA/isort
|
| 37 |
+
rev: 5.12.0
|
| 38 |
+
hooks:
|
| 39 |
+
- id: isort
|
| 40 |
+
name: Format imports
|
| 41 |
+
|
| 42 |
+
- repo: https://github.com/psf/black
|
| 43 |
+
rev: 22.12.0
|
| 44 |
+
hooks:
|
| 45 |
+
- id: black
|
| 46 |
+
name: Format code
|
| 47 |
+
additional_dependencies: ['click==8.0.2']
|
| 48 |
+
|
| 49 |
+
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
| 50 |
+
# Ruff version.
|
| 51 |
+
rev: 'v0.0.267'
|
| 52 |
+
hooks:
|
| 53 |
+
- id: ruff
|
Makefile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: style format
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
style:
|
| 5 |
+
python -m black --line-length 119 .
|
| 6 |
+
python -m isort .
|
| 7 |
+
ruff check --fix .
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
quality:
|
| 11 |
+
python -m black --check --line-length 119 .
|
| 12 |
+
python -m isort --check-only .
|
| 13 |
+
ruff check .
|
NOTICE
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Certain styling elements of this project are partially adapted from HuggingFace leaderboard code,
|
| 2 |
+
(https://huggingface.co/spaces/galileo-ai/agent-leaderboard),
|
| 3 |
+
licensed under the Apache License, Version 2.0.
|
| 4 |
+
Modifications have been made by Samsung Research.
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: TRUEBench
|
| 3 |
+
emoji: 🔥
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.38.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
hf_oauth: true
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,654 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 4 |
+
from huggingface_hub import snapshot_download
|
| 5 |
+
from src.data_utils import get_dataframe_category, get_dataframe_language
|
| 6 |
+
import src.config as configs
|
| 7 |
+
from utils import get_profile_and_organizations, download_with_restart
|
| 8 |
+
from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter
|
| 9 |
+
|
| 10 |
+
from src.about import (
|
| 11 |
+
CITATION_BUTTON_LABEL,
|
| 12 |
+
CITATION_BUTTON_TEXT,
|
| 13 |
+
EVALUATION_QUEUE_TEXT,
|
| 14 |
+
EVALUATION_QUEUE_TEXT_OPTION1,
|
| 15 |
+
INTRODUCTION_TEXT,
|
| 16 |
+
BANNER,
|
| 17 |
+
TITLE,
|
| 18 |
+
LINK,
|
| 19 |
+
)
|
| 20 |
+
from src.display.css_html_js import custom_css
|
| 21 |
+
from src.display.utils import (
|
| 22 |
+
Precision
|
| 23 |
+
)
|
| 24 |
+
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 25 |
+
from src.submission.submit import add_new_eval_option
|
| 26 |
+
|
| 27 |
+
from ui import create_leaderboard_tab
|
| 28 |
+
|
| 29 |
+
def restart_space():
|
| 30 |
+
API.restart_space(repo_id=REPO_ID)
|
| 31 |
+
|
| 32 |
+
### Space initialisation
|
| 33 |
+
download_with_restart(
|
| 34 |
+
snapshot_download,
|
| 35 |
+
repo_id=QUEUE_REPO,
|
| 36 |
+
local_dir=EVAL_REQUESTS_PATH,
|
| 37 |
+
repo_type="dataset",
|
| 38 |
+
token=TOKEN,
|
| 39 |
+
restart_func=restart_space
|
| 40 |
+
)
|
| 41 |
+
download_with_restart(
|
| 42 |
+
snapshot_download,
|
| 43 |
+
repo_id=RESULTS_REPO,
|
| 44 |
+
local_dir=EVAL_RESULTS_PATH,
|
| 45 |
+
repo_type="dataset",
|
| 46 |
+
token=TOKEN,
|
| 47 |
+
restart_func=restart_space
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
theme = gr.themes.Default(
|
| 51 |
+
primary_hue="gray",
|
| 52 |
+
neutral_hue="gray"
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
demo = gr.Blocks(css=custom_css, theme=theme)
|
| 56 |
+
with demo:
|
| 57 |
+
gr.HTML(BANNER + TITLE + LINK)
|
| 58 |
+
user_state = gr.State()
|
| 59 |
+
organization_state = gr.State()
|
| 60 |
+
|
| 61 |
+
with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
|
| 62 |
+
with gr.TabItem("TRUEBench", elem_id="llm-benchmark-tab-table", id=2):
|
| 63 |
+
gr.HTML(INTRODUCTION_TEXT)
|
| 64 |
+
|
| 65 |
+
gr.HTML("""
|
| 66 |
+
<div class="dark-container" style="margin-bottom: 24px;">
|
| 67 |
+
<div class="section-header">
|
| 68 |
+
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
|
| 69 |
+
Category Analysis
|
| 70 |
+
</h3>
|
| 71 |
+
</div>
|
| 72 |
+
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">TRUEBench consists of 10 categories and 46 sub-categories which highly related to productivity assistants.</p>
|
| 73 |
+
""")
|
| 74 |
+
# --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
|
| 75 |
+
from src.about import CATEGORY_DESCRIPTIONS
|
| 76 |
+
gr.HTML(f"""
|
| 77 |
+
<style>
|
| 78 |
+
.category-box-grid {{
|
| 79 |
+
display: flex;
|
| 80 |
+
flex-direction: column;
|
| 81 |
+
gap: 18px;
|
| 82 |
+
margin: 18px 0;
|
| 83 |
+
}}
|
| 84 |
+
.category-box-row {{
|
| 85 |
+
display: flex;
|
| 86 |
+
gap: 18px;
|
| 87 |
+
}}
|
| 88 |
+
.category-box {{
|
| 89 |
+
background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
|
| 90 |
+
border-radius: 26px;
|
| 91 |
+
box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
|
| 92 |
+
color: #222 !important;
|
| 93 |
+
min-height: 140px;
|
| 94 |
+
flex: 1 1 0;
|
| 95 |
+
display: flex;
|
| 96 |
+
flex-direction: column;
|
| 97 |
+
align-items: flex-start;
|
| 98 |
+
padding: 18px 16px 12px 16px;
|
| 99 |
+
box-shadow: 0 0 16px #6c63ff44, 0 2px 8px rgba(0,0,0,0.08);
|
| 100 |
+
font-size: 1.08rem;
|
| 101 |
+
color: #222 !important;
|
| 102 |
+
transition: box-shadow 0.2s;
|
| 103 |
+
position: relative;
|
| 104 |
+
overflow: hidden;
|
| 105 |
+
opacity: 1;
|
| 106 |
+
}}
|
| 107 |
+
.category-title {{
|
| 108 |
+
font-weight: 700;
|
| 109 |
+
font-size: 1.18rem;
|
| 110 |
+
margin-left: 8px;
|
| 111 |
+
vertical-align: middle;
|
| 112 |
+
color: #222 !important;
|
| 113 |
+
}}
|
| 114 |
+
.category-desc {{
|
| 115 |
+
margin-top: 12px;
|
| 116 |
+
font-size: 0.98rem;
|
| 117 |
+
color: #fff !important;
|
| 118 |
+
font-weight: 400;
|
| 119 |
+
min-height: 24px;
|
| 120 |
+
width: 100%;
|
| 121 |
+
line-height: 1.5;
|
| 122 |
+
letter-spacing: 0.01em;
|
| 123 |
+
}}
|
| 124 |
+
.category-box:hover {{
|
| 125 |
+
box-shadow: 0 0 24px #a5a1ff55, 0 4px 16px rgba(0,0,0,0.18);
|
| 126 |
+
}}
|
| 127 |
+
.category-title {{
|
| 128 |
+
font-weight: 700;
|
| 129 |
+
font-size: 1.18rem;
|
| 130 |
+
margin-left: 8px;
|
| 131 |
+
vertical-align: middle;
|
| 132 |
+
}}
|
| 133 |
+
.category-desc {{
|
| 134 |
+
margin-top: 12px;
|
| 135 |
+
font-size: 0.98rem;
|
| 136 |
+
color: #222 !important;
|
| 137 |
+
font-weight: 400;
|
| 138 |
+
min-height: 24px;
|
| 139 |
+
width: 100%;
|
| 140 |
+
line-height: 1.5;
|
| 141 |
+
letter-spacing: 0.01em;
|
| 142 |
+
}}
|
| 143 |
+
@media (prefers-color-scheme: dark) {{
|
| 144 |
+
.category-box .category-title {{
|
| 145 |
+
color: #f5f6f7 !important;
|
| 146 |
+
}}
|
| 147 |
+
}}
|
| 148 |
+
</style>
|
| 149 |
+
<div class='category-box-grid'>
|
| 150 |
+
<div class='category-box-row'>
|
| 151 |
+
<div class='category-box'><span class='category-title'>📝 Content Generation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Content Generation"]}</div></div>
|
| 152 |
+
<div class='category-box'><span class='category-title'>✂️ Editing</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Editing"]}</div></div>
|
| 153 |
+
<div class='category-box'><span class='category-title'>📊 Data Analysis</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Data Analysis"]}</div></div>
|
| 154 |
+
<div class='category-box'><span class='category-title'>🧠 Reasoning</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Reasoning"]}</div></div>
|
| 155 |
+
<div class='category-box'><span class='category-title'>🦄 Hallucination</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Hallucination"]}</div></div>
|
| 156 |
+
</div>
|
| 157 |
+
<div class='category-box-row'>
|
| 158 |
+
<div class='category-box'><span class='category-title'>🛡️ Safety</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Safety"]}</div></div>
|
| 159 |
+
<div class='category-box'><span class='category-title'>🔁 Repetition</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Repetition"]}</div></div>
|
| 160 |
+
<div class='category-box'><span class='category-title'>📝 Summarization</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Summarization"]}</div></div>
|
| 161 |
+
<div class='category-box'><span class='category-title'>🌐 Translation</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Translation"]}</div></div>
|
| 162 |
+
<div class='category-box'><span class='category-title'>💬 Multi-Turn</span><div class='category-desc'>{CATEGORY_DESCRIPTIONS["Multi-Turn"]}</div></div>
|
| 163 |
+
</div>
|
| 164 |
+
</div>
|
| 165 |
+
""")
|
| 166 |
+
df = get_dataframe_category()
|
| 167 |
+
|
| 168 |
+
gr.HTML("""
|
| 169 |
+
<style>
|
| 170 |
+
.leaderboard-container {
|
| 171 |
+
background: #fff;
|
| 172 |
+
}
|
| 173 |
+
@media (prefers-color-scheme: dark) {
|
| 174 |
+
.leaderboard-container {
|
| 175 |
+
background: #121212;
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
</style>
|
| 179 |
+
<div class="leaderboard-container">
|
| 180 |
+
""")
|
| 181 |
+
leaderboard_tab_cat = create_leaderboard_tab(
|
| 182 |
+
df,
|
| 183 |
+
"Category",
|
| 184 |
+
)
|
| 185 |
+
gr.HTML("</div>")
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# --- Category Radar Chart Section ---
|
| 189 |
+
from vis_utils import load_leaderboard_data, create_domain_radar_chart
|
| 190 |
+
initial_df_cat = load_leaderboard_data()
|
| 191 |
+
# Top 5 models based on leaderboard (Average Accuracy)
|
| 192 |
+
if "Overall" in initial_df_cat.columns:
|
| 193 |
+
top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
|
| 194 |
+
else:
|
| 195 |
+
top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
|
| 196 |
+
gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center; width: 100%; max-width: 100%; margin: 0 auto; padding: 0;">')
|
| 197 |
+
# Radar chart model selector (up to 5)
|
| 198 |
+
from src.display.formatting import get_display_model_name
|
| 199 |
+
display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
|
| 200 |
+
original_names_cat = initial_df_cat['Model Name'].tolist()
|
| 201 |
+
display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
|
| 202 |
+
top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
|
| 203 |
+
model_selector_cat = gr.Dropdown(
|
| 204 |
+
choices=display_names_cat,
|
| 205 |
+
value=top5_display_names_cat,
|
| 206 |
+
multiselect=True,
|
| 207 |
+
label="🎯 Select Models for Radar Chart",
|
| 208 |
+
info="Choose up to 5 models to visualize",
|
| 209 |
+
elem_classes=["dropdown", "custom-dropdown"],
|
| 210 |
+
interactive=True,
|
| 211 |
+
filterable=True,
|
| 212 |
+
allow_custom_value=False
|
| 213 |
+
)
|
| 214 |
+
gr.HTML("""
|
| 215 |
+
<script>
|
| 216 |
+
document.querySelector('.custom-dropdown').addEventListener('change', function(e) {
|
| 217 |
+
if (this.value.length > 5) {
|
| 218 |
+
alert('You can select up to 5 models only');
|
| 219 |
+
this.value = this.value.slice(0, 5);
|
| 220 |
+
}
|
| 221 |
+
});
|
| 222 |
+
</script>
|
| 223 |
+
""")
|
| 224 |
+
radar_chart_cat = gr.Plot(
|
| 225 |
+
label="",
|
| 226 |
+
value=create_domain_radar_chart(
|
| 227 |
+
initial_df_cat,
|
| 228 |
+
"Average Accuracy",
|
| 229 |
+
top5_models_cat
|
| 230 |
+
),
|
| 231 |
+
elem_classes=["radar-chart", "plot-container"]
|
| 232 |
+
)
|
| 233 |
+
gr.HTML('</div>')
|
| 234 |
+
|
| 235 |
+
# Update radar chart when model_selector_cat selection changes
|
| 236 |
+
def update_radar_chart_cat(selected_display_names):
|
| 237 |
+
# If no selection, fallback to top-5
|
| 238 |
+
if not selected_display_names or len(selected_display_names) == 0:
|
| 239 |
+
df = load_leaderboard_data()
|
| 240 |
+
selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
|
| 241 |
+
selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat]
|
| 242 |
+
return create_domain_radar_chart(
|
| 243 |
+
load_leaderboard_data(),
|
| 244 |
+
"Average Accuracy",
|
| 245 |
+
selected_models
|
| 246 |
+
)
|
| 247 |
+
model_selector_cat.change(
|
| 248 |
+
fn=update_radar_chart_cat,
|
| 249 |
+
inputs=model_selector_cat,
|
| 250 |
+
outputs=radar_chart_cat
|
| 251 |
+
)
|
| 252 |
+
# --- Med. Len. vs Overall Scatter Plot Section ---
|
| 253 |
+
from vis_utils import create_len_overall_scatter
|
| 254 |
+
import json
|
| 255 |
+
with open("src/data/length_data.json", "r") as f:
|
| 256 |
+
length_data = json.load(f)
|
| 257 |
+
|
| 258 |
+
# --- Create a Gradio State component to hold length_data ---
|
| 259 |
+
length_data_state = gr.State(value=length_data)
|
| 260 |
+
gr.HTML("""
|
| 261 |
+
<div class="dark-container" style="margin-bottom: 24px; margin-top: 24px;">
|
| 262 |
+
<div class="section-header">
|
| 263 |
+
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
|
| 264 |
+
Output Length vs. Category Score
|
| 265 |
+
</h3>
|
| 266 |
+
</div>
|
| 267 |
+
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
|
| 268 |
+
Explore the relationship between median output length and model performance by category
|
| 269 |
+
</p>
|
| 270 |
+
""")
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# Category selection buttons (HTML + Gradio Radio for event)
|
| 274 |
+
category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
|
| 275 |
+
# (cat-btn-radio related style block removed, now handled in custom_css)
|
| 276 |
+
category_selector = gr.Radio(
|
| 277 |
+
choices=category_columns,
|
| 278 |
+
value="Overall",
|
| 279 |
+
label="Select Category for Y-Axis",
|
| 280 |
+
elem_id="cat-btn-radio",
|
| 281 |
+
elem_classes=["cat-btn-radio"],
|
| 282 |
+
interactive=True,
|
| 283 |
+
show_label=False
|
| 284 |
+
)
|
| 285 |
+
x_axis_selector = gr.Radio(
|
| 286 |
+
choices=["Med. Len.", "Med. Resp. Len."],
|
| 287 |
+
value="Med. Len.",
|
| 288 |
+
label="Select X-Axis Data",
|
| 289 |
+
elem_id="x-axis-btn-radio",
|
| 290 |
+
elem_classes=["x-axis-btn-radio"],
|
| 291 |
+
interactive=True,
|
| 292 |
+
show_label=True
|
| 293 |
+
)
|
| 294 |
+
gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
|
| 295 |
+
scatter_plot_cat = gr.Plot(
|
| 296 |
+
label="",
|
| 297 |
+
value=create_len_overall_scatter(
|
| 298 |
+
load_leaderboard_data(),
|
| 299 |
+
y_col="Overall",
|
| 300 |
+
length_data=length_data,
|
| 301 |
+
x_axis_data_source=x_axis_selector.value
|
| 302 |
+
),
|
| 303 |
+
elem_classes=["efficiency-chart", "plot-container"]
|
| 304 |
+
)
|
| 305 |
+
gr.HTML('</div>')
|
| 306 |
+
gr.HTML("</div>")
|
| 307 |
+
|
| 308 |
+
# Update plot when category or x-axis selection changes
|
| 309 |
+
def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
|
| 310 |
+
return create_len_overall_scatter(
|
| 311 |
+
load_leaderboard_data(),
|
| 312 |
+
y_col=selected_category,
|
| 313 |
+
length_data=current_length_data_state,
|
| 314 |
+
x_axis_data_source=selected_x_source
|
| 315 |
+
)
|
| 316 |
+
category_selector.change(
|
| 317 |
+
fn=update_scatter_plot_cat,
|
| 318 |
+
inputs=[category_selector, x_axis_selector, length_data_state],
|
| 319 |
+
outputs=scatter_plot_cat
|
| 320 |
+
)
|
| 321 |
+
x_axis_selector.change(
|
| 322 |
+
fn=update_scatter_plot_cat,
|
| 323 |
+
inputs=[category_selector, x_axis_selector, length_data_state],
|
| 324 |
+
outputs=scatter_plot_cat
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
# When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to top-5
|
| 328 |
+
def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
|
| 329 |
+
_, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
|
| 330 |
+
|
| 331 |
+
top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
|
| 332 |
+
return gr.update(value=top5_display_names), create_domain_radar_chart(
|
| 333 |
+
load_leaderboard_data(),
|
| 334 |
+
"Average Accuracy",
|
| 335 |
+
top5_models[:5]
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
leaderboard_selectors_cat = [
|
| 339 |
+
leaderboard_tab_cat["type_selector"],
|
| 340 |
+
leaderboard_tab_cat["model_type_selector"],
|
| 341 |
+
leaderboard_tab_cat["think_selector"],
|
| 342 |
+
leaderboard_tab_cat["df_state"],
|
| 343 |
+
leaderboard_tab_cat["sort_col_dropdown"]
|
| 344 |
+
]
|
| 345 |
+
for selector in leaderboard_selectors_cat:
|
| 346 |
+
selector.change(
|
| 347 |
+
fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
|
| 348 |
+
inputs=leaderboard_selectors_cat,
|
| 349 |
+
outputs=[model_selector_cat, radar_chart_cat]
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
gr.HTML("""
|
| 355 |
+
<div class="dark-container" style="margin-bottom: 24px;">
|
| 356 |
+
<div class="section-header">
|
| 357 |
+
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
|
| 358 |
+
Language Analysis
|
| 359 |
+
</h3>
|
| 360 |
+
</div>
|
| 361 |
+
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).</p>
|
| 362 |
+
""")
|
| 363 |
+
df = get_dataframe_language()
|
| 364 |
+
|
| 365 |
+
leaderboard_tab_lang = create_leaderboard_tab(
|
| 366 |
+
df,
|
| 367 |
+
"Language",
|
| 368 |
+
)
|
| 369 |
+
|
| 370 |
+
# --- Language Radar Chart Section ---
|
| 371 |
+
|
| 372 |
+
from vis_utils import load_leaderboard_language_data, create_language_radar_chart
|
| 373 |
+
initial_df_lang = load_leaderboard_language_data()
|
| 374 |
+
|
| 375 |
+
# Top 5 models based on leaderboard (Overall)
|
| 376 |
+
if "Overall" in initial_df_lang.columns:
|
| 377 |
+
top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
|
| 378 |
+
else:
|
| 379 |
+
top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
|
| 380 |
+
|
| 381 |
+
gr.HTML('<div class="chart-container" style="display: flex; justify-content: center; align-items: center;">')
|
| 382 |
+
# Add model selector
|
| 383 |
+
display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
|
| 384 |
+
original_names_lang = initial_df_lang['Model Name'].tolist()
|
| 385 |
+
display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
|
| 386 |
+
top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
|
| 387 |
+
model_selector_lang = gr.Dropdown(
|
| 388 |
+
choices=display_names_lang,
|
| 389 |
+
value=top5_display_names_lang,
|
| 390 |
+
multiselect=True,
|
| 391 |
+
label="🎯 Select Models for Radar Chart",
|
| 392 |
+
info="Choose up to 5 models to visualize",
|
| 393 |
+
elem_classes=["dropdown", "custom-dropdown"],
|
| 394 |
+
interactive=True,
|
| 395 |
+
filterable=True,
|
| 396 |
+
allow_custom_value=False
|
| 397 |
+
)
|
| 398 |
+
gr.HTML("""
|
| 399 |
+
<script>
|
| 400 |
+
document.querySelectorAll('.custom-dropdown')[1].addEventListener('change', function(e) {
|
| 401 |
+
if (this.value.length > 5) {
|
| 402 |
+
alert('You can select up to 5 models only');
|
| 403 |
+
this.value = this.value.slice(0, 5);
|
| 404 |
+
}
|
| 405 |
+
});
|
| 406 |
+
</script>
|
| 407 |
+
""")
|
| 408 |
+
radar_chart_lang = gr.Plot(
|
| 409 |
+
label="",
|
| 410 |
+
value=create_language_radar_chart(
|
| 411 |
+
initial_df_lang,
|
| 412 |
+
"Average Accuracy",
|
| 413 |
+
top5_models_lang
|
| 414 |
+
),
|
| 415 |
+
elem_classes=["radar-chart", "plot-container"]
|
| 416 |
+
)
|
| 417 |
+
gr.HTML('</div>')
|
| 418 |
+
|
| 419 |
+
# Update radar chart when model_selector_lang selection changes
|
| 420 |
+
def update_radar_chart_lang(selected_display_names):
|
| 421 |
+
if not selected_display_names or len(selected_display_names) == 0:
|
| 422 |
+
df = load_leaderboard_language_data()
|
| 423 |
+
selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
|
| 424 |
+
selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
|
| 425 |
+
return create_language_radar_chart(
|
| 426 |
+
load_leaderboard_language_data(),
|
| 427 |
+
"Average Accuracy",
|
| 428 |
+
selected_models
|
| 429 |
+
)
|
| 430 |
+
model_selector_lang.change(
|
| 431 |
+
fn=update_radar_chart_lang,
|
| 432 |
+
inputs=model_selector_lang,
|
| 433 |
+
outputs=radar_chart_lang
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
# When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to top-5
|
| 437 |
+
def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
|
| 438 |
+
_, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
|
| 439 |
+
top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
|
| 440 |
+
return gr.update(value=top5_display_names), create_language_radar_chart(
|
| 441 |
+
load_leaderboard_language_data(),
|
| 442 |
+
"Average Accuracy",
|
| 443 |
+
top5_models[:5]
|
| 444 |
+
)
|
| 445 |
+
|
| 446 |
+
leaderboard_selectors_lang = [
|
| 447 |
+
leaderboard_tab_lang["type_selector"],
|
| 448 |
+
leaderboard_tab_lang["model_type_selector"],
|
| 449 |
+
leaderboard_tab_lang["think_selector"],
|
| 450 |
+
leaderboard_tab_lang["df_state"],
|
| 451 |
+
leaderboard_tab_lang["sort_col_dropdown"]
|
| 452 |
+
]
|
| 453 |
+
|
| 454 |
+
for selector in leaderboard_selectors_lang:
|
| 455 |
+
selector.change(
|
| 456 |
+
fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
|
| 457 |
+
inputs=leaderboard_selectors_lang,
|
| 458 |
+
outputs=[model_selector_lang, radar_chart_lang]
|
| 459 |
+
)
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 464 |
+
with gr.Column():
|
| 465 |
+
with gr.Row():
|
| 466 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 467 |
+
|
| 468 |
+
with gr.Row():
|
| 469 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
|
| 470 |
+
|
| 471 |
+
with gr.Row():
|
| 472 |
+
gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
|
| 473 |
+
|
| 474 |
+
login_button = gr.LoginButton()
|
| 475 |
+
|
| 476 |
+
with gr.Row():
|
| 477 |
+
with gr.Column():
|
| 478 |
+
contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
|
| 479 |
+
model_name_textbox = gr.Textbox(label="Model Name")
|
| 480 |
+
model_type_dropdown = gr.Dropdown(
|
| 481 |
+
choices=["Instruct", "Think", "Hybrid"],
|
| 482 |
+
label="Model Type (Instruct, Think, or Hybrid)",
|
| 483 |
+
multiselect=False,
|
| 484 |
+
value="Instruct",
|
| 485 |
+
interactive=True,
|
| 486 |
+
)
|
| 487 |
+
think_type_dropdown = gr.Dropdown(
|
| 488 |
+
choices=["On", "Off"],
|
| 489 |
+
label="Think Mode (On/Off)",
|
| 490 |
+
multiselect=False,
|
| 491 |
+
value="Off",
|
| 492 |
+
interactive=False,
|
| 493 |
+
)
|
| 494 |
+
precision = gr.Dropdown(
|
| 495 |
+
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 496 |
+
label="Precision",
|
| 497 |
+
multiselect=False,
|
| 498 |
+
value="float16",
|
| 499 |
+
interactive=True,
|
| 500 |
+
)
|
| 501 |
+
# --- Dynamically control think_type based on model_type and connect event ---
|
| 502 |
+
def update_think_type(model_type_value):
|
| 503 |
+
if model_type_value == "Instruct":
|
| 504 |
+
return gr.update(value="Off", interactive=False)
|
| 505 |
+
elif model_type_value == "Think":
|
| 506 |
+
return gr.update(value="On", interactive=False)
|
| 507 |
+
else: # Hybrid
|
| 508 |
+
return gr.update(value="On", interactive=True)
|
| 509 |
+
model_type_dropdown.change(
|
| 510 |
+
fn=update_think_type,
|
| 511 |
+
inputs=model_type_dropdown,
|
| 512 |
+
outputs=think_type_dropdown
|
| 513 |
+
)
|
| 514 |
+
response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., </think>)")
|
| 515 |
+
|
| 516 |
+
with gr.Column():
|
| 517 |
+
yml_textbox_placeholder = """# vLLM serving parameters
|
| 518 |
+
# Refence: https://docs.vllm.ai/en/latest/cli/serve.html
|
| 519 |
+
llm_serve_args:
|
| 520 |
+
max_model_len:
|
| 521 |
+
tensor_parallel_size:
|
| 522 |
+
dtype:
|
| 523 |
+
...
|
| 524 |
+
# OpenAI-compatible API (chat completion)
|
| 525 |
+
# Reference: https://platform.openai.com/docs/api-reference/chat
|
| 526 |
+
sampling_params:
|
| 527 |
+
top_p:
|
| 528 |
+
temperature:
|
| 529 |
+
presence_penalty:
|
| 530 |
+
...
|
| 531 |
+
# vLLM sampling parameters
|
| 532 |
+
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
|
| 533 |
+
extra_body:
|
| 534 |
+
chat_template_kwargs:
|
| 535 |
+
enable_thinking:
|
| 536 |
+
...
|
| 537 |
+
top_k:
|
| 538 |
+
repetition_penalty:
|
| 539 |
+
..."""
|
| 540 |
+
yml_textbox = gr.Textbox(
|
| 541 |
+
label="Configuration (YAML format)",
|
| 542 |
+
elem_id="yml-textbox",
|
| 543 |
+
lines=7,
|
| 544 |
+
value=yml_textbox_placeholder
|
| 545 |
+
)
|
| 546 |
+
upbox = gr.File(
|
| 547 |
+
label="Upload configuration file as .yml or .yaml",
|
| 548 |
+
file_types=[".yml", ".yaml"],
|
| 549 |
+
type="filepath",
|
| 550 |
+
height=150
|
| 551 |
+
)
|
| 552 |
+
# Add Translate to JSON button below upbox
|
| 553 |
+
translate_button = gr.Button(
|
| 554 |
+
"Translate to JSON",
|
| 555 |
+
elem_id="translate-to-json-btn",
|
| 556 |
+
elem_classes=["translate-btn"],
|
| 557 |
+
scale=None
|
| 558 |
+
)
|
| 559 |
+
# Add custom style for the button
|
| 560 |
+
gr.HTML(
|
| 561 |
+
'''
|
| 562 |
+
<style>
|
| 563 |
+
#translate-to-json-btn, .translate-btn {
|
| 564 |
+
width: 100%;
|
| 565 |
+
min-height: 24px;
|
| 566 |
+
font-size: 1.1rem;
|
| 567 |
+
font-weight: 600;
|
| 568 |
+
background: linear-gradient(90deg, #6c63ff 60%, #a5a1ff 100%);
|
| 569 |
+
color: #fff;
|
| 570 |
+
border: none;
|
| 571 |
+
border-radius: 12px;
|
| 572 |
+
margin-top: 8px;
|
| 573 |
+
margin-bottom: 8px;
|
| 574 |
+
box-shadow: 0 2px 8px #6c63ff33;
|
| 575 |
+
transition: background 0.2s, box-shadow 0.2s;
|
| 576 |
+
}
|
| 577 |
+
#translate-to-json-btn:hover, .translate-btn:hover {
|
| 578 |
+
background: linear-gradient(90deg, #5a54d6 60%, #7e7bff 100%);
|
| 579 |
+
box-shadow: 0 4px 16px #6c63ff55;
|
| 580 |
+
}
|
| 581 |
+
</style>
|
| 582 |
+
'''
|
| 583 |
+
)
|
| 584 |
+
with gr.Column():
|
| 585 |
+
requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")
|
| 586 |
+
|
| 587 |
+
output_dict = gr.Code(label="Translated Python Dictionary", language="json")
|
| 588 |
+
submit_button = gr.Button("Submit Eval")
|
| 589 |
+
submission_result = gr.Markdown()
|
| 590 |
+
def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
|
| 591 |
+
import yaml, json
|
| 592 |
+
if upbox_path:
|
| 593 |
+
try:
|
| 594 |
+
with open(upbox_path, "r", encoding="utf-8") as f:
|
| 595 |
+
data = yaml.safe_load(f)
|
| 596 |
+
if data is None:
|
| 597 |
+
return "YAML file is empty."
|
| 598 |
+
return json.dumps(data, indent=4, ensure_ascii=False)
|
| 599 |
+
except Exception as e:
|
| 600 |
+
return f"Error parsing YAML file: {e}"
|
| 601 |
+
elif yml_textbox_value and yml_textbox_value.strip():
|
| 602 |
+
try:
|
| 603 |
+
data = yaml.safe_load(yml_textbox_value)
|
| 604 |
+
if data is None:
|
| 605 |
+
return "YAML textbox is empty or invalid."
|
| 606 |
+
return json.dumps(data, indent=4, ensure_ascii=False)
|
| 607 |
+
except Exception as e:
|
| 608 |
+
return f"Error parsing YAML textbox: {e}"
|
| 609 |
+
else:
|
| 610 |
+
return ""
|
| 611 |
+
|
| 612 |
+
event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
|
| 613 |
+
event.then(
|
| 614 |
+
add_new_eval_option,
|
| 615 |
+
[
|
| 616 |
+
contact_email,
|
| 617 |
+
model_name_textbox,
|
| 618 |
+
model_type_dropdown,
|
| 619 |
+
think_type_dropdown,
|
| 620 |
+
precision,
|
| 621 |
+
response_prefix_textbox,
|
| 622 |
+
requirements_textbox,
|
| 623 |
+
user_state,
|
| 624 |
+
organization_state,
|
| 625 |
+
yml_textbox,
|
| 626 |
+
upbox,
|
| 627 |
+
],
|
| 628 |
+
submission_result,
|
| 629 |
+
).then(
|
| 630 |
+
fn=parse_and_display_yaml_config,
|
| 631 |
+
inputs=[upbox, yml_textbox],
|
| 632 |
+
outputs=output_dict
|
| 633 |
+
)
|
| 634 |
+
translate_button.click(
|
| 635 |
+
fn=parse_and_display_yaml_config,
|
| 636 |
+
inputs=[upbox, yml_textbox],
|
| 637 |
+
outputs=output_dict
|
| 638 |
+
)
|
| 639 |
+
|
| 640 |
+
with gr.Row():
|
| 641 |
+
with gr.Accordion("📙 Citation", open=False):
|
| 642 |
+
citation_button = gr.Textbox(
|
| 643 |
+
value=CITATION_BUTTON_TEXT,
|
| 644 |
+
label=CITATION_BUTTON_LABEL,
|
| 645 |
+
lines=20,
|
| 646 |
+
elem_id="citation-button",
|
| 647 |
+
show_copy_button=True,
|
| 648 |
+
)
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
scheduler = BackgroundScheduler()
|
| 652 |
+
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 653 |
+
scheduler.start()
|
| 654 |
+
demo.queue(default_concurrency_limit=40).launch()
|
constants.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Leaderboard required columns (must always be included)
|
| 2 |
+
LEADERBOARD_REQUIRED_COLUMNS = [
|
| 3 |
+
"Model Name", "Group", "Overall", "Med. Len.", "Med. Resp. Len.", "Type", "Model Type", "Think", "Rank"
|
| 4 |
+
]
|
| 5 |
+
|
| 6 |
+
# Columns for number formatting (by category/language)
|
| 7 |
+
NUMERIC_COLS_CATEGORY = [
|
| 8 |
+
"Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)",
|
| 9 |
+
"Content Generation", "Editing", "Data Analysis", "Reasoning",
|
| 10 |
+
"Hallucination", "Safety", "Repetition", "Summarization", "Translation", "Multi-Turn"
|
| 11 |
+
]
|
| 12 |
+
NUMERIC_INT_COLS_CATEGORY = ["Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"]
|
| 13 |
+
NUMERIC_FLOAT_COLS_CATEGORY = [col for col in NUMERIC_COLS_CATEGORY if col not in NUMERIC_INT_COLS_CATEGORY]
|
| 14 |
+
|
| 15 |
+
NUMERIC_COLS_LANGUAGE = [
|
| 16 |
+
"Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)",
|
| 17 |
+
"KO", "EN", "JA", "ZH", "PL", "DE", "PT", "ES", "FR", "IT", "RU", "VI"
|
| 18 |
+
]
|
| 19 |
+
NUMERIC_INT_COLS_LANGUAGE = ["Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"]
|
| 20 |
+
NUMERIC_FLOAT_COLS_LANGUAGE = [col for col in NUMERIC_COLS_LANGUAGE if col not in NUMERIC_INT_COLS_LANGUAGE]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.ruff]
|
| 2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
| 3 |
+
select = ["E", "F"]
|
| 4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
| 5 |
+
line-length = 119
|
| 6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
| 7 |
+
|
| 8 |
+
[tool.isort]
|
| 9 |
+
profile = "black"
|
| 10 |
+
line_length = 119
|
| 11 |
+
|
| 12 |
+
[tool.black]
|
| 13 |
+
line-length = 119
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
APScheduler
|
| 2 |
+
black
|
| 3 |
+
datasets
|
| 4 |
+
gradio
|
| 5 |
+
gradio[oauth]
|
| 6 |
+
gradio_leaderboard==0.0.13
|
| 7 |
+
gradio_client
|
| 8 |
+
huggingface-hub>=0.18.0
|
| 9 |
+
matplotlib
|
| 10 |
+
numpy
|
| 11 |
+
pandas
|
| 12 |
+
python-dateutil
|
| 13 |
+
tqdm
|
| 14 |
+
transformers
|
| 15 |
+
tokenizers>=0.15.0
|
| 16 |
+
plotly
|
src/about.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CATEGORY_DESCRIPTIONS = {
|
| 2 |
+
"Content Generation": "<p>Evaluates the model's ability to produce diverse written outputs across professional and creative domains. This category measures adaptability to linguistic, stylistic, and formatting constraints, as well as the effectiveness of prompt engineering.</p> <b>🏷️Email 🏷️ReportDrafting</b>",
|
| 3 |
+
"Editing": "<p>Evaluates refinement capabilities for optimizing given text. It focuses on queries related to rephrasing, revision, and correction, while preserving the rest of the content.</p> <b>🏷️QueryRephrase 🏷️DocumentRevision</b>",
|
| 4 |
+
"Data Analysis": "<p>Measures proficiency in processing structured and unstructured data. This category includes tasks related to information extraction and data processing.</p> <b>🏷️JSONFormatted 🏷️TableQuery</b>",
|
| 5 |
+
"Reasoning": "<p>Assesses logical problem-solving in coding, multiple-choice question answering, and mathematical operations. It also includes evaluation of rounding errors made by models in quantitative tasks.</p> <b>🏷️Logical 🏷️Mathematical</b>",
|
| 6 |
+
"Hallucination": "<p>Detects limitations in generating plausible but inaccurate responses when faced with ambiguous queries, insufficient context, hypothetical scenarios, or challenges in document interpretation.</p> <b>🏷️InsufficientContext 🏷️FalseQueries</b>",
|
| 7 |
+
"Safety": "<p>Verifies safeguards against harmful/inappropriate content. This category tests filtering of discriminatory, violent, or illegal material while upholding ethical standards.</p> <b>🏷️Illegal 🏷️Prejudice</b>",
|
| 8 |
+
"Repetition": "<p>Evaluates consistency in producing iterative content variations while maintaining quality and relevance across outputs.</p> <b>🏷️Listing</b>",
|
| 9 |
+
"Summarization": "<p>Measures ability to distill lengthy content into concise overviews preserving core concepts and eliminating redundancy. This category includes various constraints such as language, format, and output length.</p> <b>🏷️BulletPoints 🏷️N-lineSummary</b>",
|
| 10 |
+
"Translation": "<p>Tests the ability to accurately translate diverse real-world contexts while adhering to target language and specified constraints. Our benchmark includes linguistic conditions in 12 languages, ensuring comprehensive multilingual evaluation.</p> <b>🏷️Document 🏷️Line-by-line</b>",
|
| 11 |
+
"Multi-Turn": "<p>Assesses the model's ability to capture user intent in challenging scenarios where the context shifts or understanding of previous context is required.</p> <b>🏷️Consistency 🏷️Non-consistency</b>"
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
banner_url = "https://cdn-uploads.huggingface.co/production/uploads/6805a7222cbcd604c2e89cab/GIEbCbyNn7PjWBFftEgNm.png"
|
| 15 |
+
BANNER = f'<div style="display: flex; justify-content: flex-start; width: 100%;"> <img src="{banner_url}" alt="Banner" style="width: 100%; height: auto; object-fit: contain;"> </div> '
|
| 16 |
+
|
| 17 |
+
TITLE = """<html>
|
| 18 |
+
<body>
|
| 19 |
+
<p style="margin: 0; text-align: right">Leaderboards by Samsung Research for LLM evaluation.</p>
|
| 20 |
+
</body>
|
| 21 |
+
</html>"""
|
| 22 |
+
|
| 23 |
+
LINK = """
|
| 24 |
+
<h3 style="text-align: right; margin-top: 0;">
|
| 25 |
+
<span>✨</span>
|
| 26 |
+
<a href="https://research.samsung.com/" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Samsung Research</a> |
|
| 27 |
+
<span>🌕</span>
|
| 28 |
+
<a href="https://github.com/samsung" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">GitHub</a> |
|
| 29 |
+
<span>🌎</span>
|
| 30 |
+
<a href="https://x.com/samsungresearch" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">X</a> |
|
| 31 |
+
<span>🌠</span>
|
| 32 |
+
<a href="https://huggingface.co/spaces/SamsungResearch/TRUEBench/discussions" style="text-decoration: none;" rel="nofollow" target="_blank" onmouseover="this.style.textDecoration='underline'" onmouseout="this.style.textDecoration='none'">Discussion</a> |
|
| 33 |
+
<span>🔭</span> Updated: 2025-09-16
|
| 34 |
+
</h3>
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
INTRODUCTION_TEXT = """
|
| 38 |
+
<div style="margin-bottom: 20px; text-align: center !important;">
|
| 39 |
+
<h2 style="padding-bottom: 5px !important; text-align: center !important; font-size: 2.6em !important; font-weight: 900 !important; margin-top: 0.2em !important; margin-bottom: 0.3em !important;">
|
| 40 |
+
🏆 TRUEBench: A Benchmark for Assessing LLMs as Human Job Productivity Assistants
|
| 41 |
+
</h2>
|
| 42 |
+
<p style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
|
| 43 |
+
TRUEBench (Trustworthy Real-world Usage Evaluation Benchmark) evaluates LLMs as productivity assistants. <br>
|
| 44 |
+
As LLMs become integral to tasks like report drafting and data analysis, existing benchmarks are suboptimal to capture real-world challenges. <br>
|
| 45 |
+
To address this gap, <strong>Samsung Research</strong> developed TRUEBench as a comprehensive evaluation framework for real-world LLM applications.
|
| 46 |
+
</p>
|
| 47 |
+
<p style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
|
| 48 |
+
TRUEBench is a benchmark designed to evaluate the instruction-following capabilities of LLMs, determining whether a response receives a Pass (1 point) or Fail (0 points) based on checklists. <br> This aligns with user satisfaction from the perspective of job productivity.
|
| 49 |
+
</p>
|
| 50 |
+
<h3 style="font-size: 2em; font-weight: 800; margin-top: 1.2em; margin-bottom: 0.5em; line-height: 1.3; letter-spacing: -0.01em;">
|
| 51 |
+
Main Features
|
| 52 |
+
</h3>
|
| 53 |
+
<div class="intro-feature-row">
|
| 54 |
+
<div class="intro-feature-box">
|
| 55 |
+
<div class="intro-feature-icon">📝</div>
|
| 56 |
+
<div class="intro-feature-title">2,400+ Productivity-Oriented User Inputs</div>
|
| 57 |
+
<div class="intro-feature-desc">A large-scale collection of complex, real-world user inputs designed to reflect productivity assistant scenarios.</div>
|
| 58 |
+
</div>
|
| 59 |
+
<div class="intro-feature-box">
|
| 60 |
+
<div class="intro-feature-icon">🌎</div>
|
| 61 |
+
<div class="intro-feature-title">Multilinguality in Real Tasks</div>
|
| 62 |
+
<div class="intro-feature-desc">Comprehensive 12-language coverage with intra-instance multilingual instructions.</div>
|
| 63 |
+
<div class="intro-feature-desc" style="font-style: italic; color: #888;">For multilingual aspects, it was created through local research institutes.</div>
|
| 64 |
+
</div>
|
| 65 |
+
<div class="intro-feature-box">
|
| 66 |
+
<div class="intro-feature-icon">🧩</div>
|
| 67 |
+
<div class="intro-feature-title">Beyond Explicit Constraints</div>
|
| 68 |
+
<div class="intro-feature-desc">Human-annotated implicit requirements validated by LLMs.</div>
|
| 69 |
+
</div>
|
| 70 |
+
<div class="intro-feature-box">
|
| 71 |
+
<div class="intro-feature-icon">🧭</div>
|
| 72 |
+
<div class="intro-feature-title">Dynamic Multi-Turn Contexts</div>
|
| 73 |
+
<div class="intro-feature-desc">Realistic dialogue flows with evolving constraints.</div>
|
| 74 |
+
</div>
|
| 75 |
+
</div>
|
| 76 |
+
<a class="intro-dataset-btn" href="https://huggingface.co/datasets/SamsungResearch/TRUEBench" target="_blank" rel="nofollow">
|
| 77 |
+
📂 Dataset Sample →
|
| 78 |
+
</a>
|
| 79 |
+
</div> """
|
| 80 |
+
|
| 81 |
+
MAIN_FEATURES_TEXT ="""
|
| 82 |
+
<div style="padding: 10px; border-radius: 8px; margin-bottom: 20px;">
|
| 83 |
+
<h2 style="color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 5px;">✨ Main Features</h2>
|
| 84 |
+
<ul style="list-style-type: none; padding-left: 0;">
|
| 85 |
+
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
|
| 86 |
+
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
|
| 87 |
+
Input prompts across 12 languages
|
| 88 |
+
</li>
|
| 89 |
+
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
|
| 90 |
+
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
|
| 91 |
+
Intra-instance multilingual instructions
|
| 92 |
+
</li>
|
| 93 |
+
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
|
| 94 |
+
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
|
| 95 |
+
Rigorous evaluation criteria for explicit and implicit constraints
|
| 96 |
+
</li>
|
| 97 |
+
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
|
| 98 |
+
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
|
| 99 |
+
Complex multi-turn dialogue scenarios
|
| 100 |
+
</li>
|
| 101 |
+
<li style="margin-bottom: 10px; padding-left: 25px; position: relative;">
|
| 102 |
+
<span style="position: absolute; left: 0; color: #3498db;">✓</span>
|
| 103 |
+
LLM-validated constraints for reliable evaluation
|
| 104 |
+
</li>
|
| 105 |
+
</ul>
|
| 106 |
+
<div style="margin: 20px 0 10px 0;">
|
| 107 |
+
<a href="https://huggingface.co/datasets/SamsungResearch/TRUEBench"
|
| 108 |
+
style="color: #3498db;
|
| 109 |
+
text-decoration: underline;
|
| 110 |
+
font-size: 1.2em;
|
| 111 |
+
font-weight: bold;"
|
| 112 |
+
rel="nofollow"
|
| 113 |
+
target="_blank"
|
| 114 |
+
onmouseover="this.style.textDecoration='none'; this.style.color='#2c3e50'"
|
| 115 |
+
onmouseout="this.style.textDecoration='underline'; this.style.color='#3498db'">
|
| 116 |
+
📂 Dataset Sample →
|
| 117 |
+
</a>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
"""
|
| 121 |
+
|
| 122 |
+
LLM_BENCHMARKS_TEXT = f"""
|
| 123 |
+
## How it works
|
| 124 |
+
We utilize LLM Judge with human-crafted criteria to assess AI response.
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
EVALUATION_QUEUE_TEXT = '''
|
| 128 |
+
<div style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
|
| 129 |
+
|
| 130 |
+
## Submission Policy
|
| 131 |
+
- Submissions are limited to models that are registered on *HuggingFace Models*.
|
| 132 |
+
- Each model affiliation (individual or organization) may submit up to **3** times within **24** hours.
|
| 133 |
+
- The same model can only be submitted once per 24 hours.
|
| 134 |
+
- Duplicate submissions will be determined based on the full model name (i.e., {affiliation}/{model name}). Sampling parameters, dtype, etc. are not considered for duplicate checking.
|
| 135 |
+
- Submissions are only valid if the model's affiliation matches that of the submitter.
|
| 136 |
+
- If the same model is submitted multiple times, only the version with the highest overall score will be reflected on the leaderboard. (Note: A maximum of 3 submissions per model is allowed.)
|
| 137 |
+
|
| 138 |
+
**[NOTE]** Models with commercial licenses may be excluded from evaluation. We focus on evaluating non-commercial models, such as those under Apache-2.0 or MIT licenses. <br>
|
| 139 |
+
**[NOTE]** We use your user name (via **OAuthProfile**) and your list of registered organizations (via **OAuthToken**) solely to verify submission eligibility. **This information is never stored.**<br><br>
|
| 140 |
+
|
| 141 |
+
## Evaluation Environments
|
| 142 |
+
- Submitted models are run on our internal servers to generate inference outputs, which are then evaluated using an LLM judge.
|
| 143 |
+
- Models must be runnable on up to **32 H100 GPUs** to be eligible for submission.
|
| 144 |
+
- By default, we perform inference in the vLLM 0.10.1 environment. We recommend testing your model in this environment first. You may include additional requests in the requirements section in a free‑form manner, but please note that such requests could be rejected due to the constraints of inferencing environment.
|
| 145 |
+
- We serve the model based on vLLM and perform inference through the OpenAI-compatible API (chat completion).<br><br>
|
| 146 |
+
|
| 147 |
+
## Evaluation Rules
|
| 148 |
+
- It might take more than 1 week for submitted models' scores to appear on the leaderboard.
|
| 149 |
+
- The maximum generation length is limited to **64K** tokens.
|
| 150 |
+
- Please provide a valid contact email address in the submission form so we can send notifications related to evaluation.
|
| 151 |
+
|
| 152 |
+
**[CAUTION]** If inference fails or if inappropriate content is detected, the model might be excluded from evaluation.<br><br>
|
| 153 |
+
|
| 154 |
+
## Submission Rules
|
| 155 |
+
- For Think models, you must specify the sequence that separates the thinking process and the final response (e.g., </think>) in the response_prefix field. We will use this prefix to extract the response for evaluation. (NOTE: Models that fail to provide a proper response prefix might be excluded from evaluation.)
|
| 156 |
+
- Referring to the configuration section of the submission form, provide the following in YAML format, either directly or via an uploaded `.yaml` file (if both are provided, the file takes priority):
|
| 157 |
+
- **Model serve arguments (llm_serve_args)**: vLLM-based model serving parameters ([Reference](https://docs.vllm.ai/en/latest/cli/serve.html))
|
| 158 |
+
- **Sampling parameters (sampling_params)**: Sampling parameters supported by the OpenAI API ([Reference](https://platform.openai.com/docs/api-reference/chat))
|
| 159 |
+
- **Extra body including chat template arguments (extra_body)**: `chat_template_kwargs` and sampling parameters supported by vLLM ([Reference](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters_1))
|
| 160 |
+
- Any additional specifications outside the configuration format should be written in the requirements section.
|
| 161 |
+
|
| 162 |
+
**[NOTE]** If you need to use two or more H100 GPUs, be sure to specify `tensor_parallel_size` within `llm_serve_args`.<br><br>
|
| 163 |
+
</div>
|
| 164 |
+
'''
|
| 165 |
+
|
| 166 |
+
EVALUATION_QUEUE_TEXT_OPTION1 = """
|
| 167 |
+
<div style="font-size: 1.25em !important; line-height: 1.7 !important; margin: 14px 0 !important;">
|
| 168 |
+
|
| 169 |
+
## Submission Form
|
| 170 |
+
1. Sign in using the log-in button below.
|
| 171 |
+
2. Fill the information including metadata, requirements, and configuration (fill the textbox or upload .yaml file).
|
| 172 |
+
3. Press "Submit Eval" button to submit.
|
| 173 |
+
"""
|
| 174 |
+
|
| 175 |
+
EVALUATION_QUEUE_TEXT_OPTION2 = """
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
CITATION_BUTTON_LABEL = "To be updated"
|
| 180 |
+
CITATION_BUTTON_TEXT = r"""
|
| 181 |
+
"""
|
src/config.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ON_LOAD_COLUMNS_LANG = [
|
| 2 |
+
"Model Name",
|
| 3 |
+
"Group",
|
| 4 |
+
"Overall",
|
| 5 |
+
"Med. Len.",
|
| 6 |
+
"Med. Resp. Len.",
|
| 7 |
+
"Parameter Size (B)",
|
| 8 |
+
"Type",
|
| 9 |
+
"Model Type",
|
| 10 |
+
"Think",
|
| 11 |
+
"KO",
|
| 12 |
+
"EN",
|
| 13 |
+
"JA",
|
| 14 |
+
"ZH",
|
| 15 |
+
"PL",
|
| 16 |
+
"DE",
|
| 17 |
+
"PT",
|
| 18 |
+
"ES",
|
| 19 |
+
"FR",
|
| 20 |
+
"IT",
|
| 21 |
+
"RU",
|
| 22 |
+
"VI"
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
ON_LOAD_COLUMNS_CATEGORY = [
|
| 26 |
+
"Model Name",
|
| 27 |
+
"Group",
|
| 28 |
+
"Overall",
|
| 29 |
+
"Med. Len.",
|
| 30 |
+
"Med. Resp. Len.",
|
| 31 |
+
"Parameter Size (B)",
|
| 32 |
+
"Type",
|
| 33 |
+
"Model Type",
|
| 34 |
+
"Think",
|
| 35 |
+
"Content Generation",
|
| 36 |
+
"Editing",
|
| 37 |
+
"Data Analysis",
|
| 38 |
+
"Reasoning",
|
| 39 |
+
"Hallucination",
|
| 40 |
+
"Safety",
|
| 41 |
+
"Repetition",
|
| 42 |
+
"Summarization",
|
| 43 |
+
"Translation",
|
| 44 |
+
"Multi-Turn"
|
| 45 |
+
]
|
| 46 |
+
|
| 47 |
+
CATEGORY_EXCLUDED_COLUMNS = [
|
| 48 |
+
"Model Name",
|
| 49 |
+
"Group",
|
| 50 |
+
"Med. Len.",
|
| 51 |
+
"Med. Resp. Len.",
|
| 52 |
+
"Parameter Size (B)",
|
| 53 |
+
"Type",
|
| 54 |
+
"Model Type",
|
| 55 |
+
"Think"
|
| 56 |
+
]
|
| 57 |
+
|
| 58 |
+
COLUMN_GROUP_LIST = [
|
| 59 |
+
"Category",
|
| 60 |
+
"Language"
|
| 61 |
+
]
|
src/data/length_data.json
ADDED
|
@@ -0,0 +1,1906 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Claude 4.1 Opus (20250805) (think)": {
|
| 3 |
+
"Overall": {
|
| 4 |
+
"Min": -10,
|
| 5 |
+
"Max": -2,
|
| 6 |
+
"Med": -2.0,
|
| 7 |
+
"Med Resp": -1.0
|
| 8 |
+
},
|
| 9 |
+
"Content Generation": {
|
| 10 |
+
"Min": -2,
|
| 11 |
+
"Max": -2,
|
| 12 |
+
"Med": -2.0,
|
| 13 |
+
"Med Resp": -1.0
|
| 14 |
+
},
|
| 15 |
+
"Editing": {
|
| 16 |
+
"Min": -2,
|
| 17 |
+
"Max": -2,
|
| 18 |
+
"Med": -2.0,
|
| 19 |
+
"Med Resp": -1.0
|
| 20 |
+
},
|
| 21 |
+
"Data Analysis": {
|
| 22 |
+
"Min": -2,
|
| 23 |
+
"Max": -2,
|
| 24 |
+
"Med": -2.0,
|
| 25 |
+
"Med Resp": -1.0
|
| 26 |
+
},
|
| 27 |
+
"Reasoning": {
|
| 28 |
+
"Min": -2,
|
| 29 |
+
"Max": -2,
|
| 30 |
+
"Med": -2.0,
|
| 31 |
+
"Med Resp": -1.0
|
| 32 |
+
},
|
| 33 |
+
"Hallucination": {
|
| 34 |
+
"Min": -2,
|
| 35 |
+
"Max": -2,
|
| 36 |
+
"Med": -2.0,
|
| 37 |
+
"Med Resp": -1.0
|
| 38 |
+
},
|
| 39 |
+
"Safety": {
|
| 40 |
+
"Min": -2,
|
| 41 |
+
"Max": -2,
|
| 42 |
+
"Med": -2.0,
|
| 43 |
+
"Med Resp": -1.0
|
| 44 |
+
},
|
| 45 |
+
"Repetition": {
|
| 46 |
+
"Min": -2,
|
| 47 |
+
"Max": -2,
|
| 48 |
+
"Med": -2.0,
|
| 49 |
+
"Med Resp": -1.0
|
| 50 |
+
},
|
| 51 |
+
"Summarization": {
|
| 52 |
+
"Min": -2,
|
| 53 |
+
"Max": -2,
|
| 54 |
+
"Med": -2.0,
|
| 55 |
+
"Med Resp": -1.0
|
| 56 |
+
},
|
| 57 |
+
"Translation": {
|
| 58 |
+
"Min": -2,
|
| 59 |
+
"Max": -2,
|
| 60 |
+
"Med": -2.0,
|
| 61 |
+
"Med Resp": -1.0
|
| 62 |
+
},
|
| 63 |
+
"Multi-Turn": {
|
| 64 |
+
"Min": -10,
|
| 65 |
+
"Max": -4,
|
| 66 |
+
"Med": -6.0,
|
| 67 |
+
"Med Resp": -3.0
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
"EXAONE 4.0 32B (think)": {
|
| 71 |
+
"Overall": {
|
| 72 |
+
"Min": 37,
|
| 73 |
+
"Max": 142387,
|
| 74 |
+
"Med": 1274.5,
|
| 75 |
+
"Med Resp": 503.0
|
| 76 |
+
},
|
| 77 |
+
"Content Generation": {
|
| 78 |
+
"Min": 160,
|
| 79 |
+
"Max": 131068,
|
| 80 |
+
"Med": 1178.5,
|
| 81 |
+
"Med Resp": 559.0
|
| 82 |
+
},
|
| 83 |
+
"Editing": {
|
| 84 |
+
"Min": 37,
|
| 85 |
+
"Max": 10786,
|
| 86 |
+
"Med": 1041.0,
|
| 87 |
+
"Med Resp": 423.5
|
| 88 |
+
},
|
| 89 |
+
"Data Analysis": {
|
| 90 |
+
"Min": 229,
|
| 91 |
+
"Max": 131072,
|
| 92 |
+
"Med": 1412.0,
|
| 93 |
+
"Med Resp": 345.0
|
| 94 |
+
},
|
| 95 |
+
"Reasoning": {
|
| 96 |
+
"Min": 567,
|
| 97 |
+
"Max": 131076,
|
| 98 |
+
"Med": 3961.5,
|
| 99 |
+
"Med Resp": 585.5
|
| 100 |
+
},
|
| 101 |
+
"Hallucination": {
|
| 102 |
+
"Min": 298,
|
| 103 |
+
"Max": 65533,
|
| 104 |
+
"Med": 1247.5,
|
| 105 |
+
"Med Resp": 627.5
|
| 106 |
+
},
|
| 107 |
+
"Safety": {
|
| 108 |
+
"Min": 227,
|
| 109 |
+
"Max": 5093,
|
| 110 |
+
"Med": 1145.0,
|
| 111 |
+
"Med Resp": 589.0
|
| 112 |
+
},
|
| 113 |
+
"Repetition": {
|
| 114 |
+
"Min": 441,
|
| 115 |
+
"Max": 131072,
|
| 116 |
+
"Med": 1744.5,
|
| 117 |
+
"Med Resp": 579.5
|
| 118 |
+
},
|
| 119 |
+
"Summarization": {
|
| 120 |
+
"Min": 149,
|
| 121 |
+
"Max": 8423,
|
| 122 |
+
"Med": 693.5,
|
| 123 |
+
"Med Resp": 311.0
|
| 124 |
+
},
|
| 125 |
+
"Translation": {
|
| 126 |
+
"Min": 227,
|
| 127 |
+
"Max": 14234,
|
| 128 |
+
"Med": 915.0,
|
| 129 |
+
"Med Resp": 411.5
|
| 130 |
+
},
|
| 131 |
+
"Multi-Turn": {
|
| 132 |
+
"Min": 390,
|
| 133 |
+
"Max": 142387,
|
| 134 |
+
"Med": 3222.0,
|
| 135 |
+
"Med Resp": 1488.0
|
| 136 |
+
}
|
| 137 |
+
},
|
| 138 |
+
"DeepSeek V3.1 (think)": {
|
| 139 |
+
"Overall": {
|
| 140 |
+
"Min": 80,
|
| 141 |
+
"Max": 31147,
|
| 142 |
+
"Med": 710.5,
|
| 143 |
+
"Med Resp": 356.0
|
| 144 |
+
},
|
| 145 |
+
"Content Generation": {
|
| 146 |
+
"Min": 132,
|
| 147 |
+
"Max": 5354,
|
| 148 |
+
"Med": 776.5,
|
| 149 |
+
"Med Resp": 500.0
|
| 150 |
+
},
|
| 151 |
+
"Editing": {
|
| 152 |
+
"Min": 119,
|
| 153 |
+
"Max": 2063,
|
| 154 |
+
"Med": 571.0,
|
| 155 |
+
"Med Resp": 287.0
|
| 156 |
+
},
|
| 157 |
+
"Data Analysis": {
|
| 158 |
+
"Min": 119,
|
| 159 |
+
"Max": 13106,
|
| 160 |
+
"Med": 644.0,
|
| 161 |
+
"Med Resp": 218.0
|
| 162 |
+
},
|
| 163 |
+
"Reasoning": {
|
| 164 |
+
"Min": 259,
|
| 165 |
+
"Max": 31147,
|
| 166 |
+
"Med": 1340.5,
|
| 167 |
+
"Med Resp": 338.0
|
| 168 |
+
},
|
| 169 |
+
"Hallucination": {
|
| 170 |
+
"Min": 206,
|
| 171 |
+
"Max": 10356,
|
| 172 |
+
"Med": 1132.5,
|
| 173 |
+
"Med Resp": 667.0
|
| 174 |
+
},
|
| 175 |
+
"Safety": {
|
| 176 |
+
"Min": 80,
|
| 177 |
+
"Max": 3412,
|
| 178 |
+
"Med": 565.0,
|
| 179 |
+
"Med Resp": 206.0
|
| 180 |
+
},
|
| 181 |
+
"Repetition": {
|
| 182 |
+
"Min": 290,
|
| 183 |
+
"Max": 6553,
|
| 184 |
+
"Med": 826.5,
|
| 185 |
+
"Med Resp": 450.0
|
| 186 |
+
},
|
| 187 |
+
"Summarization": {
|
| 188 |
+
"Min": 148,
|
| 189 |
+
"Max": 1533,
|
| 190 |
+
"Med": 432.0,
|
| 191 |
+
"Med Resp": 211.5
|
| 192 |
+
},
|
| 193 |
+
"Translation": {
|
| 194 |
+
"Min": 147,
|
| 195 |
+
"Max": 7448,
|
| 196 |
+
"Med": 554.5,
|
| 197 |
+
"Med Resp": 320.0
|
| 198 |
+
},
|
| 199 |
+
"Multi-Turn": {
|
| 200 |
+
"Min": 324,
|
| 201 |
+
"Max": 7862,
|
| 202 |
+
"Med": 2558.5,
|
| 203 |
+
"Med Resp": 1545.0
|
| 204 |
+
}
|
| 205 |
+
},
|
| 206 |
+
"o4-mini": {
|
| 207 |
+
"Overall": {
|
| 208 |
+
"Min": -10,
|
| 209 |
+
"Max": -2,
|
| 210 |
+
"Med": -2.0,
|
| 211 |
+
"Med Resp": -1.0
|
| 212 |
+
},
|
| 213 |
+
"Content Generation": {
|
| 214 |
+
"Min": -2,
|
| 215 |
+
"Max": -2,
|
| 216 |
+
"Med": -2.0,
|
| 217 |
+
"Med Resp": -1.0
|
| 218 |
+
},
|
| 219 |
+
"Editing": {
|
| 220 |
+
"Min": -2,
|
| 221 |
+
"Max": -2,
|
| 222 |
+
"Med": -2.0,
|
| 223 |
+
"Med Resp": -1.0
|
| 224 |
+
},
|
| 225 |
+
"Data Analysis": {
|
| 226 |
+
"Min": -2,
|
| 227 |
+
"Max": -2,
|
| 228 |
+
"Med": -2.0,
|
| 229 |
+
"Med Resp": -1.0
|
| 230 |
+
},
|
| 231 |
+
"Reasoning": {
|
| 232 |
+
"Min": -2,
|
| 233 |
+
"Max": -2,
|
| 234 |
+
"Med": -2.0,
|
| 235 |
+
"Med Resp": -1.0
|
| 236 |
+
},
|
| 237 |
+
"Hallucination": {
|
| 238 |
+
"Min": -2,
|
| 239 |
+
"Max": -2,
|
| 240 |
+
"Med": -2.0,
|
| 241 |
+
"Med Resp": -1.0
|
| 242 |
+
},
|
| 243 |
+
"Safety": {
|
| 244 |
+
"Min": -2,
|
| 245 |
+
"Max": -2,
|
| 246 |
+
"Med": -2.0,
|
| 247 |
+
"Med Resp": -1.0
|
| 248 |
+
},
|
| 249 |
+
"Repetition": {
|
| 250 |
+
"Min": -2,
|
| 251 |
+
"Max": -2,
|
| 252 |
+
"Med": -2.0,
|
| 253 |
+
"Med Resp": -1.0
|
| 254 |
+
},
|
| 255 |
+
"Summarization": {
|
| 256 |
+
"Min": -2,
|
| 257 |
+
"Max": -2,
|
| 258 |
+
"Med": -2.0,
|
| 259 |
+
"Med Resp": -1.0
|
| 260 |
+
},
|
| 261 |
+
"Translation": {
|
| 262 |
+
"Min": -2,
|
| 263 |
+
"Max": -2,
|
| 264 |
+
"Med": -2.0,
|
| 265 |
+
"Med Resp": -1.0
|
| 266 |
+
},
|
| 267 |
+
"Multi-Turn": {
|
| 268 |
+
"Min": -10,
|
| 269 |
+
"Max": -4,
|
| 270 |
+
"Med": -6.0,
|
| 271 |
+
"Med Resp": -3.0
|
| 272 |
+
}
|
| 273 |
+
},
|
| 274 |
+
"Gemini 2.5 Flash": {
|
| 275 |
+
"Overall": {
|
| 276 |
+
"Min": -10,
|
| 277 |
+
"Max": -2,
|
| 278 |
+
"Med": -2.0,
|
| 279 |
+
"Med Resp": -1.0
|
| 280 |
+
},
|
| 281 |
+
"Content Generation": {
|
| 282 |
+
"Min": -2,
|
| 283 |
+
"Max": -2,
|
| 284 |
+
"Med": -2.0,
|
| 285 |
+
"Med Resp": -1.0
|
| 286 |
+
},
|
| 287 |
+
"Editing": {
|
| 288 |
+
"Min": -2,
|
| 289 |
+
"Max": -2,
|
| 290 |
+
"Med": -2.0,
|
| 291 |
+
"Med Resp": -1.0
|
| 292 |
+
},
|
| 293 |
+
"Data Analysis": {
|
| 294 |
+
"Min": -2,
|
| 295 |
+
"Max": -2,
|
| 296 |
+
"Med": -2.0,
|
| 297 |
+
"Med Resp": -1.0
|
| 298 |
+
},
|
| 299 |
+
"Reasoning": {
|
| 300 |
+
"Min": -2,
|
| 301 |
+
"Max": -2,
|
| 302 |
+
"Med": -2.0,
|
| 303 |
+
"Med Resp": -1.0
|
| 304 |
+
},
|
| 305 |
+
"Hallucination": {
|
| 306 |
+
"Min": -2,
|
| 307 |
+
"Max": -2,
|
| 308 |
+
"Med": -2.0,
|
| 309 |
+
"Med Resp": -1.0
|
| 310 |
+
},
|
| 311 |
+
"Safety": {
|
| 312 |
+
"Min": -2,
|
| 313 |
+
"Max": -2,
|
| 314 |
+
"Med": -2.0,
|
| 315 |
+
"Med Resp": -1.0
|
| 316 |
+
},
|
| 317 |
+
"Repetition": {
|
| 318 |
+
"Min": -2,
|
| 319 |
+
"Max": -2,
|
| 320 |
+
"Med": -2.0,
|
| 321 |
+
"Med Resp": -1.0
|
| 322 |
+
},
|
| 323 |
+
"Summarization": {
|
| 324 |
+
"Min": -2,
|
| 325 |
+
"Max": -2,
|
| 326 |
+
"Med": -2.0,
|
| 327 |
+
"Med Resp": -1.0
|
| 328 |
+
},
|
| 329 |
+
"Translation": {
|
| 330 |
+
"Min": -2,
|
| 331 |
+
"Max": -2,
|
| 332 |
+
"Med": -2.0,
|
| 333 |
+
"Med Resp": -1.0
|
| 334 |
+
},
|
| 335 |
+
"Multi-Turn": {
|
| 336 |
+
"Min": -10,
|
| 337 |
+
"Max": -4,
|
| 338 |
+
"Med": -6.0,
|
| 339 |
+
"Med Resp": -3.0
|
| 340 |
+
}
|
| 341 |
+
},
|
| 342 |
+
"Claude 4 Sonnet (20250514) (think)": {
|
| 343 |
+
"Overall": {
|
| 344 |
+
"Min": -10,
|
| 345 |
+
"Max": -2,
|
| 346 |
+
"Med": -2.0,
|
| 347 |
+
"Med Resp": -1.0
|
| 348 |
+
},
|
| 349 |
+
"Content Generation": {
|
| 350 |
+
"Min": -2,
|
| 351 |
+
"Max": -2,
|
| 352 |
+
"Med": -2.0,
|
| 353 |
+
"Med Resp": -1.0
|
| 354 |
+
},
|
| 355 |
+
"Editing": {
|
| 356 |
+
"Min": -2,
|
| 357 |
+
"Max": -2,
|
| 358 |
+
"Med": -2.0,
|
| 359 |
+
"Med Resp": -1.0
|
| 360 |
+
},
|
| 361 |
+
"Data Analysis": {
|
| 362 |
+
"Min": -2,
|
| 363 |
+
"Max": -2,
|
| 364 |
+
"Med": -2.0,
|
| 365 |
+
"Med Resp": -1.0
|
| 366 |
+
},
|
| 367 |
+
"Reasoning": {
|
| 368 |
+
"Min": -2,
|
| 369 |
+
"Max": -2,
|
| 370 |
+
"Med": -2.0,
|
| 371 |
+
"Med Resp": -1.0
|
| 372 |
+
},
|
| 373 |
+
"Hallucination": {
|
| 374 |
+
"Min": -2,
|
| 375 |
+
"Max": -2,
|
| 376 |
+
"Med": -2.0,
|
| 377 |
+
"Med Resp": -1.0
|
| 378 |
+
},
|
| 379 |
+
"Safety": {
|
| 380 |
+
"Min": -2,
|
| 381 |
+
"Max": -2,
|
| 382 |
+
"Med": -2.0,
|
| 383 |
+
"Med Resp": -1.0
|
| 384 |
+
},
|
| 385 |
+
"Repetition": {
|
| 386 |
+
"Min": -2,
|
| 387 |
+
"Max": -2,
|
| 388 |
+
"Med": -2.0,
|
| 389 |
+
"Med Resp": -1.0
|
| 390 |
+
},
|
| 391 |
+
"Summarization": {
|
| 392 |
+
"Min": -2,
|
| 393 |
+
"Max": -2,
|
| 394 |
+
"Med": -2.0,
|
| 395 |
+
"Med Resp": -1.0
|
| 396 |
+
},
|
| 397 |
+
"Translation": {
|
| 398 |
+
"Min": -2,
|
| 399 |
+
"Max": -2,
|
| 400 |
+
"Med": -2.0,
|
| 401 |
+
"Med Resp": -1.0
|
| 402 |
+
},
|
| 403 |
+
"Multi-Turn": {
|
| 404 |
+
"Min": -10,
|
| 405 |
+
"Max": -4,
|
| 406 |
+
"Med": -6.0,
|
| 407 |
+
"Med Resp": -3.0
|
| 408 |
+
}
|
| 409 |
+
},
|
| 410 |
+
"Solar Pro Preview (top_p:0.95, temp: 0.7)": {
|
| 411 |
+
"Overall": {
|
| 412 |
+
"Min": 1,
|
| 413 |
+
"Max": 4060,
|
| 414 |
+
"Med": 260.0,
|
| 415 |
+
"Med Resp": 260.0
|
| 416 |
+
},
|
| 417 |
+
"Content Generation": {
|
| 418 |
+
"Min": 15,
|
| 419 |
+
"Max": 3643,
|
| 420 |
+
"Med": 426.0,
|
| 421 |
+
"Med Resp": 426.0
|
| 422 |
+
},
|
| 423 |
+
"Editing": {
|
| 424 |
+
"Min": 14,
|
| 425 |
+
"Max": 3948,
|
| 426 |
+
"Med": 218.0,
|
| 427 |
+
"Med Resp": 218.0
|
| 428 |
+
},
|
| 429 |
+
"Data Analysis": {
|
| 430 |
+
"Min": 2,
|
| 431 |
+
"Max": 3500,
|
| 432 |
+
"Med": 89.0,
|
| 433 |
+
"Med Resp": 89.0
|
| 434 |
+
},
|
| 435 |
+
"Reasoning": {
|
| 436 |
+
"Min": 1,
|
| 437 |
+
"Max": 3338,
|
| 438 |
+
"Med": 190.5,
|
| 439 |
+
"Med Resp": 190.5
|
| 440 |
+
},
|
| 441 |
+
"Hallucination": {
|
| 442 |
+
"Min": 20,
|
| 443 |
+
"Max": 1093,
|
| 444 |
+
"Med": 128.5,
|
| 445 |
+
"Med Resp": 128.5
|
| 446 |
+
},
|
| 447 |
+
"Safety": {
|
| 448 |
+
"Min": 11,
|
| 449 |
+
"Max": 1507,
|
| 450 |
+
"Med": 92.0,
|
| 451 |
+
"Med Resp": 92.0
|
| 452 |
+
},
|
| 453 |
+
"Repetition": {
|
| 454 |
+
"Min": 34,
|
| 455 |
+
"Max": 4060,
|
| 456 |
+
"Med": 214.0,
|
| 457 |
+
"Med Resp": 214.0
|
| 458 |
+
},
|
| 459 |
+
"Summarization": {
|
| 460 |
+
"Min": 43,
|
| 461 |
+
"Max": 2478,
|
| 462 |
+
"Med": 218.0,
|
| 463 |
+
"Med Resp": 218.0
|
| 464 |
+
},
|
| 465 |
+
"Translation": {
|
| 466 |
+
"Min": 20,
|
| 467 |
+
"Max": 1711,
|
| 468 |
+
"Med": 360.0,
|
| 469 |
+
"Med Resp": 360.0
|
| 470 |
+
},
|
| 471 |
+
"Multi-Turn": {
|
| 472 |
+
"Min": 5,
|
| 473 |
+
"Max": 3353,
|
| 474 |
+
"Med": 530.0,
|
| 475 |
+
"Med Resp": 530.0
|
| 476 |
+
}
|
| 477 |
+
},
|
| 478 |
+
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)": {
|
| 479 |
+
"Overall": {
|
| 480 |
+
"Min": 4,
|
| 481 |
+
"Max": 16917,
|
| 482 |
+
"Med": 1177.5,
|
| 483 |
+
"Med Resp": 554.0
|
| 484 |
+
},
|
| 485 |
+
"Content Generation": {
|
| 486 |
+
"Min": 389,
|
| 487 |
+
"Max": 7861,
|
| 488 |
+
"Med": 1261.5,
|
| 489 |
+
"Med Resp": 694.0
|
| 490 |
+
},
|
| 491 |
+
"Editing": {
|
| 492 |
+
"Min": 4,
|
| 493 |
+
"Max": 7611,
|
| 494 |
+
"Med": 1054.5,
|
| 495 |
+
"Med Resp": 517.5
|
| 496 |
+
},
|
| 497 |
+
"Data Analysis": {
|
| 498 |
+
"Min": 4,
|
| 499 |
+
"Max": 8191,
|
| 500 |
+
"Med": 1112.0,
|
| 501 |
+
"Med Resp": 355.0
|
| 502 |
+
},
|
| 503 |
+
"Reasoning": {
|
| 504 |
+
"Min": 4,
|
| 505 |
+
"Max": 12257,
|
| 506 |
+
"Med": 1913.0,
|
| 507 |
+
"Med Resp": 455.5
|
| 508 |
+
},
|
| 509 |
+
"Hallucination": {
|
| 510 |
+
"Min": 4,
|
| 511 |
+
"Max": 7390,
|
| 512 |
+
"Med": 1214.5,
|
| 513 |
+
"Med Resp": 682.0
|
| 514 |
+
},
|
| 515 |
+
"Safety": {
|
| 516 |
+
"Min": 227,
|
| 517 |
+
"Max": 6387,
|
| 518 |
+
"Med": 963.0,
|
| 519 |
+
"Med Resp": 568.0
|
| 520 |
+
},
|
| 521 |
+
"Repetition": {
|
| 522 |
+
"Min": 4,
|
| 523 |
+
"Max": 7787,
|
| 524 |
+
"Med": 1405.5,
|
| 525 |
+
"Med Resp": 646.5
|
| 526 |
+
},
|
| 527 |
+
"Summarization": {
|
| 528 |
+
"Min": 319,
|
| 529 |
+
"Max": 2613,
|
| 530 |
+
"Med": 711.5,
|
| 531 |
+
"Med Resp": 321.0
|
| 532 |
+
},
|
| 533 |
+
"Translation": {
|
| 534 |
+
"Min": 4,
|
| 535 |
+
"Max": 7687,
|
| 536 |
+
"Med": 1021.0,
|
| 537 |
+
"Med Resp": 561.5
|
| 538 |
+
},
|
| 539 |
+
"Multi-Turn": {
|
| 540 |
+
"Min": 448,
|
| 541 |
+
"Max": 16917,
|
| 542 |
+
"Med": 3418.5,
|
| 543 |
+
"Med Resp": 1874.0
|
| 544 |
+
}
|
| 545 |
+
},
|
| 546 |
+
"A.X 4.0": {
|
| 547 |
+
"Overall": {
|
| 548 |
+
"Min": 1,
|
| 549 |
+
"Max": 65581,
|
| 550 |
+
"Med": 412.5,
|
| 551 |
+
"Med Resp": 412.5
|
| 552 |
+
},
|
| 553 |
+
"Content Generation": {
|
| 554 |
+
"Min": 2,
|
| 555 |
+
"Max": 65581,
|
| 556 |
+
"Med": 543.0,
|
| 557 |
+
"Med Resp": 543.0
|
| 558 |
+
},
|
| 559 |
+
"Editing": {
|
| 560 |
+
"Min": 8,
|
| 561 |
+
"Max": 1791,
|
| 562 |
+
"Med": 250.0,
|
| 563 |
+
"Med Resp": 250.0
|
| 564 |
+
},
|
| 565 |
+
"Data Analysis": {
|
| 566 |
+
"Min": 1,
|
| 567 |
+
"Max": 65537,
|
| 568 |
+
"Med": 267.0,
|
| 569 |
+
"Med Resp": 267.0
|
| 570 |
+
},
|
| 571 |
+
"Reasoning": {
|
| 572 |
+
"Min": 2,
|
| 573 |
+
"Max": 2046,
|
| 574 |
+
"Med": 498.0,
|
| 575 |
+
"Med Resp": 498.0
|
| 576 |
+
},
|
| 577 |
+
"Hallucination": {
|
| 578 |
+
"Min": 12,
|
| 579 |
+
"Max": 2639,
|
| 580 |
+
"Med": 511.5,
|
| 581 |
+
"Med Resp": 511.5
|
| 582 |
+
},
|
| 583 |
+
"Safety": {
|
| 584 |
+
"Min": 4,
|
| 585 |
+
"Max": 2942,
|
| 586 |
+
"Med": 516.0,
|
| 587 |
+
"Med Resp": 516.0
|
| 588 |
+
},
|
| 589 |
+
"Repetition": {
|
| 590 |
+
"Min": 84,
|
| 591 |
+
"Max": 65536,
|
| 592 |
+
"Med": 341.5,
|
| 593 |
+
"Med Resp": 341.5
|
| 594 |
+
},
|
| 595 |
+
"Summarization": {
|
| 596 |
+
"Min": 26,
|
| 597 |
+
"Max": 2369,
|
| 598 |
+
"Med": 282.0,
|
| 599 |
+
"Med Resp": 282.0
|
| 600 |
+
},
|
| 601 |
+
"Translation": {
|
| 602 |
+
"Min": 7,
|
| 603 |
+
"Max": 35068,
|
| 604 |
+
"Med": 343.0,
|
| 605 |
+
"Med Resp": 343.0
|
| 606 |
+
},
|
| 607 |
+
"Multi-Turn": {
|
| 608 |
+
"Min": 3,
|
| 609 |
+
"Max": 9420,
|
| 610 |
+
"Med": 1455.0,
|
| 611 |
+
"Med Resp": 1455.0
|
| 612 |
+
}
|
| 613 |
+
},
|
| 614 |
+
"GPT-5 (Reasoning: medium)": {
|
| 615 |
+
"Overall": {
|
| 616 |
+
"Min": -10,
|
| 617 |
+
"Max": -2,
|
| 618 |
+
"Med": -2.0,
|
| 619 |
+
"Med Resp": -1.0
|
| 620 |
+
},
|
| 621 |
+
"Content Generation": {
|
| 622 |
+
"Min": -2,
|
| 623 |
+
"Max": -2,
|
| 624 |
+
"Med": -2.0,
|
| 625 |
+
"Med Resp": -1.0
|
| 626 |
+
},
|
| 627 |
+
"Editing": {
|
| 628 |
+
"Min": -2,
|
| 629 |
+
"Max": -2,
|
| 630 |
+
"Med": -2.0,
|
| 631 |
+
"Med Resp": -1.0
|
| 632 |
+
},
|
| 633 |
+
"Data Analysis": {
|
| 634 |
+
"Min": -2,
|
| 635 |
+
"Max": -2,
|
| 636 |
+
"Med": -2.0,
|
| 637 |
+
"Med Resp": -1.0
|
| 638 |
+
},
|
| 639 |
+
"Reasoning": {
|
| 640 |
+
"Min": -2,
|
| 641 |
+
"Max": -2,
|
| 642 |
+
"Med": -2.0,
|
| 643 |
+
"Med Resp": -1.0
|
| 644 |
+
},
|
| 645 |
+
"Hallucination": {
|
| 646 |
+
"Min": -2,
|
| 647 |
+
"Max": -2,
|
| 648 |
+
"Med": -2.0,
|
| 649 |
+
"Med Resp": -1.0
|
| 650 |
+
},
|
| 651 |
+
"Safety": {
|
| 652 |
+
"Min": -2,
|
| 653 |
+
"Max": -2,
|
| 654 |
+
"Med": -2.0,
|
| 655 |
+
"Med Resp": -1.0
|
| 656 |
+
},
|
| 657 |
+
"Repetition": {
|
| 658 |
+
"Min": -2,
|
| 659 |
+
"Max": -2,
|
| 660 |
+
"Med": -2.0,
|
| 661 |
+
"Med Resp": -1.0
|
| 662 |
+
},
|
| 663 |
+
"Summarization": {
|
| 664 |
+
"Min": -2,
|
| 665 |
+
"Max": -2,
|
| 666 |
+
"Med": -2.0,
|
| 667 |
+
"Med Resp": -1.0
|
| 668 |
+
},
|
| 669 |
+
"Translation": {
|
| 670 |
+
"Min": -2,
|
| 671 |
+
"Max": -2,
|
| 672 |
+
"Med": -2.0,
|
| 673 |
+
"Med Resp": -1.0
|
| 674 |
+
},
|
| 675 |
+
"Multi-Turn": {
|
| 676 |
+
"Min": -10,
|
| 677 |
+
"Max": -4,
|
| 678 |
+
"Med": -6.0,
|
| 679 |
+
"Med Resp": -3.0
|
| 680 |
+
}
|
| 681 |
+
},
|
| 682 |
+
"Kanana 1.5 15.7B A3B Instruct": {
|
| 683 |
+
"Overall": {
|
| 684 |
+
"Min": 1,
|
| 685 |
+
"Max": 34276,
|
| 686 |
+
"Med": 414.0,
|
| 687 |
+
"Med Resp": 414.0
|
| 688 |
+
},
|
| 689 |
+
"Content Generation": {
|
| 690 |
+
"Min": 10,
|
| 691 |
+
"Max": 22194,
|
| 692 |
+
"Med": 463.5,
|
| 693 |
+
"Med Resp": 463.5
|
| 694 |
+
},
|
| 695 |
+
"Editing": {
|
| 696 |
+
"Min": 5,
|
| 697 |
+
"Max": 1311,
|
| 698 |
+
"Med": 249.5,
|
| 699 |
+
"Med Resp": 249.5
|
| 700 |
+
},
|
| 701 |
+
"Data Analysis": {
|
| 702 |
+
"Min": 1,
|
| 703 |
+
"Max": 22211,
|
| 704 |
+
"Med": 396.0,
|
| 705 |
+
"Med Resp": 396.0
|
| 706 |
+
},
|
| 707 |
+
"Reasoning": {
|
| 708 |
+
"Min": 1,
|
| 709 |
+
"Max": 20275,
|
| 710 |
+
"Med": 581.0,
|
| 711 |
+
"Med Resp": 581.0
|
| 712 |
+
},
|
| 713 |
+
"Hallucination": {
|
| 714 |
+
"Min": 24,
|
| 715 |
+
"Max": 21645,
|
| 716 |
+
"Med": 441.5,
|
| 717 |
+
"Med Resp": 441.5
|
| 718 |
+
},
|
| 719 |
+
"Safety": {
|
| 720 |
+
"Min": 18,
|
| 721 |
+
"Max": 1531,
|
| 722 |
+
"Med": 414.0,
|
| 723 |
+
"Med Resp": 414.0
|
| 724 |
+
},
|
| 725 |
+
"Repetition": {
|
| 726 |
+
"Min": 76,
|
| 727 |
+
"Max": 1912,
|
| 728 |
+
"Med": 299.5,
|
| 729 |
+
"Med Resp": 299.5
|
| 730 |
+
},
|
| 731 |
+
"Summarization": {
|
| 732 |
+
"Min": 1,
|
| 733 |
+
"Max": 29578,
|
| 734 |
+
"Med": 275.5,
|
| 735 |
+
"Med Resp": 275.5
|
| 736 |
+
},
|
| 737 |
+
"Translation": {
|
| 738 |
+
"Min": 9,
|
| 739 |
+
"Max": 31839,
|
| 740 |
+
"Med": 308.5,
|
| 741 |
+
"Med Resp": 308.5
|
| 742 |
+
},
|
| 743 |
+
"Multi-Turn": {
|
| 744 |
+
"Min": 3,
|
| 745 |
+
"Max": 34276,
|
| 746 |
+
"Med": 1167.5,
|
| 747 |
+
"Med Resp": 1167.5
|
| 748 |
+
}
|
| 749 |
+
},
|
| 750 |
+
"DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)": {
|
| 751 |
+
"Overall": {
|
| 752 |
+
"Min": 1,
|
| 753 |
+
"Max": 5178,
|
| 754 |
+
"Med": 408.0,
|
| 755 |
+
"Med Resp": 408.0
|
| 756 |
+
},
|
| 757 |
+
"Content Generation": {
|
| 758 |
+
"Min": 7,
|
| 759 |
+
"Max": 1974,
|
| 760 |
+
"Med": 439.5,
|
| 761 |
+
"Med Resp": 439.5
|
| 762 |
+
},
|
| 763 |
+
"Editing": {
|
| 764 |
+
"Min": 5,
|
| 765 |
+
"Max": 1192,
|
| 766 |
+
"Med": 293.0,
|
| 767 |
+
"Med Resp": 293.0
|
| 768 |
+
},
|
| 769 |
+
"Data Analysis": {
|
| 770 |
+
"Min": 1,
|
| 771 |
+
"Max": 3155,
|
| 772 |
+
"Med": 330.0,
|
| 773 |
+
"Med Resp": 330.0
|
| 774 |
+
},
|
| 775 |
+
"Reasoning": {
|
| 776 |
+
"Min": 63,
|
| 777 |
+
"Max": 5178,
|
| 778 |
+
"Med": 519.0,
|
| 779 |
+
"Med Resp": 519.0
|
| 780 |
+
},
|
| 781 |
+
"Hallucination": {
|
| 782 |
+
"Min": 57,
|
| 783 |
+
"Max": 1621,
|
| 784 |
+
"Med": 502.5,
|
| 785 |
+
"Med Resp": 502.5
|
| 786 |
+
},
|
| 787 |
+
"Safety": {
|
| 788 |
+
"Min": 12,
|
| 789 |
+
"Max": 1726,
|
| 790 |
+
"Med": 337.0,
|
| 791 |
+
"Med Resp": 337.0
|
| 792 |
+
},
|
| 793 |
+
"Repetition": {
|
| 794 |
+
"Min": 98,
|
| 795 |
+
"Max": 2754,
|
| 796 |
+
"Med": 406.5,
|
| 797 |
+
"Med Resp": 406.5
|
| 798 |
+
},
|
| 799 |
+
"Summarization": {
|
| 800 |
+
"Min": 32,
|
| 801 |
+
"Max": 959,
|
| 802 |
+
"Med": 251.0,
|
| 803 |
+
"Med Resp": 251.0
|
| 804 |
+
},
|
| 805 |
+
"Translation": {
|
| 806 |
+
"Min": 60,
|
| 807 |
+
"Max": 2197,
|
| 808 |
+
"Med": 351.5,
|
| 809 |
+
"Med Resp": 351.5
|
| 810 |
+
},
|
| 811 |
+
"Multi-Turn": {
|
| 812 |
+
"Min": 4,
|
| 813 |
+
"Max": 4959,
|
| 814 |
+
"Med": 1318.5,
|
| 815 |
+
"Med Resp": 1318.5
|
| 816 |
+
}
|
| 817 |
+
},
|
| 818 |
+
"GLM-4.5 FP8 (think)": {
|
| 819 |
+
"Overall": {
|
| 820 |
+
"Min": 75,
|
| 821 |
+
"Max": 65432,
|
| 822 |
+
"Med": 1442.0,
|
| 823 |
+
"Med Resp": 604.0
|
| 824 |
+
},
|
| 825 |
+
"Content Generation": {
|
| 826 |
+
"Min": 322,
|
| 827 |
+
"Max": 9320,
|
| 828 |
+
"Med": 1283.0,
|
| 829 |
+
"Med Resp": 655.5
|
| 830 |
+
},
|
| 831 |
+
"Editing": {
|
| 832 |
+
"Min": 232,
|
| 833 |
+
"Max": 10227,
|
| 834 |
+
"Med": 1163.5,
|
| 835 |
+
"Med Resp": 571.0
|
| 836 |
+
},
|
| 837 |
+
"Data Analysis": {
|
| 838 |
+
"Min": 318,
|
| 839 |
+
"Max": 15748,
|
| 840 |
+
"Med": 1328.0,
|
| 841 |
+
"Med Resp": 481.0
|
| 842 |
+
},
|
| 843 |
+
"Reasoning": {
|
| 844 |
+
"Min": 558,
|
| 845 |
+
"Max": 65432,
|
| 846 |
+
"Med": 3187.5,
|
| 847 |
+
"Med Resp": 653.0
|
| 848 |
+
},
|
| 849 |
+
"Hallucination": {
|
| 850 |
+
"Min": 75,
|
| 851 |
+
"Max": 10541,
|
| 852 |
+
"Med": 1546.5,
|
| 853 |
+
"Med Resp": 962.5
|
| 854 |
+
},
|
| 855 |
+
"Safety": {
|
| 856 |
+
"Min": 159,
|
| 857 |
+
"Max": 5552,
|
| 858 |
+
"Med": 1418.0,
|
| 859 |
+
"Med Resp": 808.0
|
| 860 |
+
},
|
| 861 |
+
"Repetition": {
|
| 862 |
+
"Min": 284,
|
| 863 |
+
"Max": 65409,
|
| 864 |
+
"Med": 1492.0,
|
| 865 |
+
"Med Resp": 729.5
|
| 866 |
+
},
|
| 867 |
+
"Summarization": {
|
| 868 |
+
"Min": 242,
|
| 869 |
+
"Max": 3610,
|
| 870 |
+
"Med": 688.5,
|
| 871 |
+
"Med Resp": 268.0
|
| 872 |
+
},
|
| 873 |
+
"Translation": {
|
| 874 |
+
"Min": 156,
|
| 875 |
+
"Max": 10043,
|
| 876 |
+
"Med": 1448.5,
|
| 877 |
+
"Med Resp": 414.0
|
| 878 |
+
},
|
| 879 |
+
"Multi-Turn": {
|
| 880 |
+
"Min": 630,
|
| 881 |
+
"Max": 15831,
|
| 882 |
+
"Med": 3977.5,
|
| 883 |
+
"Med Resp": 2277.5
|
| 884 |
+
}
|
| 885 |
+
},
|
| 886 |
+
"Gauss2.3 Hybrid": {
|
| 887 |
+
"Overall": {
|
| 888 |
+
"Min": 7,
|
| 889 |
+
"Max": 134423,
|
| 890 |
+
"Med": 546.0,
|
| 891 |
+
"Med Resp": 308.0
|
| 892 |
+
},
|
| 893 |
+
"Content Generation": {
|
| 894 |
+
"Min": 16,
|
| 895 |
+
"Max": 6706,
|
| 896 |
+
"Med": 470.0,
|
| 897 |
+
"Med Resp": 416.5
|
| 898 |
+
},
|
| 899 |
+
"Editing": {
|
| 900 |
+
"Min": 9,
|
| 901 |
+
"Max": 2943,
|
| 902 |
+
"Med": 219.0,
|
| 903 |
+
"Med Resp": 188.5
|
| 904 |
+
},
|
| 905 |
+
"Data Analysis": {
|
| 906 |
+
"Min": 23,
|
| 907 |
+
"Max": 131072,
|
| 908 |
+
"Med": 585.0,
|
| 909 |
+
"Med Resp": 192.0
|
| 910 |
+
},
|
| 911 |
+
"Reasoning": {
|
| 912 |
+
"Min": 329,
|
| 913 |
+
"Max": 131072,
|
| 914 |
+
"Med": 2091.0,
|
| 915 |
+
"Med Resp": 387.0
|
| 916 |
+
},
|
| 917 |
+
"Hallucination": {
|
| 918 |
+
"Min": 20,
|
| 919 |
+
"Max": 131072,
|
| 920 |
+
"Med": 972.5,
|
| 921 |
+
"Med Resp": 387.0
|
| 922 |
+
},
|
| 923 |
+
"Safety": {
|
| 924 |
+
"Min": 20,
|
| 925 |
+
"Max": 131072,
|
| 926 |
+
"Med": 603.0,
|
| 927 |
+
"Med Resp": 270.0
|
| 928 |
+
},
|
| 929 |
+
"Repetition": {
|
| 930 |
+
"Min": 60,
|
| 931 |
+
"Max": 131085,
|
| 932 |
+
"Med": 869.5,
|
| 933 |
+
"Med Resp": 392.0
|
| 934 |
+
},
|
| 935 |
+
"Summarization": {
|
| 936 |
+
"Min": 26,
|
| 937 |
+
"Max": 2114,
|
| 938 |
+
"Med": 320.0,
|
| 939 |
+
"Med Resp": 208.0
|
| 940 |
+
},
|
| 941 |
+
"Translation": {
|
| 942 |
+
"Min": 7,
|
| 943 |
+
"Max": 71270,
|
| 944 |
+
"Med": 322.0,
|
| 945 |
+
"Med Resp": 273.0
|
| 946 |
+
},
|
| 947 |
+
"Multi-Turn": {
|
| 948 |
+
"Min": 7,
|
| 949 |
+
"Max": 134423,
|
| 950 |
+
"Med": 2478.5,
|
| 951 |
+
"Med Resp": 1208.5
|
| 952 |
+
}
|
| 953 |
+
},
|
| 954 |
+
"gpt-oss-120B (Reasoning: medium)": {
|
| 955 |
+
"Overall": {
|
| 956 |
+
"Min": 43,
|
| 957 |
+
"Max": 18693,
|
| 958 |
+
"Med": 759.5,
|
| 959 |
+
"Med Resp": 370.5
|
| 960 |
+
},
|
| 961 |
+
"Content Generation": {
|
| 962 |
+
"Min": 126,
|
| 963 |
+
"Max": 6264,
|
| 964 |
+
"Med": 897.0,
|
| 965 |
+
"Med Resp": 613.5
|
| 966 |
+
},
|
| 967 |
+
"Editing": {
|
| 968 |
+
"Min": 61,
|
| 969 |
+
"Max": 4605,
|
| 970 |
+
"Med": 475.5,
|
| 971 |
+
"Med Resp": 248.5
|
| 972 |
+
},
|
| 973 |
+
"Data Analysis": {
|
| 974 |
+
"Min": 49,
|
| 975 |
+
"Max": 6975,
|
| 976 |
+
"Med": 596.0,
|
| 977 |
+
"Med Resp": 213.0
|
| 978 |
+
},
|
| 979 |
+
"Reasoning": {
|
| 980 |
+
"Min": 147,
|
| 981 |
+
"Max": 10387,
|
| 982 |
+
"Med": 1170.5,
|
| 983 |
+
"Med Resp": 635.0
|
| 984 |
+
},
|
| 985 |
+
"Hallucination": {
|
| 986 |
+
"Min": 88,
|
| 987 |
+
"Max": 5277,
|
| 988 |
+
"Med": 1317.0,
|
| 989 |
+
"Med Resp": 1106.5
|
| 990 |
+
},
|
| 991 |
+
"Safety": {
|
| 992 |
+
"Min": 43,
|
| 993 |
+
"Max": 3651,
|
| 994 |
+
"Med": 199.0,
|
| 995 |
+
"Med Resp": 12.0
|
| 996 |
+
},
|
| 997 |
+
"Repetition": {
|
| 998 |
+
"Min": 122,
|
| 999 |
+
"Max": 6986,
|
| 1000 |
+
"Med": 940.0,
|
| 1001 |
+
"Med Resp": 407.0
|
| 1002 |
+
},
|
| 1003 |
+
"Summarization": {
|
| 1004 |
+
"Min": 83,
|
| 1005 |
+
"Max": 15231,
|
| 1006 |
+
"Med": 378.0,
|
| 1007 |
+
"Med Resp": 246.0
|
| 1008 |
+
},
|
| 1009 |
+
"Translation": {
|
| 1010 |
+
"Min": 107,
|
| 1011 |
+
"Max": 3659,
|
| 1012 |
+
"Med": 737.0,
|
| 1013 |
+
"Med Resp": 299.5
|
| 1014 |
+
},
|
| 1015 |
+
"Multi-Turn": {
|
| 1016 |
+
"Min": 135,
|
| 1017 |
+
"Max": 18693,
|
| 1018 |
+
"Med": 2826.0,
|
| 1019 |
+
"Med Resp": 2150.0
|
| 1020 |
+
}
|
| 1021 |
+
},
|
| 1022 |
+
"Qwen3 32B (think)": {
|
| 1023 |
+
"Overall": {
|
| 1024 |
+
"Min": 164,
|
| 1025 |
+
"Max": 34272,
|
| 1026 |
+
"Med": 1113.0,
|
| 1027 |
+
"Med Resp": 390.0
|
| 1028 |
+
},
|
| 1029 |
+
"Content Generation": {
|
| 1030 |
+
"Min": 164,
|
| 1031 |
+
"Max": 32768,
|
| 1032 |
+
"Med": 1027.5,
|
| 1033 |
+
"Med Resp": 476.0
|
| 1034 |
+
},
|
| 1035 |
+
"Editing": {
|
| 1036 |
+
"Min": 285,
|
| 1037 |
+
"Max": 3646,
|
| 1038 |
+
"Med": 843.0,
|
| 1039 |
+
"Med Resp": 283.0
|
| 1040 |
+
},
|
| 1041 |
+
"Data Analysis": {
|
| 1042 |
+
"Min": 210,
|
| 1043 |
+
"Max": 18774,
|
| 1044 |
+
"Med": 968.0,
|
| 1045 |
+
"Med Resp": 278.0
|
| 1046 |
+
},
|
| 1047 |
+
"Reasoning": {
|
| 1048 |
+
"Min": 477,
|
| 1049 |
+
"Max": 18676,
|
| 1050 |
+
"Med": 1759.0,
|
| 1051 |
+
"Med Resp": 459.0
|
| 1052 |
+
},
|
| 1053 |
+
"Hallucination": {
|
| 1054 |
+
"Min": 170,
|
| 1055 |
+
"Max": 3776,
|
| 1056 |
+
"Med": 1617.0,
|
| 1057 |
+
"Med Resp": 646.0
|
| 1058 |
+
},
|
| 1059 |
+
"Safety": {
|
| 1060 |
+
"Min": 169,
|
| 1061 |
+
"Max": 4053,
|
| 1062 |
+
"Med": 940.0,
|
| 1063 |
+
"Med Resp": 429.0
|
| 1064 |
+
},
|
| 1065 |
+
"Repetition": {
|
| 1066 |
+
"Min": 608,
|
| 1067 |
+
"Max": 32768,
|
| 1068 |
+
"Med": 2316.5,
|
| 1069 |
+
"Med Resp": 537.5
|
| 1070 |
+
},
|
| 1071 |
+
"Summarization": {
|
| 1072 |
+
"Min": 192,
|
| 1073 |
+
"Max": 2255,
|
| 1074 |
+
"Med": 586.0,
|
| 1075 |
+
"Med Resp": 236.5
|
| 1076 |
+
},
|
| 1077 |
+
"Translation": {
|
| 1078 |
+
"Min": 374,
|
| 1079 |
+
"Max": 10683,
|
| 1080 |
+
"Med": 1113.5,
|
| 1081 |
+
"Med Resp": 307.0
|
| 1082 |
+
},
|
| 1083 |
+
"Multi-Turn": {
|
| 1084 |
+
"Min": 493,
|
| 1085 |
+
"Max": 34272,
|
| 1086 |
+
"Med": 3210.0,
|
| 1087 |
+
"Med Resp": 1481.0
|
| 1088 |
+
}
|
| 1089 |
+
},
|
| 1090 |
+
"Qwen3 235B A22B Instruct 2507": {
|
| 1091 |
+
"Overall": {
|
| 1092 |
+
"Min": 1,
|
| 1093 |
+
"Max": 65405,
|
| 1094 |
+
"Med": 433.0,
|
| 1095 |
+
"Med Resp": 433.0
|
| 1096 |
+
},
|
| 1097 |
+
"Content Generation": {
|
| 1098 |
+
"Min": 7,
|
| 1099 |
+
"Max": 4604,
|
| 1100 |
+
"Med": 492.5,
|
| 1101 |
+
"Med Resp": 492.5
|
| 1102 |
+
},
|
| 1103 |
+
"Editing": {
|
| 1104 |
+
"Min": 6,
|
| 1105 |
+
"Max": 2067,
|
| 1106 |
+
"Med": 248.5,
|
| 1107 |
+
"Med Resp": 248.5
|
| 1108 |
+
},
|
| 1109 |
+
"Data Analysis": {
|
| 1110 |
+
"Min": 1,
|
| 1111 |
+
"Max": 5119,
|
| 1112 |
+
"Med": 357.0,
|
| 1113 |
+
"Med Resp": 357.0
|
| 1114 |
+
},
|
| 1115 |
+
"Reasoning": {
|
| 1116 |
+
"Min": 1,
|
| 1117 |
+
"Max": 11933,
|
| 1118 |
+
"Med": 730.5,
|
| 1119 |
+
"Med Resp": 730.5
|
| 1120 |
+
},
|
| 1121 |
+
"Hallucination": {
|
| 1122 |
+
"Min": 38,
|
| 1123 |
+
"Max": 2395,
|
| 1124 |
+
"Med": 630.0,
|
| 1125 |
+
"Med Resp": 630.0
|
| 1126 |
+
},
|
| 1127 |
+
"Safety": {
|
| 1128 |
+
"Min": 12,
|
| 1129 |
+
"Max": 2497,
|
| 1130 |
+
"Med": 352.0,
|
| 1131 |
+
"Med Resp": 352.0
|
| 1132 |
+
},
|
| 1133 |
+
"Repetition": {
|
| 1134 |
+
"Min": 73,
|
| 1135 |
+
"Max": 65405,
|
| 1136 |
+
"Med": 468.5,
|
| 1137 |
+
"Med Resp": 468.5
|
| 1138 |
+
},
|
| 1139 |
+
"Summarization": {
|
| 1140 |
+
"Min": 24,
|
| 1141 |
+
"Max": 1899,
|
| 1142 |
+
"Med": 249.0,
|
| 1143 |
+
"Med Resp": 249.0
|
| 1144 |
+
},
|
| 1145 |
+
"Translation": {
|
| 1146 |
+
"Min": 10,
|
| 1147 |
+
"Max": 64183,
|
| 1148 |
+
"Med": 299.0,
|
| 1149 |
+
"Med Resp": 299.0
|
| 1150 |
+
},
|
| 1151 |
+
"Multi-Turn": {
|
| 1152 |
+
"Min": 3,
|
| 1153 |
+
"Max": 8009,
|
| 1154 |
+
"Med": 1728.5,
|
| 1155 |
+
"Med Resp": 1728.5
|
| 1156 |
+
}
|
| 1157 |
+
},
|
| 1158 |
+
"Claude 4 Opus (20250514) (think)": {
|
| 1159 |
+
"Overall": {
|
| 1160 |
+
"Min": -10,
|
| 1161 |
+
"Max": -2,
|
| 1162 |
+
"Med": -2.0,
|
| 1163 |
+
"Med Resp": -1.0
|
| 1164 |
+
},
|
| 1165 |
+
"Content Generation": {
|
| 1166 |
+
"Min": -2,
|
| 1167 |
+
"Max": -2,
|
| 1168 |
+
"Med": -2.0,
|
| 1169 |
+
"Med Resp": -1.0
|
| 1170 |
+
},
|
| 1171 |
+
"Editing": {
|
| 1172 |
+
"Min": -2,
|
| 1173 |
+
"Max": -2,
|
| 1174 |
+
"Med": -2.0,
|
| 1175 |
+
"Med Resp": -1.0
|
| 1176 |
+
},
|
| 1177 |
+
"Data Analysis": {
|
| 1178 |
+
"Min": -2,
|
| 1179 |
+
"Max": -2,
|
| 1180 |
+
"Med": -2.0,
|
| 1181 |
+
"Med Resp": -1.0
|
| 1182 |
+
},
|
| 1183 |
+
"Reasoning": {
|
| 1184 |
+
"Min": -2,
|
| 1185 |
+
"Max": -2,
|
| 1186 |
+
"Med": -2.0,
|
| 1187 |
+
"Med Resp": -1.0
|
| 1188 |
+
},
|
| 1189 |
+
"Hallucination": {
|
| 1190 |
+
"Min": -2,
|
| 1191 |
+
"Max": -2,
|
| 1192 |
+
"Med": -2.0,
|
| 1193 |
+
"Med Resp": -1.0
|
| 1194 |
+
},
|
| 1195 |
+
"Safety": {
|
| 1196 |
+
"Min": -2,
|
| 1197 |
+
"Max": -2,
|
| 1198 |
+
"Med": -2.0,
|
| 1199 |
+
"Med Resp": -1.0
|
| 1200 |
+
},
|
| 1201 |
+
"Repetition": {
|
| 1202 |
+
"Min": -2,
|
| 1203 |
+
"Max": -2,
|
| 1204 |
+
"Med": -2.0,
|
| 1205 |
+
"Med Resp": -1.0
|
| 1206 |
+
},
|
| 1207 |
+
"Summarization": {
|
| 1208 |
+
"Min": -2,
|
| 1209 |
+
"Max": -2,
|
| 1210 |
+
"Med": -2.0,
|
| 1211 |
+
"Med Resp": -1.0
|
| 1212 |
+
},
|
| 1213 |
+
"Translation": {
|
| 1214 |
+
"Min": -2,
|
| 1215 |
+
"Max": -2,
|
| 1216 |
+
"Med": -2.0,
|
| 1217 |
+
"Med Resp": -1.0
|
| 1218 |
+
},
|
| 1219 |
+
"Multi-Turn": {
|
| 1220 |
+
"Min": -10,
|
| 1221 |
+
"Max": -4,
|
| 1222 |
+
"Med": -6.0,
|
| 1223 |
+
"Med Resp": -3.0
|
| 1224 |
+
}
|
| 1225 |
+
},
|
| 1226 |
+
"Gemini 2.5 Pro": {
|
| 1227 |
+
"Overall": {
|
| 1228 |
+
"Min": -10,
|
| 1229 |
+
"Max": -2,
|
| 1230 |
+
"Med": -2.0,
|
| 1231 |
+
"Med Resp": -1.0
|
| 1232 |
+
},
|
| 1233 |
+
"Content Generation": {
|
| 1234 |
+
"Min": -2,
|
| 1235 |
+
"Max": -2,
|
| 1236 |
+
"Med": -2.0,
|
| 1237 |
+
"Med Resp": -1.0
|
| 1238 |
+
},
|
| 1239 |
+
"Editing": {
|
| 1240 |
+
"Min": -2,
|
| 1241 |
+
"Max": -2,
|
| 1242 |
+
"Med": -2.0,
|
| 1243 |
+
"Med Resp": -1.0
|
| 1244 |
+
},
|
| 1245 |
+
"Data Analysis": {
|
| 1246 |
+
"Min": -2,
|
| 1247 |
+
"Max": -2,
|
| 1248 |
+
"Med": -2.0,
|
| 1249 |
+
"Med Resp": -1.0
|
| 1250 |
+
},
|
| 1251 |
+
"Reasoning": {
|
| 1252 |
+
"Min": -2,
|
| 1253 |
+
"Max": -2,
|
| 1254 |
+
"Med": -2.0,
|
| 1255 |
+
"Med Resp": -1.0
|
| 1256 |
+
},
|
| 1257 |
+
"Hallucination": {
|
| 1258 |
+
"Min": -2,
|
| 1259 |
+
"Max": -2,
|
| 1260 |
+
"Med": -2.0,
|
| 1261 |
+
"Med Resp": -1.0
|
| 1262 |
+
},
|
| 1263 |
+
"Safety": {
|
| 1264 |
+
"Min": -2,
|
| 1265 |
+
"Max": -2,
|
| 1266 |
+
"Med": -2.0,
|
| 1267 |
+
"Med Resp": -1.0
|
| 1268 |
+
},
|
| 1269 |
+
"Repetition": {
|
| 1270 |
+
"Min": -2,
|
| 1271 |
+
"Max": -2,
|
| 1272 |
+
"Med": -2.0,
|
| 1273 |
+
"Med Resp": -1.0
|
| 1274 |
+
},
|
| 1275 |
+
"Summarization": {
|
| 1276 |
+
"Min": -2,
|
| 1277 |
+
"Max": -2,
|
| 1278 |
+
"Med": -2.0,
|
| 1279 |
+
"Med Resp": -1.0
|
| 1280 |
+
},
|
| 1281 |
+
"Translation": {
|
| 1282 |
+
"Min": -2,
|
| 1283 |
+
"Max": -2,
|
| 1284 |
+
"Med": -2.0,
|
| 1285 |
+
"Med Resp": -1.0
|
| 1286 |
+
},
|
| 1287 |
+
"Multi-Turn": {
|
| 1288 |
+
"Min": -10,
|
| 1289 |
+
"Max": -4,
|
| 1290 |
+
"Med": -6.0,
|
| 1291 |
+
"Med Resp": -3.0
|
| 1292 |
+
}
|
| 1293 |
+
},
|
| 1294 |
+
"GPT-5 mini (Reasoning: medium)": {
|
| 1295 |
+
"Overall": {
|
| 1296 |
+
"Min": -10,
|
| 1297 |
+
"Max": -2,
|
| 1298 |
+
"Med": -2.0,
|
| 1299 |
+
"Med Resp": -1.0
|
| 1300 |
+
},
|
| 1301 |
+
"Content Generation": {
|
| 1302 |
+
"Min": -2,
|
| 1303 |
+
"Max": -2,
|
| 1304 |
+
"Med": -2.0,
|
| 1305 |
+
"Med Resp": -1.0
|
| 1306 |
+
},
|
| 1307 |
+
"Editing": {
|
| 1308 |
+
"Min": -2,
|
| 1309 |
+
"Max": -2,
|
| 1310 |
+
"Med": -2.0,
|
| 1311 |
+
"Med Resp": -1.0
|
| 1312 |
+
},
|
| 1313 |
+
"Data Analysis": {
|
| 1314 |
+
"Min": -2,
|
| 1315 |
+
"Max": -2,
|
| 1316 |
+
"Med": -2.0,
|
| 1317 |
+
"Med Resp": -1.0
|
| 1318 |
+
},
|
| 1319 |
+
"Reasoning": {
|
| 1320 |
+
"Min": -2,
|
| 1321 |
+
"Max": -2,
|
| 1322 |
+
"Med": -2.0,
|
| 1323 |
+
"Med Resp": -1.0
|
| 1324 |
+
},
|
| 1325 |
+
"Hallucination": {
|
| 1326 |
+
"Min": -2,
|
| 1327 |
+
"Max": -2,
|
| 1328 |
+
"Med": -2.0,
|
| 1329 |
+
"Med Resp": -1.0
|
| 1330 |
+
},
|
| 1331 |
+
"Safety": {
|
| 1332 |
+
"Min": -2,
|
| 1333 |
+
"Max": -2,
|
| 1334 |
+
"Med": -2.0,
|
| 1335 |
+
"Med Resp": -1.0
|
| 1336 |
+
},
|
| 1337 |
+
"Repetition": {
|
| 1338 |
+
"Min": -2,
|
| 1339 |
+
"Max": -2,
|
| 1340 |
+
"Med": -2.0,
|
| 1341 |
+
"Med Resp": -1.0
|
| 1342 |
+
},
|
| 1343 |
+
"Summarization": {
|
| 1344 |
+
"Min": -2,
|
| 1345 |
+
"Max": -2,
|
| 1346 |
+
"Med": -2.0,
|
| 1347 |
+
"Med Resp": -1.0
|
| 1348 |
+
},
|
| 1349 |
+
"Translation": {
|
| 1350 |
+
"Min": -2,
|
| 1351 |
+
"Max": -2,
|
| 1352 |
+
"Med": -2.0,
|
| 1353 |
+
"Med Resp": -1.0
|
| 1354 |
+
},
|
| 1355 |
+
"Multi-Turn": {
|
| 1356 |
+
"Min": -10,
|
| 1357 |
+
"Max": -4,
|
| 1358 |
+
"Med": -6.0,
|
| 1359 |
+
"Med Resp": -3.0
|
| 1360 |
+
}
|
| 1361 |
+
},
|
| 1362 |
+
"GPT-5 nano (Reasoning: medium)": {
|
| 1363 |
+
"Overall": {
|
| 1364 |
+
"Min": -10,
|
| 1365 |
+
"Max": -2,
|
| 1366 |
+
"Med": -2.0,
|
| 1367 |
+
"Med Resp": -1.0
|
| 1368 |
+
},
|
| 1369 |
+
"Content Generation": {
|
| 1370 |
+
"Min": -2,
|
| 1371 |
+
"Max": -2,
|
| 1372 |
+
"Med": -2.0,
|
| 1373 |
+
"Med Resp": -1.0
|
| 1374 |
+
},
|
| 1375 |
+
"Editing": {
|
| 1376 |
+
"Min": -2,
|
| 1377 |
+
"Max": -2,
|
| 1378 |
+
"Med": -2.0,
|
| 1379 |
+
"Med Resp": -1.0
|
| 1380 |
+
},
|
| 1381 |
+
"Data Analysis": {
|
| 1382 |
+
"Min": -2,
|
| 1383 |
+
"Max": -2,
|
| 1384 |
+
"Med": -2.0,
|
| 1385 |
+
"Med Resp": -1.0
|
| 1386 |
+
},
|
| 1387 |
+
"Reasoning": {
|
| 1388 |
+
"Min": -2,
|
| 1389 |
+
"Max": -2,
|
| 1390 |
+
"Med": -2.0,
|
| 1391 |
+
"Med Resp": -1.0
|
| 1392 |
+
},
|
| 1393 |
+
"Hallucination": {
|
| 1394 |
+
"Min": -2,
|
| 1395 |
+
"Max": -2,
|
| 1396 |
+
"Med": -2.0,
|
| 1397 |
+
"Med Resp": -1.0
|
| 1398 |
+
},
|
| 1399 |
+
"Safety": {
|
| 1400 |
+
"Min": -2,
|
| 1401 |
+
"Max": -2,
|
| 1402 |
+
"Med": -2.0,
|
| 1403 |
+
"Med Resp": -1.0
|
| 1404 |
+
},
|
| 1405 |
+
"Repetition": {
|
| 1406 |
+
"Min": -2,
|
| 1407 |
+
"Max": -2,
|
| 1408 |
+
"Med": -2.0,
|
| 1409 |
+
"Med Resp": -1.0
|
| 1410 |
+
},
|
| 1411 |
+
"Summarization": {
|
| 1412 |
+
"Min": -2,
|
| 1413 |
+
"Max": -2,
|
| 1414 |
+
"Med": -2.0,
|
| 1415 |
+
"Med Resp": -1.0
|
| 1416 |
+
},
|
| 1417 |
+
"Translation": {
|
| 1418 |
+
"Min": -2,
|
| 1419 |
+
"Max": -2,
|
| 1420 |
+
"Med": -2.0,
|
| 1421 |
+
"Med Resp": -1.0
|
| 1422 |
+
},
|
| 1423 |
+
"Multi-Turn": {
|
| 1424 |
+
"Min": -10,
|
| 1425 |
+
"Max": -4,
|
| 1426 |
+
"Med": -6.0,
|
| 1427 |
+
"Med Resp": -3.0
|
| 1428 |
+
}
|
| 1429 |
+
},
|
| 1430 |
+
"gpt-oss-20B (Reasoning: medium)": {
|
| 1431 |
+
"Overall": {
|
| 1432 |
+
"Min": 32,
|
| 1433 |
+
"Max": 18763,
|
| 1434 |
+
"Med": 953.5,
|
| 1435 |
+
"Med Resp": 326.0
|
| 1436 |
+
},
|
| 1437 |
+
"Content Generation": {
|
| 1438 |
+
"Min": 126,
|
| 1439 |
+
"Max": 6343,
|
| 1440 |
+
"Med": 983.5,
|
| 1441 |
+
"Med Resp": 486.5
|
| 1442 |
+
},
|
| 1443 |
+
"Editing": {
|
| 1444 |
+
"Min": 107,
|
| 1445 |
+
"Max": 7213,
|
| 1446 |
+
"Med": 667.0,
|
| 1447 |
+
"Med Resp": 195.0
|
| 1448 |
+
},
|
| 1449 |
+
"Data Analysis": {
|
| 1450 |
+
"Min": 94,
|
| 1451 |
+
"Max": 14599,
|
| 1452 |
+
"Med": 750.0,
|
| 1453 |
+
"Med Resp": 192.0
|
| 1454 |
+
},
|
| 1455 |
+
"Reasoning": {
|
| 1456 |
+
"Min": 109,
|
| 1457 |
+
"Max": 18763,
|
| 1458 |
+
"Med": 1290.5,
|
| 1459 |
+
"Med Resp": 475.5
|
| 1460 |
+
},
|
| 1461 |
+
"Hallucination": {
|
| 1462 |
+
"Min": 132,
|
| 1463 |
+
"Max": 7937,
|
| 1464 |
+
"Med": 1493.5,
|
| 1465 |
+
"Med Resp": 620.5
|
| 1466 |
+
},
|
| 1467 |
+
"Safety": {
|
| 1468 |
+
"Min": 32,
|
| 1469 |
+
"Max": 6678,
|
| 1470 |
+
"Med": 268.0,
|
| 1471 |
+
"Med Resp": 12.0
|
| 1472 |
+
},
|
| 1473 |
+
"Repetition": {
|
| 1474 |
+
"Min": 258,
|
| 1475 |
+
"Max": 17217,
|
| 1476 |
+
"Med": 1847.0,
|
| 1477 |
+
"Med Resp": 332.5
|
| 1478 |
+
},
|
| 1479 |
+
"Summarization": {
|
| 1480 |
+
"Min": 99,
|
| 1481 |
+
"Max": 4060,
|
| 1482 |
+
"Med": 438.5,
|
| 1483 |
+
"Med Resp": 219.0
|
| 1484 |
+
},
|
| 1485 |
+
"Translation": {
|
| 1486 |
+
"Min": 133,
|
| 1487 |
+
"Max": 10446,
|
| 1488 |
+
"Med": 1028.5,
|
| 1489 |
+
"Med Resp": 290.0
|
| 1490 |
+
},
|
| 1491 |
+
"Multi-Turn": {
|
| 1492 |
+
"Min": 102,
|
| 1493 |
+
"Max": 14863,
|
| 1494 |
+
"Med": 2483.0,
|
| 1495 |
+
"Med Resp": 1514.0
|
| 1496 |
+
}
|
| 1497 |
+
},
|
| 1498 |
+
"o3-pro (Reasoning: medium)": {
|
| 1499 |
+
"Overall": {
|
| 1500 |
+
"Min": -10,
|
| 1501 |
+
"Max": -2,
|
| 1502 |
+
"Med": -2.0,
|
| 1503 |
+
"Med Resp": -1.0
|
| 1504 |
+
},
|
| 1505 |
+
"Content Generation": {
|
| 1506 |
+
"Min": -2,
|
| 1507 |
+
"Max": -2,
|
| 1508 |
+
"Med": -2.0,
|
| 1509 |
+
"Med Resp": -1.0
|
| 1510 |
+
},
|
| 1511 |
+
"Editing": {
|
| 1512 |
+
"Min": -2,
|
| 1513 |
+
"Max": -2,
|
| 1514 |
+
"Med": -2.0,
|
| 1515 |
+
"Med Resp": -1.0
|
| 1516 |
+
},
|
| 1517 |
+
"Data Analysis": {
|
| 1518 |
+
"Min": -2,
|
| 1519 |
+
"Max": -2,
|
| 1520 |
+
"Med": -2.0,
|
| 1521 |
+
"Med Resp": -1.0
|
| 1522 |
+
},
|
| 1523 |
+
"Reasoning": {
|
| 1524 |
+
"Min": -2,
|
| 1525 |
+
"Max": -2,
|
| 1526 |
+
"Med": -2.0,
|
| 1527 |
+
"Med Resp": -1.0
|
| 1528 |
+
},
|
| 1529 |
+
"Hallucination": {
|
| 1530 |
+
"Min": -2,
|
| 1531 |
+
"Max": -2,
|
| 1532 |
+
"Med": -2.0,
|
| 1533 |
+
"Med Resp": -1.0
|
| 1534 |
+
},
|
| 1535 |
+
"Safety": {
|
| 1536 |
+
"Min": -2,
|
| 1537 |
+
"Max": -2,
|
| 1538 |
+
"Med": -2.0,
|
| 1539 |
+
"Med Resp": -1.0
|
| 1540 |
+
},
|
| 1541 |
+
"Repetition": {
|
| 1542 |
+
"Min": -2,
|
| 1543 |
+
"Max": -2,
|
| 1544 |
+
"Med": -2.0,
|
| 1545 |
+
"Med Resp": -1.0
|
| 1546 |
+
},
|
| 1547 |
+
"Summarization": {
|
| 1548 |
+
"Min": -2,
|
| 1549 |
+
"Max": -2,
|
| 1550 |
+
"Med": -2.0,
|
| 1551 |
+
"Med Resp": -1.0
|
| 1552 |
+
},
|
| 1553 |
+
"Translation": {
|
| 1554 |
+
"Min": -2,
|
| 1555 |
+
"Max": -2,
|
| 1556 |
+
"Med": -2.0,
|
| 1557 |
+
"Med Resp": -1.0
|
| 1558 |
+
},
|
| 1559 |
+
"Multi-Turn": {
|
| 1560 |
+
"Min": -10,
|
| 1561 |
+
"Max": -4,
|
| 1562 |
+
"Med": -6.0,
|
| 1563 |
+
"Med Resp": -3.0
|
| 1564 |
+
}
|
| 1565 |
+
},
|
| 1566 |
+
"Grok-4": {
|
| 1567 |
+
"Overall": {
|
| 1568 |
+
"Min": -10,
|
| 1569 |
+
"Max": -2,
|
| 1570 |
+
"Med": -2.0,
|
| 1571 |
+
"Med Resp": -1.0
|
| 1572 |
+
},
|
| 1573 |
+
"Content Generation": {
|
| 1574 |
+
"Min": -2,
|
| 1575 |
+
"Max": -2,
|
| 1576 |
+
"Med": -2.0,
|
| 1577 |
+
"Med Resp": -1.0
|
| 1578 |
+
},
|
| 1579 |
+
"Editing": {
|
| 1580 |
+
"Min": -2,
|
| 1581 |
+
"Max": -2,
|
| 1582 |
+
"Med": -2.0,
|
| 1583 |
+
"Med Resp": -1.0
|
| 1584 |
+
},
|
| 1585 |
+
"Data Analysis": {
|
| 1586 |
+
"Min": -2,
|
| 1587 |
+
"Max": -2,
|
| 1588 |
+
"Med": -2.0,
|
| 1589 |
+
"Med Resp": -1.0
|
| 1590 |
+
},
|
| 1591 |
+
"Reasoning": {
|
| 1592 |
+
"Min": -2,
|
| 1593 |
+
"Max": -2,
|
| 1594 |
+
"Med": -2.0,
|
| 1595 |
+
"Med Resp": -1.0
|
| 1596 |
+
},
|
| 1597 |
+
"Hallucination": {
|
| 1598 |
+
"Min": -2,
|
| 1599 |
+
"Max": -2,
|
| 1600 |
+
"Med": -2.0,
|
| 1601 |
+
"Med Resp": -1.0
|
| 1602 |
+
},
|
| 1603 |
+
"Safety": {
|
| 1604 |
+
"Min": -2,
|
| 1605 |
+
"Max": -2,
|
| 1606 |
+
"Med": -2.0,
|
| 1607 |
+
"Med Resp": -1.0
|
| 1608 |
+
},
|
| 1609 |
+
"Repetition": {
|
| 1610 |
+
"Min": -2,
|
| 1611 |
+
"Max": -2,
|
| 1612 |
+
"Med": -2.0,
|
| 1613 |
+
"Med Resp": -1.0
|
| 1614 |
+
},
|
| 1615 |
+
"Summarization": {
|
| 1616 |
+
"Min": -2,
|
| 1617 |
+
"Max": -2,
|
| 1618 |
+
"Med": -2.0,
|
| 1619 |
+
"Med Resp": -1.0
|
| 1620 |
+
},
|
| 1621 |
+
"Translation": {
|
| 1622 |
+
"Min": -2,
|
| 1623 |
+
"Max": -2,
|
| 1624 |
+
"Med": -2.0,
|
| 1625 |
+
"Med Resp": -1.0
|
| 1626 |
+
},
|
| 1627 |
+
"Multi-Turn": {
|
| 1628 |
+
"Min": -10,
|
| 1629 |
+
"Max": -4,
|
| 1630 |
+
"Med": -6.0,
|
| 1631 |
+
"Med Resp": -3.0
|
| 1632 |
+
}
|
| 1633 |
+
},
|
| 1634 |
+
"Mi:dm 2.0 Base Instruct": {
|
| 1635 |
+
"Overall": {
|
| 1636 |
+
"Min": 1,
|
| 1637 |
+
"Max": 32764,
|
| 1638 |
+
"Med": 316.0,
|
| 1639 |
+
"Med Resp": 316.0
|
| 1640 |
+
},
|
| 1641 |
+
"Content Generation": {
|
| 1642 |
+
"Min": 7,
|
| 1643 |
+
"Max": 3515,
|
| 1644 |
+
"Med": 400.0,
|
| 1645 |
+
"Med Resp": 400.0
|
| 1646 |
+
},
|
| 1647 |
+
"Editing": {
|
| 1648 |
+
"Min": 10,
|
| 1649 |
+
"Max": 1998,
|
| 1650 |
+
"Med": 191.0,
|
| 1651 |
+
"Med Resp": 191.0
|
| 1652 |
+
},
|
| 1653 |
+
"Data Analysis": {
|
| 1654 |
+
"Min": 1,
|
| 1655 |
+
"Max": 3302,
|
| 1656 |
+
"Med": 260.0,
|
| 1657 |
+
"Med Resp": 260.0
|
| 1658 |
+
},
|
| 1659 |
+
"Reasoning": {
|
| 1660 |
+
"Min": 1,
|
| 1661 |
+
"Max": 32071,
|
| 1662 |
+
"Med": 398.0,
|
| 1663 |
+
"Med Resp": 398.0
|
| 1664 |
+
},
|
| 1665 |
+
"Hallucination": {
|
| 1666 |
+
"Min": 13,
|
| 1667 |
+
"Max": 3061,
|
| 1668 |
+
"Med": 191.5,
|
| 1669 |
+
"Med Resp": 191.5
|
| 1670 |
+
},
|
| 1671 |
+
"Safety": {
|
| 1672 |
+
"Min": 10,
|
| 1673 |
+
"Max": 1110,
|
| 1674 |
+
"Med": 159.0,
|
| 1675 |
+
"Med Resp": 159.0
|
| 1676 |
+
},
|
| 1677 |
+
"Repetition": {
|
| 1678 |
+
"Min": 50,
|
| 1679 |
+
"Max": 2734,
|
| 1680 |
+
"Med": 316.5,
|
| 1681 |
+
"Med Resp": 316.5
|
| 1682 |
+
},
|
| 1683 |
+
"Summarization": {
|
| 1684 |
+
"Min": 35,
|
| 1685 |
+
"Max": 2967,
|
| 1686 |
+
"Med": 261.0,
|
| 1687 |
+
"Med Resp": 261.0
|
| 1688 |
+
},
|
| 1689 |
+
"Translation": {
|
| 1690 |
+
"Min": 7,
|
| 1691 |
+
"Max": 4703,
|
| 1692 |
+
"Med": 289.5,
|
| 1693 |
+
"Med Resp": 289.5
|
| 1694 |
+
},
|
| 1695 |
+
"Multi-Turn": {
|
| 1696 |
+
"Min": 3,
|
| 1697 |
+
"Max": 32764,
|
| 1698 |
+
"Med": 957.0,
|
| 1699 |
+
"Med Resp": 957.0
|
| 1700 |
+
}
|
| 1701 |
+
},
|
| 1702 |
+
"Qwen3 235B A22B Thinking 2507": {
|
| 1703 |
+
"Overall": {
|
| 1704 |
+
"Min": 8,
|
| 1705 |
+
"Max": 19533,
|
| 1706 |
+
"Med": 2404.5,
|
| 1707 |
+
"Med Resp": 423.0
|
| 1708 |
+
},
|
| 1709 |
+
"Content Generation": {
|
| 1710 |
+
"Min": 402,
|
| 1711 |
+
"Max": 13776,
|
| 1712 |
+
"Med": 2337.0,
|
| 1713 |
+
"Med Resp": 577.5
|
| 1714 |
+
},
|
| 1715 |
+
"Editing": {
|
| 1716 |
+
"Min": 482,
|
| 1717 |
+
"Max": 13235,
|
| 1718 |
+
"Med": 1894.5,
|
| 1719 |
+
"Med Resp": 274.5
|
| 1720 |
+
},
|
| 1721 |
+
"Data Analysis": {
|
| 1722 |
+
"Min": 8,
|
| 1723 |
+
"Max": 13217,
|
| 1724 |
+
"Med": 1427.0,
|
| 1725 |
+
"Med Resp": 303.0
|
| 1726 |
+
},
|
| 1727 |
+
"Reasoning": {
|
| 1728 |
+
"Min": 8,
|
| 1729 |
+
"Max": 19533,
|
| 1730 |
+
"Med": 2340.0,
|
| 1731 |
+
"Med Resp": 568.5
|
| 1732 |
+
},
|
| 1733 |
+
"Hallucination": {
|
| 1734 |
+
"Min": 305,
|
| 1735 |
+
"Max": 6670,
|
| 1736 |
+
"Med": 2005.0,
|
| 1737 |
+
"Med Resp": 848.0
|
| 1738 |
+
},
|
| 1739 |
+
"Safety": {
|
| 1740 |
+
"Min": 304,
|
| 1741 |
+
"Max": 8302,
|
| 1742 |
+
"Med": 1708.0,
|
| 1743 |
+
"Med Resp": 619.0
|
| 1744 |
+
},
|
| 1745 |
+
"Repetition": {
|
| 1746 |
+
"Min": 8,
|
| 1747 |
+
"Max": 11012,
|
| 1748 |
+
"Med": 3533.0,
|
| 1749 |
+
"Med Resp": 514.5
|
| 1750 |
+
},
|
| 1751 |
+
"Summarization": {
|
| 1752 |
+
"Min": 373,
|
| 1753 |
+
"Max": 11701,
|
| 1754 |
+
"Med": 1468.5,
|
| 1755 |
+
"Med Resp": 233.5
|
| 1756 |
+
},
|
| 1757 |
+
"Translation": {
|
| 1758 |
+
"Min": 381,
|
| 1759 |
+
"Max": 12124,
|
| 1760 |
+
"Med": 3332.5,
|
| 1761 |
+
"Med Resp": 284.0
|
| 1762 |
+
},
|
| 1763 |
+
"Multi-Turn": {
|
| 1764 |
+
"Min": 721,
|
| 1765 |
+
"Max": 19299,
|
| 1766 |
+
"Med": 5745.0,
|
| 1767 |
+
"Med Resp": 1736.5
|
| 1768 |
+
}
|
| 1769 |
+
},
|
| 1770 |
+
"HyperCLOVAX SEED Think 14B (think)": {
|
| 1771 |
+
"Overall": {
|
| 1772 |
+
"Min": 223,
|
| 1773 |
+
"Max": 131436,
|
| 1774 |
+
"Med": 1444.0,
|
| 1775 |
+
"Med Resp": 382.5
|
| 1776 |
+
},
|
| 1777 |
+
"Content Generation": {
|
| 1778 |
+
"Min": 279,
|
| 1779 |
+
"Max": 72029,
|
| 1780 |
+
"Med": 1222.0,
|
| 1781 |
+
"Med Resp": 476.5
|
| 1782 |
+
},
|
| 1783 |
+
"Editing": {
|
| 1784 |
+
"Min": 304,
|
| 1785 |
+
"Max": 65536,
|
| 1786 |
+
"Med": 1228.5,
|
| 1787 |
+
"Med Resp": 351.0
|
| 1788 |
+
},
|
| 1789 |
+
"Data Analysis": {
|
| 1790 |
+
"Min": 240,
|
| 1791 |
+
"Max": 65536,
|
| 1792 |
+
"Med": 1352.0,
|
| 1793 |
+
"Med Resp": 234.0
|
| 1794 |
+
},
|
| 1795 |
+
"Reasoning": {
|
| 1796 |
+
"Min": 414,
|
| 1797 |
+
"Max": 65536,
|
| 1798 |
+
"Med": 3010.0,
|
| 1799 |
+
"Med Resp": 315.0
|
| 1800 |
+
},
|
| 1801 |
+
"Hallucination": {
|
| 1802 |
+
"Min": 263,
|
| 1803 |
+
"Max": 65536,
|
| 1804 |
+
"Med": 1310.5,
|
| 1805 |
+
"Med Resp": 444.0
|
| 1806 |
+
},
|
| 1807 |
+
"Safety": {
|
| 1808 |
+
"Min": 241,
|
| 1809 |
+
"Max": 65536,
|
| 1810 |
+
"Med": 1100.0,
|
| 1811 |
+
"Med Resp": 412.0
|
| 1812 |
+
},
|
| 1813 |
+
"Repetition": {
|
| 1814 |
+
"Min": 389,
|
| 1815 |
+
"Max": 65536,
|
| 1816 |
+
"Med": 2233.0,
|
| 1817 |
+
"Med Resp": 355.0
|
| 1818 |
+
},
|
| 1819 |
+
"Summarization": {
|
| 1820 |
+
"Min": 223,
|
| 1821 |
+
"Max": 5987,
|
| 1822 |
+
"Med": 833.5,
|
| 1823 |
+
"Med Resp": 285.0
|
| 1824 |
+
},
|
| 1825 |
+
"Translation": {
|
| 1826 |
+
"Min": 457,
|
| 1827 |
+
"Max": 65536,
|
| 1828 |
+
"Med": 1611.0,
|
| 1829 |
+
"Med Resp": 352.0
|
| 1830 |
+
},
|
| 1831 |
+
"Multi-Turn": {
|
| 1832 |
+
"Min": 648,
|
| 1833 |
+
"Max": 131436,
|
| 1834 |
+
"Med": 3234.5,
|
| 1835 |
+
"Med Resp": 1324.5
|
| 1836 |
+
}
|
| 1837 |
+
},
|
| 1838 |
+
"o3": {
|
| 1839 |
+
"Overall": {
|
| 1840 |
+
"Min": -10,
|
| 1841 |
+
"Max": -2,
|
| 1842 |
+
"Med": -2.0,
|
| 1843 |
+
"Med Resp": -1.0
|
| 1844 |
+
},
|
| 1845 |
+
"Content Generation": {
|
| 1846 |
+
"Min": -2,
|
| 1847 |
+
"Max": -2,
|
| 1848 |
+
"Med": -2.0,
|
| 1849 |
+
"Med Resp": -1.0
|
| 1850 |
+
},
|
| 1851 |
+
"Editing": {
|
| 1852 |
+
"Min": -2,
|
| 1853 |
+
"Max": -2,
|
| 1854 |
+
"Med": -2.0,
|
| 1855 |
+
"Med Resp": -1.0
|
| 1856 |
+
},
|
| 1857 |
+
"Data Analysis": {
|
| 1858 |
+
"Min": -2,
|
| 1859 |
+
"Max": -2,
|
| 1860 |
+
"Med": -2.0,
|
| 1861 |
+
"Med Resp": -1.0
|
| 1862 |
+
},
|
| 1863 |
+
"Reasoning": {
|
| 1864 |
+
"Min": -2,
|
| 1865 |
+
"Max": -2,
|
| 1866 |
+
"Med": -2.0,
|
| 1867 |
+
"Med Resp": -1.0
|
| 1868 |
+
},
|
| 1869 |
+
"Hallucination": {
|
| 1870 |
+
"Min": -2,
|
| 1871 |
+
"Max": -2,
|
| 1872 |
+
"Med": -2.0,
|
| 1873 |
+
"Med Resp": -1.0
|
| 1874 |
+
},
|
| 1875 |
+
"Safety": {
|
| 1876 |
+
"Min": -2,
|
| 1877 |
+
"Max": -2,
|
| 1878 |
+
"Med": -2.0,
|
| 1879 |
+
"Med Resp": -1.0
|
| 1880 |
+
},
|
| 1881 |
+
"Repetition": {
|
| 1882 |
+
"Min": -2,
|
| 1883 |
+
"Max": -2,
|
| 1884 |
+
"Med": -2.0,
|
| 1885 |
+
"Med Resp": -1.0
|
| 1886 |
+
},
|
| 1887 |
+
"Summarization": {
|
| 1888 |
+
"Min": -2,
|
| 1889 |
+
"Max": -2,
|
| 1890 |
+
"Med": -2.0,
|
| 1891 |
+
"Med Resp": -1.0
|
| 1892 |
+
},
|
| 1893 |
+
"Translation": {
|
| 1894 |
+
"Min": -2,
|
| 1895 |
+
"Max": -2,
|
| 1896 |
+
"Med": -2.0,
|
| 1897 |
+
"Med Resp": -1.0
|
| 1898 |
+
},
|
| 1899 |
+
"Multi-Turn": {
|
| 1900 |
+
"Min": -10,
|
| 1901 |
+
"Max": -4,
|
| 1902 |
+
"Med": -6.0,
|
| 1903 |
+
"Med Resp": -3.0
|
| 1904 |
+
}
|
| 1905 |
+
}
|
| 1906 |
+
}
|
src/data/stats.csv
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Hallucination" "Safety" "Repetition" "Summarization" "Translation" "Multi-Turn"
|
| 2 |
+
"GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "71.0" "74.38" "76.49" "79.75" "64.94" "56.2" "82.86" "80.16" "69.38" "54.36"
|
| 3 |
+
"o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "72.5" "70.31" "75.7" "83.88" "64.37" "33.88" "74.29" "65.48" "64.33" "48.32"
|
| 4 |
+
"Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "60.75" "59.69" "73.31" "69.83" "78.74" "53.72" "55.71" "65.48" "65.45" "48.99"
|
| 5 |
+
"Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "61.25" "60.0" "78.49" "72.73" "77.01" "56.2" "57.14" "61.9" "62.64" "46.98"
|
| 6 |
+
"GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "68.0" "62.5" "74.9" "76.86" "55.17" "47.93" "44.29" "74.6" "56.18" "45.3"
|
| 7 |
+
"Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "58.0" "58.44" "76.49" "67.77" "79.31" "57.02" "44.29" "65.08" "62.92" "44.97"
|
| 8 |
+
"o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "68.75" "60.0" "73.31" "79.34" "54.02" "34.71" "64.29" "60.71" "55.06" "46.98"
|
| 9 |
+
"Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "54.0" "60.94" "78.88" "73.14" "63.22" "17.36" "52.86" "67.86" "53.93" "52.68"
|
| 10 |
+
"Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
|
| 11 |
+
top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "61.0" "66.25" "72.51" "63.22" "66.09" "16.53" "58.57" "66.27" "54.21" "44.3"
|
| 12 |
+
"Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "57.25" "62.19" "70.52" "72.31" "56.9" "28.93" "47.14" "68.65" "55.06" "46.98"
|
| 13 |
+
"o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "67.25" "61.25" "71.71" "75.62" "45.4" "39.67" "44.29" "59.92" "47.19" "41.95"
|
| 14 |
+
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
+
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "57.5" "53.12" "73.31" "75.21" "55.17" "25.62" "35.71" "55.56" "56.18" "40.27"
|
| 16 |
+
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "63.5" "47.19" "68.92" "75.21" "55.17" "52.07" "34.29" "63.49" "40.73" "42.95"
|
| 17 |
+
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 18 |
+
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "60.75" "53.75" "68.92" "74.38" "47.13" "33.06" "41.43" "60.32" "46.07" "35.91"
|
| 19 |
+
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 20 |
+
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "58.0" "49.69" "68.13" "73.97" "55.17" "45.45" "30.0" "55.95" "38.48" "41.61"
|
| 21 |
+
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 22 |
+
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "52.0" "50.0" "67.33" "69.83" "50.0" "33.88" "35.71" "59.52" "41.85" "40.27"
|
| 23 |
+
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 24 |
+
temperature: 1.0
|
| 25 |
+
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "58.5" "48.44" "68.92" "69.83" "41.38" "39.67" "25.71" "50.79" "35.67" "32.21"
|
| 26 |
+
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 27 |
+
temperature: 0.6
|
| 28 |
+
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "49.75" "50.0" "65.34" "59.09" "48.85" "38.02" "32.86" "57.94" "36.52" "38.93"
|
| 29 |
+
"Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "52.0" "46.25" "59.76" "66.94" "41.95" "34.71" "25.71" "53.17" "34.55" "33.22"
|
| 30 |
+
"DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
|
| 31 |
+
temperature: 1.3
|
| 32 |
+
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "46.25" "45.0" "58.96" "60.33" "41.95" "21.49" "30.0" "55.95" "38.48" "33.22"
|
| 33 |
+
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 34 |
+
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "52.25" "41.56" "68.92" "66.53" "35.06" "19.83" "25.71" "46.43" "30.9" "32.89"
|
| 35 |
+
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "56.0" "43.75" "43.43" "42.56" "40.23" "15.7" "24.29" "53.97" "33.43" "32.21"
|
| 36 |
+
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 37 |
+
temperature: 1.0
|
| 38 |
+
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "52.0" "40.0" "61.35" "65.7" "43.1" "41.32" "22.86" "36.51" "20.51" "22.82"
|
| 39 |
+
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 40 |
+
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "34.25" "29.38" "56.97" "57.44" "24.71" "27.27" "17.14" "38.49" "18.54" "25.5"
|
| 41 |
+
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 42 |
+
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "35.0" "26.56" "53.78" "58.68" "27.59" "26.45" "17.14" "29.76" "17.13" "20.47"
|
| 43 |
+
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 44 |
+
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "28.0" "24.69" "16.73" "19.42" "17.24" "28.1" "11.43" "31.35" "13.76" "11.74"
|
| 45 |
+
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 46 |
+
top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "21.75" "17.5" "16.73" "18.6" "27.59" "59.5" "14.29" "25.4" "12.64" "11.41"
|
| 47 |
+
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 48 |
+
top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "14.25" "10.62" "13.55" "11.16" "22.41" "22.31" "4.29" "11.9" "6.74" "5.37"
|
src/data/stats_lang.csv
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"Model Name" "Link" "Comment" "Group" "Med. Len." "Med. Resp. Len." "Parameter Size (B)" "Type" "Model Type" "Think" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
|
| 2 |
+
"GPT-5 (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "70.73" "64.72" "65.83" "71.69" "67.68" "72.78" "71.27" "73.74" "75.68" "72.83" "77.05" "70.79" "75.61"
|
| 3 |
+
"o3-pro (Reasoning: medium)" "https://platform.openai.com/docs/models/o3-pro" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "66.47" "63.61" "63.61" "69.28" "65.24" "63.89" "64.09" "68.16" "69.19" "70.11" "72.13" "62.36" "71.95"
|
| 4 |
+
"Claude 4 Opus (20250514) (think)" "https://www.anthropic.com/claude/opus" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.29" "57.5" "62.5" "64.46" "62.8" "59.44" "65.19" "65.92" "60.54" "65.22" "65.57" "65.17" "72.56"
|
| 5 |
+
"Claude 4.1 Opus (20250805) (think)" "https://www.anthropic.com/claude/opus" "version: 20250805" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "63.24" "58.33" "61.39" "60.84" "64.02" "61.67" "66.85" "68.16" "61.08" "65.76" "66.67" "65.73" "65.24"
|
| 6 |
+
"GPT-5 mini (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-mini" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "62.56" "57.5" "56.39" "62.65" "62.2" "63.89" "60.22" "66.48" "67.03" "70.11" "67.76" "66.29" "60.98"
|
| 7 |
+
"Claude 4 Sonnet (20250514) (think)" "https://www.anthropic.com/claude/sonnet" "version: 20250514" "Claude" "" "" "" "Proprietary" "Hybrid" "On" "61.8" "54.17" "59.17" "63.86" "64.63" "59.44" "61.33" "64.8" "62.16" "65.22" "67.21" "66.29" "64.02"
|
| 8 |
+
"o3" "https://platform.openai.com/docs/models/o3" "" "GPT" "" "" "" "Proprietary" "Think" "On" "60.91" "57.5" "59.17" "61.45" "58.54" "61.11" "64.09" "60.89" "62.16" "63.59" "65.03" "54.49" "68.29"
|
| 9 |
+
"Gemini 2.5 Pro" "https://deepmind.google/models/gemini/pro/" "" "Gemini" "" "" "" "Proprietary" "Think" "On" "59.34" "53.61" "57.78" "59.04" "57.93" "57.22" "56.91" "60.89" "63.24" "67.93" "62.3" "61.24" "60.98"
|
| 10 |
+
"Grok-4" "https://x.ai/news/grok-4" "temperature: 0.6
|
| 11 |
+
top-p: 0.95" "Grok" "" "" "" "Proprietary" "Think" "On" "58.74" "57.78" "56.67" "62.65" "60.37" "58.33" "60.22" "59.78" "56.22" "62.5" "60.66" "52.25" "60.98"
|
| 12 |
+
"Gemini 2.5 Flash" "https://deepmind.google/models/gemini/flash/" "" "Gemini" "" "" "" "Proprietary" "Hybrid" "On" "58.62" "51.11" "56.39" "62.05" "56.71" "62.78" "60.77" "61.45" "60.0" "63.04" "57.92" "64.04" "56.71"
|
| 13 |
+
"o4-mini" "https://platform.openai.com/docs/models/o4-mini" "" "GPT" "" "" "" "Proprietary" "Think" "On" "57.57" "54.17" "55.0" "62.05" "59.76" "52.78" "58.56" "63.69" "55.68" "57.61" "60.66" "56.74" "60.98"
|
| 14 |
+
"Qwen3 235B A22B Thinking 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507" "temperature: 0.6
|
| 15 |
+
top-p: 0.95" "Qwen" "2404.5" "423.0" "235.0" "Open" "Think" "On" "55.48" "49.17" "53.33" "56.02" "58.54" "50.56" "62.43" "60.89" "52.97" "56.52" "60.11" "53.93" "60.37"
|
| 16 |
+
"GPT-5 nano (Reasoning: medium)" "https://platform.openai.com/docs/models/gpt-5-nano" "Reasoning: medium" "GPT" "" "" "" "Proprietary" "Think" "On" "55.39" "51.94" "53.89" "57.23" "53.66" "55.56" "58.01" "59.78" "54.59" "56.52" "59.02" "57.3" "51.83"
|
| 17 |
+
"GLM-4.5 FP8 (think)" "https://huggingface.co/zai-org/GLM-4.5-FP8" "temperature: 0.6
|
| 18 |
+
top-p: 0.95" "GLM" "1442.0" "604.0" "355.0" "Open" "Hybrid" "On" "54.03" "46.94" "54.17" "60.84" "58.54" "48.89" "55.8" "54.75" "48.11" "57.61" "57.92" "57.87" "54.88"
|
| 19 |
+
"Qwen3 235B A22B Instruct 2507" "https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507" "temperature: 0.7
|
| 20 |
+
top-p: 0.8" "Qwen" "433.0" "433.0" "235.0" "Open" "Instruct" "Off" "52.94" "46.67" "55.28" "53.61" "59.15" "46.11" "51.38" "55.87" "54.59" "53.26" "56.28" "54.49" "53.05"
|
| 21 |
+
"DeepSeek V3.1 (think)" "https://huggingface.co/deepseek-ai/DeepSeek-V3.1" "temperature: 0.6
|
| 22 |
+
top-p: 0.95" "DeepSeek" "710.5" "356.0" "671.0" "Open" "Hybrid" "On" "51.45" "44.44" "48.33" "56.63" "48.78" "48.89" "55.25" "53.07" "52.97" "56.52" "57.92" "50.56" "54.27"
|
| 23 |
+
"gpt-oss-120B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-120b" "Reasoning: medium
|
| 24 |
+
temperature: 1.0
|
| 25 |
+
top-p: 1.0" "GPT" "759.5" "370.5" "117.0" "Open" "Think" "On" "49.11" "46.67" "51.39" "51.81" "47.56" "45.0" "51.38" "54.75" "50.27" "51.63" "47.54" "46.07" "45.12"
|
| 26 |
+
"DeepSeek R1 (0528) (top_p: 0.95, temp:0.6)" "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528" "version: 0528
|
| 27 |
+
temperature: 0.6
|
| 28 |
+
top-p: 0.95" "DeepSeek" "1177.5" "554.0" "671.0" "Open" "Think" "On" "48.79" "42.22" "49.44" "50.0" "53.05" "47.22" "48.62" "50.28" "48.11" "51.63" "54.1" "44.38" "53.05"
|
| 29 |
+
"Gauss2.3 Hybrid" "" "" "Gauss" "546.0" "308.0" "" "Proprietary" "Hybrid" "On" "46.58" "39.72" "45.56" "48.8" "48.17" "45.0" "44.2" "53.63" "45.41" "52.17" "51.91" "44.94" "47.56"
|
| 30 |
+
"DeepSeek V3 (0324) (top_p: 0.95, temp:1.3)" "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324" "version: 0324
|
| 31 |
+
temperature: 1.3
|
| 32 |
+
top-p: 0.95" "DeepSeek" "408.0" "408.0" "671.0" "Open" "Instruct" "Off" "45.09" "37.5" "43.61" "46.99" "51.22" "45.56" "44.75" "44.69" "44.32" "48.91" "49.18" "44.94" "49.39"
|
| 33 |
+
"Qwen3 32B (think)" "https://huggingface.co/Qwen/Qwen3-32B" "temperature: 0.6
|
| 34 |
+
top-p: 0.95" "Qwen" "1113.0" "390.0" "32.8" "Open" "Hybrid" "On" "44.44" "38.89" "41.67" "48.8" "50.0" "38.33" "46.41" "44.69" "44.86" "44.57" "50.82" "46.07" "47.56"
|
| 35 |
+
"A.X 4.0" "https://huggingface.co/skt/A.X-4.0" "" "SKT" "412.5" "412.5" "71.9" "Open" "Instruct" "Off" "41.59" "38.89" "41.11" "43.98" "49.39" "36.11" "45.86" "43.58" "44.32" "39.67" "43.17" "39.89" "36.59"
|
| 36 |
+
"gpt-oss-20B (Reasoning: medium)" "https://huggingface.co/openai/gpt-oss-20b" "Reasoning: medium
|
| 37 |
+
temperature: 1.0
|
| 38 |
+
top-p: 1.0" "GPT" "953.5" "326.0" "21.0" "Open" "Think" "On" "41.18" "36.67" "42.78" "45.78" "45.73" "37.78" "35.91" "41.9" "39.46" "51.09" "40.44" "38.76" "41.46"
|
| 39 |
+
"EXAONE 4.0 32B (think)" "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B" "temperature: 0.6
|
| 40 |
+
top-p: 0.95" "Exaone" "1274.5" "503.0" "32.0" "Open" "Hybrid" "On" "33.82" "33.61" "38.33" "28.92" "35.98" "26.11" "35.91" "34.08" "38.92" "35.33" "33.88" "28.09" "31.71"
|
| 41 |
+
"HyperCLOVAX SEED Think 14B (think)" "https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" "temperature: 0.5
|
| 42 |
+
top-p: 0.6" "HCX" "1444.0" "382.5" "14.7" "Open" "Hybrid" "On" "31.84" "32.22" "37.22" "31.93" "38.41" "27.78" "32.6" "30.17" "29.19" "32.07" "33.33" "25.28" "26.22"
|
| 43 |
+
"Solar Pro Preview (top_p:0.95, temp: 0.7)" "https://huggingface.co/upstage/solar-pro-preview-instruct" "temperature: 0.7
|
| 44 |
+
top-p: 0.95" "Solar" "260.0" "260.0" "22.0" "Open" "Instruct" "Off" "20.73" "9.72" "22.22" "21.08" "24.39" "9.44" "18.23" "24.02" "29.73" "29.89" "33.33" "22.47" "12.8"
|
| 45 |
+
"Mi:dm 2.0 Base Instruct" "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct" "temperature: 0.8
|
| 46 |
+
top-p: 0.7" "kt" "316.0" "316.0" "11.5" "Open" "Instruct" "Off" "20.25" "26.39" "26.39" "17.47" "26.83" "13.33" "18.78" "20.67" "16.22" "20.65" "21.31" "12.92" "9.15"
|
| 47 |
+
"Kanana 1.5 15.7B A3B Instruct" "https://huggingface.co/kakaocorp/kanana-1.5-15.7b-a3b-instruct" "temperature: 1.0
|
| 48 |
+
top-p: 0.95" "kakao" "414.0" "414.0" "15.7" "Open" "Instruct" "Off" "11.71" "21.11" "20.28" "10.84" "15.24" "5.56" "7.73" "8.94" "9.19" "8.15" "5.46" "5.06" "4.88"
|
src/data_loader.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
# Module-level caches: each CSV is read from disk once and reused across
# calls; clear_cache() resets both so the files are re-read on next access.
_category_df_cache: Optional[pd.DataFrame] = None
_language_df_cache: Optional[pd.DataFrame] = None
|
| 8 |
+
|
| 9 |
+
def _load_category_csv() -> pd.DataFrame:
    """Read data/stats.csv (tab-separated, UTF-8) and return a fresh copy."""
    csv_path = Path(__file__).parent / "data" / "stats.csv"
    frame = pd.read_csv(str(csv_path), encoding='utf-8', delimiter="\t")
    return frame.copy()
|
| 14 |
+
|
| 15 |
+
def _load_language_csv() -> pd.DataFrame:
    """Read data/stats_lang.csv (tab-separated, UTF-8) and return a fresh copy."""
    csv_path = Path(__file__).parent / "data" / "stats_lang.csv"
    frame = pd.read_csv(str(csv_path), encoding='utf-8', delimiter="\t")
    return frame.copy()
|
| 20 |
+
|
| 21 |
+
def get_category_dataframe(processed: bool = True) -> pd.DataFrame:
    """Return the category leaderboard dataframe (cached after first load).

    Args:
        processed: When True, normalize the frame for display (ensure every
            expected column exists, coerce and round the numeric columns,
            default the "Think" column, and blank out NaNs) — the shape
            vis_utils.py expects. When False, return the raw frame sorted
            by "Overall" descending — the shape data_utils.py expects.

    Returns:
        pd.DataFrame: A copy of the cached category dataframe.
    """
    global _category_df_cache

    if _category_df_cache is None:
        _category_df_cache = _load_category_csv()

    frame = _category_df_cache.copy()

    if not processed:
        # Raw view: simply rank models by their overall score.
        return frame.sort_values("Overall", ascending=False)

    # Display view: make sure every column the UI references exists.
    expected_columns = [
        "Model Name", "Link", "Group", "Overall", "Med. Len.",
        "Med. Resp. Len.", "Parameter Size (B)", "Type", "Model Type",
        "Think", "Content Generation", "Editing", "Data Analysis",
        "Reasoning", "Hallucination", "Safety", "Repetition",
        "Summarization", "Translation", "Multi-Turn",
    ]
    for column in expected_columns:
        if column not in frame.columns:
            frame[column] = "" if column in ("Link", "Group") else 0

    # Imported lazily so module import does not hard-depend on constants.
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    for column in NUMERIC_COLS_CATEGORY:
        if column in frame.columns:
            decimals = 0 if column in NUMERIC_INT_COLS_CATEGORY else 3
            frame[column] = pd.to_numeric(frame[column], errors='coerce').round(decimals)
        else:
            frame[column] = 0

    if "Think" not in frame.columns:
        frame["Think"] = "Off"

    return frame.fillna('')
|
| 72 |
+
|
| 73 |
+
def get_language_dataframe(processed: bool = True) -> pd.DataFrame:
    """Return the per-language leaderboard dataframe (cached after first load).

    Args:
        processed: When True, normalize the frame for display (ensure every
            expected column exists, coerce and round the numeric columns,
            and blank out NaNs) — the shape vis_utils.py expects. When
            False, return the raw frame sorted by "Overall" descending —
            the shape data_utils.py expects.

    Returns:
        pd.DataFrame: A copy of the cached language dataframe.
    """
    global _language_df_cache

    if _language_df_cache is None:
        _language_df_cache = _load_language_csv()

    frame = _language_df_cache.copy()

    if not processed:
        # Raw view: simply rank models by their overall score.
        return frame.sort_values("Overall", ascending=False)

    # Display view: make sure every column the UI references exists.
    expected_columns = [
        "Model Name", "Link", "Group", "Overall", "Med. Len.",
        "Med. Resp. Len.", "Parameter Size (B)", "Type", "Model Type",
        "Think", "KO", "EN", "JA", "ZH", "PL", "DE", "PT", "ES",
        "FR", "IT", "RU", "VI",
    ]
    for column in expected_columns:
        if column not in frame.columns:
            frame[column] = "" if column in ("Link", "Group") else 0

    # Imported lazily so module import does not hard-depend on constants.
    from constants import NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
    for column in NUMERIC_COLS_LANGUAGE:
        if column in frame.columns:
            decimals = 0 if column in NUMERIC_INT_COLS_LANGUAGE else 3
            frame[column] = pd.to_numeric(frame[column], errors='coerce').round(decimals)
        else:
            frame[column] = 0

    return frame.fillna('')
|
| 117 |
+
|
| 118 |
+
def clear_cache():
    """Drop both cached dataframes so the CSVs are re-read on next access."""
    global _category_df_cache, _language_df_cache
    _language_df_cache = None
    _category_df_cache = None
|
src/data_utils.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
def get_dataframe_category():
    """Return the raw category leaderboard, sorted by overall score."""
    # Local import keeps this thin wrapper free of import-time cycles.
    from src.data_loader import get_category_dataframe
    raw = get_category_dataframe(processed=False)
    return raw
|
| 7 |
+
|
| 8 |
+
def get_dataframe_language():
    """Return the raw per-language leaderboard, sorted by overall score."""
    # Local import keeps this thin wrapper free of import-time cycles.
    from src.data_loader import get_language_dataframe
    raw = get_language_dataframe(processed=False)
    return raw
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
def get_length_category_df(selected_category):
|
| 15 |
+
"""
|
| 16 |
+
Loads length_data.json and returns a DataFrame for the selected category.
|
| 17 |
+
Columns: Model Name, {Category} Min, {Category} Max, {Category} Med, {Category} Med Resp
|
| 18 |
+
"""
|
| 19 |
+
abs_path = Path(__file__).parent
|
| 20 |
+
json_path = abs_path / "data/length_data.json"
|
| 21 |
+
with open(json_path, "r", encoding="utf-8") as f:
|
| 22 |
+
data = json.load(f)
|
| 23 |
+
rows = []
|
| 24 |
+
for model_name, stats in data.items():
|
| 25 |
+
cat = stats.get(selected_category, {})
|
| 26 |
+
row = {
|
| 27 |
+
"Model Name": model_name,
|
| 28 |
+
f"Min Len. ({selected_category})": cat.get("Min", None),
|
| 29 |
+
f"Max Len. ({selected_category}))": cat.get("Max", None),
|
| 30 |
+
f"Med. Len. ({selected_category})": cat.get("Med", None),
|
| 31 |
+
f"Med. Resp. Len. ({selected_category})": cat.get("Med Resp", None),
|
| 32 |
+
}
|
| 33 |
+
rows.append(row)
|
| 34 |
+
df = pd.DataFrame(rows)
|
| 35 |
+
return df
|
| 36 |
+
|
| 37 |
+
def get_length_category_list(json_path=None):
    """
    Return the categories available in length_data.json, excluding 'Overall'.

    Args:
        json_path: Optional path to the JSON file; defaults to
            ``data/length_data.json`` next to this module. Exposed mainly
            for testing.

    Returns:
        list[str]: Category keys taken from the first model's entry, in
        file order. Empty list when the file holds no models.
    """
    if json_path is None:
        json_path = Path(__file__).parent / "data/length_data.json"
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not data:
        return []
    # Assumes every model shares the same category keys; use the first one.
    first_model = next(iter(data.values()))
    return [k for k in first_model if k != "Overall"]
|
src/display/css_html_js.py
ADDED
|
@@ -0,0 +1,766 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
custom_css = """
|
| 2 |
+
/* Info icon for column tooltips */
|
| 3 |
+
.info-icon {
|
| 4 |
+
color: #b0b0b0;
|
| 5 |
+
font-size: 15px;
|
| 6 |
+
margin-left: 4px;
|
| 7 |
+
vertical-align: middle;
|
| 8 |
+
font-family: Arial, sans-serif;
|
| 9 |
+
font-style: normal;
|
| 10 |
+
font-weight: bold;
|
| 11 |
+
user-select: none;
|
| 12 |
+
}
|
| 13 |
+
.info-icon:hover {
|
| 14 |
+
color: #888;
|
| 15 |
+
}
|
| 16 |
+
/* Model Name link hover effect */
|
| 17 |
+
.pretty-leaderboard-table a:hover {
|
| 18 |
+
text-decoration: underline;
|
| 19 |
+
color: #1098F7;
|
| 20 |
+
cursor: pointer;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
/* INTRO FEATURE CARDS (for about page) */
|
| 24 |
+
.intro-feature-row {
|
| 25 |
+
display: flex;
|
| 26 |
+
flex-wrap: wrap;
|
| 27 |
+
justify-content: center;
|
| 28 |
+
gap: 24px;
|
| 29 |
+
margin: 18px 0 !important;
|
| 30 |
+
}
|
| 31 |
+
.intro-feature-box {
|
| 32 |
+
background: linear-gradient(135deg, #f8fafc 60%, #e3e6f3 100%);
|
| 33 |
+
border-radius: 18px;
|
| 34 |
+
box-shadow: 0 4px 16px rgba(44,62,80,0.08);
|
| 35 |
+
padding: 32px 28px;
|
| 36 |
+
width: 380px;
|
| 37 |
+
min-width: 260px;
|
| 38 |
+
max-width: 420px;
|
| 39 |
+
text-align: left;
|
| 40 |
+
display: flex;
|
| 41 |
+
flex-direction: column;
|
| 42 |
+
align-items: flex-start;
|
| 43 |
+
transition: box-shadow 0.2s;
|
| 44 |
+
}
|
| 45 |
+
.intro-feature-title {
|
| 46 |
+
font-weight: 900;
|
| 47 |
+
font-size: 1.45em;
|
| 48 |
+
margin-bottom: 12px;
|
| 49 |
+
color: #23244a;
|
| 50 |
+
}
|
| 51 |
+
.intro-feature-desc {
|
| 52 |
+
font-size: 1.18em;
|
| 53 |
+
color: #444;
|
| 54 |
+
margin-bottom: 7px;
|
| 55 |
+
}
|
| 56 |
+
.intro-feature-icon {
|
| 57 |
+
font-size: 2.3em;
|
| 58 |
+
margin-bottom: 16px;
|
| 59 |
+
color: #1098F7;
|
| 60 |
+
}
|
| 61 |
+
.intro-feature-box:hover {
|
| 62 |
+
box-shadow: 0 0 24px #a5a1ff55, 0 4px 16px rgba(0,0,0,0.18);
|
| 63 |
+
transform: translateY(-4px) scale(1.025);
|
| 64 |
+
transition: box-shadow 0.2s, transform 0.2s;
|
| 65 |
+
cursor: default;
|
| 66 |
+
}
|
| 67 |
+
@media (prefers-color-scheme: dark) {
|
| 68 |
+
.intro-feature-box {
|
| 69 |
+
background: linear-gradient(135deg, #23244a 0%, #2a1859 100%) !important;
|
| 70 |
+
color: #f5f6f7 !important;
|
| 71 |
+
}
|
| 72 |
+
.intro-feature-title, .intro-feature-desc {
|
| 73 |
+
color: #f5f6f7 !important;
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/* Dataset Sample Button (below feature cards) */
|
| 78 |
+
.intro-dataset-btn {
|
| 79 |
+
display: inline-block;
|
| 80 |
+
background: #1098F7;
|
| 81 |
+
color: #fff !important;
|
| 82 |
+
border: none;
|
| 83 |
+
border-radius: 12px;
|
| 84 |
+
font-weight: 700;
|
| 85 |
+
font-size: 1.18em;
|
| 86 |
+
padding: 16px 36px;
|
| 87 |
+
margin: 32px auto 0 auto;
|
| 88 |
+
text-align: center;
|
| 89 |
+
text-decoration: none;
|
| 90 |
+
box-shadow: 0 2px 8px #1098f733;
|
| 91 |
+
transition: background 0.18s, color 0.18s, box-shadow 0.18s;
|
| 92 |
+
cursor: pointer;
|
| 93 |
+
outline: none;
|
| 94 |
+
}
|
| 95 |
+
.intro-dataset-btn:hover, .intro-dataset-btn:focus {
|
| 96 |
+
background: #0a6dc2;
|
| 97 |
+
color: #fff !important;
|
| 98 |
+
box-shadow: 0 4px 16px #1098f755;
|
| 99 |
+
text-decoration: none;
|
| 100 |
+
}
|
| 101 |
+
@media (prefers-color-scheme: dark) {
|
| 102 |
+
.intro-dataset-btn {
|
| 103 |
+
background: #1a4b7a;
|
| 104 |
+
color: #fff !important;
|
| 105 |
+
}
|
| 106 |
+
.intro-dataset-btn:hover, .intro-dataset-btn:focus {
|
| 107 |
+
background: #1098F7;
|
| 108 |
+
color: #fff !important;
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.radar-chart, .plot-container {
|
| 113 |
+
display: block;
|
| 114 |
+
margin-left: auto;
|
| 115 |
+
margin-right: auto;
|
| 116 |
+
width: fit-content;
|
| 117 |
+
max-width: 100%;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
/* Ensure injected HTML/Markdown blocks are transparent and text is visible in all color schemes */
|
| 121 |
+
.gr-html, .gr-markdown, .gr-html * {
|
| 122 |
+
background: transparent !important;
|
| 123 |
+
color: inherit !important;
|
| 124 |
+
}
|
| 125 |
+
.gr-html div, .gr-html body, .gr-markdown div, .gr-markdown body {
|
| 126 |
+
background: transparent !important;
|
| 127 |
+
color: inherit !important;
|
| 128 |
+
}
|
| 129 |
+
@media (prefers-color-scheme: dark) {
|
| 130 |
+
.gr-html, .gr-markdown, .gr-html *, .gr-markdown * {
|
| 131 |
+
color: #f5f6f7 !important;
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
@media (prefers-color-scheme: light) {
|
| 135 |
+
.gr-html, .gr-markdown, .gr-html *, .gr-markdown * {
|
| 136 |
+
color: #23244a !important;
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/* Custom radio styles for category selector */
|
| 141 |
+
.cat-btn-radio label {
|
| 142 |
+
border-radius: 18px !important;
|
| 143 |
+
border: 1.5px solid #d1d5db !important;
|
| 144 |
+
background: #f8fafc !important;
|
| 145 |
+
color: #222 !important;
|
| 146 |
+
font-weight: 600 !important;
|
| 147 |
+
cursor: pointer !important;
|
| 148 |
+
padding: 8px 20px !important;
|
| 149 |
+
box-shadow: 0 2px 8px #e5e7eb88 !important;
|
| 150 |
+
margin: 0 !important;
|
| 151 |
+
font-size: 1.08rem !important;
|
| 152 |
+
transition: background 0.2s, color 0.2s, box-shadow 0.2s, border 0.2s !important;
|
| 153 |
+
display: inline-block !important;
|
| 154 |
+
}
|
| 155 |
+
.cat-btn-radio input[type="radio"] {
|
| 156 |
+
display: none !important;
|
| 157 |
+
}
|
| 158 |
+
.cat-btn-radio input[type="radio"]:checked + label,
|
| 159 |
+
.cat-btn-radio label.selected {
|
| 160 |
+
background: #1098F7 !important;
|
| 161 |
+
color: #fff !important;
|
| 162 |
+
border: 1.5px solid #1098F7 !important;
|
| 163 |
+
box-shadow: 0 4px 16px #1098f755, 0 2px 8px #e5e7eb88 !important;
|
| 164 |
+
}
|
| 165 |
+
.cat-btn-radio label:hover {
|
| 166 |
+
border: 1.5px solid #1098F7 !important;
|
| 167 |
+
box-shadow: 0 4px 16px #1098f733, 0 2px 8px #e5e7eb88 !important;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
/* Gradio tab content: Space-themed background */
|
| 171 |
+
.gr-tabitem {
|
| 172 |
+
background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
|
| 173 |
+
background-image:
|
| 174 |
+
radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
|
| 175 |
+
radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
|
| 176 |
+
background-size: 40px 40px, 80px 80px;
|
| 177 |
+
background-position: 0 0, 20px 20px;
|
| 178 |
+
}
|
| 179 |
+
@media (prefers-color-scheme: dark) {
|
| 180 |
+
.gr-tabitem {
|
| 181 |
+
background: linear-gradient(135deg, #181c3a 0%, #2a1859 100%) !important;
|
| 182 |
+
background-image:
|
| 183 |
+
radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
|
| 184 |
+
radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
|
| 185 |
+
background-size: 40px 40px, 80px 80px;
|
| 186 |
+
background-position: 0 0, 20px 20px;
|
| 187 |
+
}
|
| 188 |
+
}
|
| 189 |
+
@media (prefers-color-scheme: light) {
|
| 190 |
+
.gr-tabitem {
|
| 191 |
+
background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%) !important;
|
| 192 |
+
background-image:
|
| 193 |
+
radial-gradient(rgba(255,255,255,0.10) 1.2px, transparent 1.2px),
|
| 194 |
+
radial-gradient(rgba(255,255,255,0.06) 1px, transparent 1px);
|
| 195 |
+
background-size: 40px 40px, 80px 80px;
|
| 196 |
+
background-position: 0 0, 20px 20px;
|
| 197 |
+
}
|
| 198 |
+
h3 a, h3 a:visited {
|
| 199 |
+
color: #222 !important;
|
| 200 |
+
}
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
/* Sort arrow/button styles */
|
| 204 |
+
.sort-arrow, .sort-btn {
|
| 205 |
+
display: inline-flex;
|
| 206 |
+
align-items: center;
|
| 207 |
+
justify-content: center;
|
| 208 |
+
background: #23244a;
|
| 209 |
+
color: #ffd700 !important; /* Always yellow */
|
| 210 |
+
border: 1.5px solid #ffd700; /* Gold border */
|
| 211 |
+
border-radius: 6px;
|
| 212 |
+
font-size: 15px;
|
| 213 |
+
font-weight: 700;
|
| 214 |
+
margin-left: 6px;
|
| 215 |
+
margin-right: 2px;
|
| 216 |
+
padding: 2px 8px 2px 6px;
|
| 217 |
+
cursor: pointer;
|
| 218 |
+
transition: background 0.2s, color 0.2s, border 0.2s;
|
| 219 |
+
min-width: 28px;
|
| 220 |
+
min-height: 28px;
|
| 221 |
+
outline: none;
|
| 222 |
+
}
|
| 223 |
+
.sort-arrow.active, .sort-btn.active {
|
| 224 |
+
color: #ffd700 !important; /* Gold */
|
| 225 |
+
border-color: #ffd700;
|
| 226 |
+
background: #1a237e;
|
| 227 |
+
}
|
| 228 |
+
.sort-arrow:hover, .sort-btn:hover {
|
| 229 |
+
background: #ffd700;
|
| 230 |
+
color: #23244a !important;
|
| 231 |
+
border-color: #ffd700;
|
| 232 |
+
}
|
| 233 |
+
.sort-arrow svg, .sort-btn svg {
|
| 234 |
+
margin-left: 2px;
|
| 235 |
+
margin-right: 0;
|
| 236 |
+
width: 1em;
|
| 237 |
+
height: 1em;
|
| 238 |
+
vertical-align: middle;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
/* Enhanced leaderboard table styles */
|
| 242 |
+
.pretty-leaderboard-table {
|
| 243 |
+
width: 100%;
|
| 244 |
+
border-collapse: separate;
|
| 245 |
+
border-spacing: 0;
|
| 246 |
+
background: rgba(30, 34, 54, 0.98);
|
| 247 |
+
/* border-radius: 16px; 테이블 자체에는 radius 제거 */
|
| 248 |
+
box-shadow: 0 4px 24px 0 rgba(16, 152, 247, 0.10), 0 1.5px 6px 0 rgba(227, 84, 84, 0.08);
|
| 249 |
+
margin-bottom: 24px;
|
| 250 |
+
}
|
| 251 |
+
.pretty-leaderboard-table thead {
|
| 252 |
+
border-radius: 16px 16px 0 0;
|
| 253 |
+
overflow: hidden;
|
| 254 |
+
background: #23244a;
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
/* Sticky first and second columns */
|
| 258 |
+
/* Sticky first and second columns - header(th) */
|
| 259 |
+
.pretty-leaderboard-table th:nth-child(1) {
|
| 260 |
+
position: sticky;
|
| 261 |
+
left: 0;
|
| 262 |
+
top: 0;
|
| 263 |
+
z-index: 5;
|
| 264 |
+
background: #23244a;
|
| 265 |
+
min-width: 60px;
|
| 266 |
+
max-width: 60px;
|
| 267 |
+
width: 60px;
|
| 268 |
+
}
|
| 269 |
+
.pretty-leaderboard-table th:nth-child(2) {
|
| 270 |
+
position: sticky;
|
| 271 |
+
left: 60px;
|
| 272 |
+
top: 0;
|
| 273 |
+
z-index: 5;
|
| 274 |
+
background: #23244a;
|
| 275 |
+
min-width: 220px;
|
| 276 |
+
max-width: 400px;
|
| 277 |
+
width: 220px;
|
| 278 |
+
}
|
| 279 |
+
/* Sticky first and second columns - body(td) with CSS variable for background */
|
| 280 |
+
.pretty-leaderboard-table td:nth-child(1) {
|
| 281 |
+
position: sticky;
|
| 282 |
+
left: 0;
|
| 283 |
+
z-index: 4;
|
| 284 |
+
background: var(--row-bg) !important;
|
| 285 |
+
min-width: 60px;
|
| 286 |
+
max-width: 60px;
|
| 287 |
+
width: 60px;
|
| 288 |
+
}
|
| 289 |
+
.pretty-leaderboard-table td:nth-child(2) {
|
| 290 |
+
position: sticky;
|
| 291 |
+
left: 60px;
|
| 292 |
+
z-index: 4;
|
| 293 |
+
background: var(--row-bg) !important;
|
| 294 |
+
min-width: 220px;
|
| 295 |
+
max-width: 400px;
|
| 296 |
+
width: 220px;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
/* Set --row-bg variable for each row type */
|
| 300 |
+
.pretty-leaderboard-table tr {
|
| 301 |
+
--row-bg: #1e2236;
|
| 302 |
+
}
|
| 303 |
+
.pretty-leaderboard-table tr:nth-child(even) {
|
| 304 |
+
--row-bg: #23253a;
|
| 305 |
+
}
|
| 306 |
+
.pretty-leaderboard-table tr:hover {
|
| 307 |
+
--row-bg: #2066a0;
|
| 308 |
+
}
|
| 309 |
+
.pretty-leaderboard-table th {
|
| 310 |
+
z-index: 4;
|
| 311 |
+
}
|
| 312 |
+
.pretty-leaderboard-table th, .pretty-leaderboard-table td {
|
| 313 |
+
padding: 12px 16px;
|
| 314 |
+
text-align: left;
|
| 315 |
+
border-bottom: 1px solid #23244a;
|
| 316 |
+
font-size: 16px;
|
| 317 |
+
}
|
| 318 |
+
.pretty-leaderboard-table th {
|
| 319 |
+
background: #23244a;
|
| 320 |
+
color: #fff;
|
| 321 |
+
font-weight: 800;
|
| 322 |
+
letter-spacing: 0.5px;
|
| 323 |
+
border-bottom: 2px solid #1098F7;
|
| 324 |
+
text-shadow: 0 1px 8px #0006;
|
| 325 |
+
transition: background 0.2s, color 0.2s;
|
| 326 |
+
position: sticky;
|
| 327 |
+
top: 0;
|
| 328 |
+
z-index: 2;
|
| 329 |
+
border-radius: 0 !important;
|
| 330 |
+
}
|
| 331 |
+
.pretty-leaderboard-table th:hover, .pretty-leaderboard-table th:focus {
|
| 332 |
+
background: #273a8a;
|
| 333 |
+
color: #fff;
|
| 334 |
+
}
|
| 335 |
+
.pretty-leaderboard-table td {
|
| 336 |
+
color: #F5F6F7;
|
| 337 |
+
vertical-align: middle;
|
| 338 |
+
background: var(--row-bg);
|
| 339 |
+
}
|
| 340 |
+
.pretty-leaderboard-table tr:last-child td {
|
| 341 |
+
border-bottom: none;
|
| 342 |
+
}
|
| 343 |
+
/* th/td의 border-radius는 모두 제거, 둥근 효과는 thead에만 */
|
| 344 |
+
|
| 345 |
+
/* Enhanced score bar styles */
|
| 346 |
+
.score-bar {
|
| 347 |
+
display: flex;
|
| 348 |
+
align-items: center;
|
| 349 |
+
gap: 12px;
|
| 350 |
+
width: 100%;
|
| 351 |
+
}
|
| 352 |
+
.score-bar-track {
|
| 353 |
+
flex-grow: 1;
|
| 354 |
+
height: 10px;
|
| 355 |
+
background: rgba(245, 246, 247, 0.12);
|
| 356 |
+
border-radius: 5px;
|
| 357 |
+
overflow: hidden;
|
| 358 |
+
max-width: 220px;
|
| 359 |
+
box-shadow: 0 1px 4px 0 rgba(16, 152, 247, 0.10);
|
| 360 |
+
}
|
| 361 |
+
.score-bar-fill {
|
| 362 |
+
height: 100%;
|
| 363 |
+
background: linear-gradient(90deg, #a259f7 0%, #6d28d9 100%);
|
| 364 |
+
border-radius: 5px;
|
| 365 |
+
transition: width 0.3s cubic-bezier(0.4,0,0.2,1);
|
| 366 |
+
}
|
| 367 |
+
.score-bar-value {
|
| 368 |
+
font-family: 'SF Mono', monospace;
|
| 369 |
+
font-weight: 600;
|
| 370 |
+
color: #F5F6F7;
|
| 371 |
+
min-width: 60px;
|
| 372 |
+
font-size: 14px;
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
body {
|
| 376 |
+
min-height: 100vh;
|
| 377 |
+
}
|
| 378 |
+
/* ���체 배경색은 브라우저 기본값을 따름. gradio-container도 마찬가지로 별도 배경 없음 */
|
| 379 |
+
|
| 380 |
+
.markdown-text {
|
| 381 |
+
font-size: 16px !important;
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
#citation-button span {
|
| 385 |
+
font-size: 16px !important;
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
#citation-button textarea {
|
| 389 |
+
font-size: 16px !important;
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
#citation-button > label > button {
|
| 393 |
+
margin: 6px;
|
| 394 |
+
transform: scale(1.3);
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
.leaderboard-table-container {
|
| 398 |
+
margin-top: 15px;
|
| 399 |
+
/* Space-themed background */
|
| 400 |
+
background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%);
|
| 401 |
+
position: relative;
|
| 402 |
+
background-image:
|
| 403 |
+
radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
|
| 404 |
+
radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
|
| 405 |
+
background-size: 40px 40px, 80px 80px;
|
| 406 |
+
background-position: 0 0, 20px 20px;
|
| 407 |
+
}
|
| 408 |
+
@media (prefers-color-scheme: dark) {
|
| 409 |
+
.leaderboard-table-container {
|
| 410 |
+
background: linear-gradient(135deg, #1a237e 0%, #311b92 100%) !important;
|
| 411 |
+
background-image:
|
| 412 |
+
radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
|
| 413 |
+
radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
|
| 414 |
+
background-size: 40px 40px, 80px 80px;
|
| 415 |
+
background-position: 0 0, 20px 20px;
|
| 416 |
+
}
|
| 417 |
+
}
|
| 418 |
+
@media (prefers-color-scheme: light) {
|
| 419 |
+
.leaderboard-table-container {
|
| 420 |
+
background: linear-gradient(135deg, #e3e6f3 60%, #f5f6fa 100%) !important;
|
| 421 |
+
background-image:
|
| 422 |
+
radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
|
| 423 |
+
radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
|
| 424 |
+
background-size: 40px 40px, 80px 80px;
|
| 425 |
+
background-position: 0 0, 20px 20px;
|
| 426 |
+
}
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
/* Limit the width of the first column so that names don't expand too much */
|
| 430 |
+
.leaderboard-table-container td:nth-child(2),
|
| 431 |
+
.leaderboard-table-container th:nth-child(2) {
|
| 432 |
+
max-width: 400px;
|
| 433 |
+
overflow: auto;
|
| 434 |
+
white-space: nowrap;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
.tab-buttons button {
|
| 438 |
+
font-size: 20px;
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
/* Model type and think badge styles */
|
| 443 |
+
.badge {
|
| 444 |
+
display: inline-block;
|
| 445 |
+
border-radius: 12px;
|
| 446 |
+
padding: 2px 10px;
|
| 447 |
+
font-size: 0.85em;
|
| 448 |
+
font-weight: 700;
|
| 449 |
+
margin-left: 6px;
|
| 450 |
+
box-shadow: 0 1px 4px rgba(0,0,0,0.10);
|
| 451 |
+
vertical-align: middle;
|
| 452 |
+
}
|
| 453 |
+
.badge-think-on {
|
| 454 |
+
background: #A7C7E7;
|
| 455 |
+
color: #234567;
|
| 456 |
+
border: 1.5px solid #A7C7E7;
|
| 457 |
+
}
|
| 458 |
+
.badge-think-off {
|
| 459 |
+
background: #E0E0E0;
|
| 460 |
+
color: #555;
|
| 461 |
+
border: 1.5px solid #E0E0E0;
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
/* Model Type badge styles */
|
| 465 |
+
.badge-modeltype-instruct {
|
| 466 |
+
background: #B2F2E9;
|
| 467 |
+
color: #22796A;
|
| 468 |
+
border: 1.5px solid #B2F2E9;
|
| 469 |
+
}
|
| 470 |
+
.badge-modeltype-think {
|
| 471 |
+
background: #D6C8F7;
|
| 472 |
+
color: #5B4B8A;
|
| 473 |
+
border: 1.5px solid #D6C8F7;
|
| 474 |
+
}
|
| 475 |
+
.badge-modeltype-hybrid {
|
| 476 |
+
background: #FFE0B2;
|
| 477 |
+
color: #A67C52;
|
| 478 |
+
border: 1.5px solid #FFE0B2;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
/* Type badge Open/Proprietary styles */
|
| 482 |
+
.badge-type-open {
|
| 483 |
+
background: #A8E6A3;
|
| 484 |
+
color: #225522;
|
| 485 |
+
border: 1.5px solid #A8E6A3;
|
| 486 |
+
}
|
| 487 |
+
.badge-type-proprietary {
|
| 488 |
+
background: #F7B2B7;
|
| 489 |
+
color: #7A2F34;
|
| 490 |
+
border: 1.5px solid #F7B2B7;
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
/* Sort button styles */
|
| 495 |
+
.sort-btn {
|
| 496 |
+
background: #23244a;
|
| 497 |
+
color: #F5F6F7;
|
| 498 |
+
border: 1px solid #1098F7;
|
| 499 |
+
border-radius: 6px;
|
| 500 |
+
font-size: 13px;
|
| 501 |
+
font-weight: 700;
|
| 502 |
+
margin-left: 4px;
|
| 503 |
+
margin-right: 2px;
|
| 504 |
+
padding: 2px 7px;
|
| 505 |
+
cursor: pointer;
|
| 506 |
+
transition: background 0.2s, color 0.2s;
|
| 507 |
+
}
|
| 508 |
+
.sort-btn:hover {
|
| 509 |
+
background: #1098F7;
|
| 510 |
+
color: #fff;
|
| 511 |
+
}
|
| 512 |
+
|
| 513 |
+
/* Custom CheckboxGroup and Dropdown styles for table theme */
|
| 514 |
+
.gr-checkbox-group, .gr-checkbox, .gr-checkbox-group label, .gr-checkbox input[type="checkbox"] {
|
| 515 |
+
background: #23244a !important;
|
| 516 |
+
color: #F5F6F7 !important;
|
| 517 |
+
border: 1.5px solid #1098F7 !important;
|
| 518 |
+
border-radius: 6px !important;
|
| 519 |
+
}
|
| 520 |
+
.gr-checkbox input[type="checkbox"]:checked {
|
| 521 |
+
background: #1a237e !important;
|
| 522 |
+
border-color: #ffd700 !important;
|
| 523 |
+
}
|
| 524 |
+
.gr-dropdown, .gr-input, select {
|
| 525 |
+
background: #23244a !important;
|
| 526 |
+
color: #F5F6F7 !important;
|
| 527 |
+
border: 1.5px solid #1098F7 !important;
|
| 528 |
+
border-radius: 6px !important;
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
/* Custom style for radar chart model selector's selected tags (only the tag area, not the dropdown list) */
|
| 532 |
+
.custom-dropdown .multiselect__tag {
|
| 533 |
+
background: #1098F7 !important;
|
| 534 |
+
color: #fff !important;
|
| 535 |
+
border: 1.5px solid #1098F7 !important;
|
| 536 |
+
box-shadow: 0 4px 16px #1098f755, 0 2px 8px #e5e7eb88 !important;
|
| 537 |
+
border-radius: 18px !important;
|
| 538 |
+
font-weight: 600 !important;
|
| 539 |
+
padding: 8px 20px !important;
|
| 540 |
+
margin: 2px 4px !important;
|
| 541 |
+
font-size: 1.08rem !important;
|
| 542 |
+
display: inline-block !important;
|
| 543 |
+
transition: background 0.2s, color 0.2s, box-shadow 0.2s, border 0.2s !important;
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
.gr-dropdown:focus, .gr-input:focus, select:focus {
|
| 547 |
+
border-color: #ffd700 !important;
|
| 548 |
+
outline: none !important;
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
@media (prefers-color-scheme: dark) {
|
| 552 |
+
.category-box,
|
| 553 |
+
.space-info-box,
|
| 554 |
+
.pretty-leaderboard-table,
|
| 555 |
+
.dark-container {
|
| 556 |
+
background: linear-gradient(135deg, #23244a 0%, #2a1859 100%) !important;
|
| 557 |
+
color: #f5f6f7 !important;
|
| 558 |
+
}
|
| 559 |
+
.space-info-box, .space-info-box * {
|
| 560 |
+
color: #f5f6f7 !important;
|
| 561 |
+
}
|
| 562 |
+
h3 a, h3 a:visited {
|
| 563 |
+
color: #f5f6f7 !important;
|
| 564 |
+
}
|
| 565 |
+
}
|
| 566 |
+
"""
|
| 567 |
+
|
| 568 |
+
# requirements_textbox and adaptor_class_textbox scroll/height control
|
| 569 |
+
custom_css += """
|
| 570 |
+
#requirements-textbox textarea {
|
| 571 |
+
overflow-y: auto !important;
|
| 572 |
+
resize: vertical;
|
| 573 |
+
height: 480px;
|
| 574 |
+
max-height: 480px;
|
| 575 |
+
}
|
| 576 |
+
#yml-textbox textarea {
|
| 577 |
+
overflow-y: auto !important;
|
| 578 |
+
resize: vertical;
|
| 579 |
+
height: 240px;
|
| 580 |
+
max-height: 240px;
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
/* No border textbox style for file upload status */
|
| 584 |
+
.no-border-textbox textarea {
|
| 585 |
+
border: none !important;
|
| 586 |
+
box-shadow: none !important;
|
| 587 |
+
background: transparent !important;
|
| 588 |
+
padding: 0 !important;
|
| 589 |
+
margin: 0 !important;
|
| 590 |
+
outline: none !important;
|
| 591 |
+
resize: none !important;
|
| 592 |
+
overflow: hidden !important;
|
| 593 |
+
}
|
| 594 |
+
.no-border-textbox .wrap {
|
| 595 |
+
background: transparent !important;
|
| 596 |
+
border: none !important;
|
| 597 |
+
box-shadow: none !important;
|
| 598 |
+
padding: 0 !important;
|
| 599 |
+
margin: 0 !important;
|
| 600 |
+
outline: none !important;
|
| 601 |
+
}
|
| 602 |
+
.no-border-textbox .prose {
|
| 603 |
+
background: transparent !important;
|
| 604 |
+
border: none !important;
|
| 605 |
+
box-shadow: none !important;
|
| 606 |
+
padding: 0 !important;
|
| 607 |
+
margin: 0 !important;
|
| 608 |
+
}
|
| 609 |
+
.no-border-textbox label {
|
| 610 |
+
display: none !important;
|
| 611 |
+
}
|
| 612 |
+
.no-border-textbox .gr-textbox {
|
| 613 |
+
border: none !important;
|
| 614 |
+
box-shadow: none !important;
|
| 615 |
+
background: transparent !important;
|
| 616 |
+
padding: 0 !important;
|
| 617 |
+
margin: 0 !important;
|
| 618 |
+
}
|
| 619 |
+
.no-border-textbox .gr-textbox > div {
|
| 620 |
+
border: none !important;
|
| 621 |
+
box-shadow: none !important;
|
| 622 |
+
background: transparent !important;
|
| 623 |
+
padding: 0 !important;
|
| 624 |
+
margin: 0 !important;
|
| 625 |
+
}
|
| 626 |
+
.no-border-textbox .gr-textbox > div > div {
|
| 627 |
+
border: none !important;
|
| 628 |
+
box-shadow: none !important;
|
| 629 |
+
background: transparent !important;
|
| 630 |
+
padding: 0 !important;
|
| 631 |
+
margin: 0 !important;
|
| 632 |
+
}
|
| 633 |
+
|
| 634 |
+
/* Ensure model name tooltips are hoverable */
|
| 635 |
+
.pretty-leaderboard-table td span[title] {
|
| 636 |
+
pointer-events: auto;
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
/* Tall file upload container to match lines=25 textbox height */
|
| 640 |
+
.tall-file-upload .file-upload-container {
|
| 641 |
+
min-height: 283px !important;
|
| 642 |
+
height: 283px !important;
|
| 643 |
+
}
|
| 644 |
+
.tall-file-upload .gr-file {
|
| 645 |
+
min-height: 283px !important;
|
| 646 |
+
height: 283px !important;
|
| 647 |
+
}
|
| 648 |
+
"""
|
| 649 |
+
|
| 650 |
+
def get_rank_badge(rank: int) -> str:
    """
    Return an HTML badge for a leaderboard rank.

    Ranks 1-3 render as medal emoji with an ordinal tooltip; any other
    rank renders as a plain grey number.
    """
    medals = {
        1: ("1st", "🥇"),
        2: ("2nd", "🥈"),
        3: ("3rd", "🥉"),
    }
    if rank in medals:
        label, emoji = medals[rank]
        return f'<span style="font-size:1.5em;" title="{label}">{emoji}</span>'
    return f'<span style="font-size:1.2em;color:#a1a1aa;font-weight:500;">{rank}</span>'
|
| 662 |
+
|
| 663 |
+
def get_score_gauge(score: float) -> str:
|
| 664 |
+
"""
|
| 665 |
+
Returns HTML for an overall score gauge (progress bar style).
|
| 666 |
+
Robustly normalizes score to 0~100% regardless of input range (0~1 or 0~100).
|
| 667 |
+
"""
|
| 668 |
+
# Handle None/NaN
|
| 669 |
+
try:
|
| 670 |
+
score = float(score)
|
| 671 |
+
except (TypeError, ValueError):
|
| 672 |
+
score = 0.0
|
| 673 |
+
|
| 674 |
+
# Normalize: if score is 0~1, treat as normalized and scale to 0~100
|
| 675 |
+
if score is None or score != score: # NaN check
|
| 676 |
+
percent = 0.0
|
| 677 |
+
display_score = 0.0
|
| 678 |
+
elif score <= 1.0:
|
| 679 |
+
percent = score * 100
|
| 680 |
+
display_score = percent
|
| 681 |
+
else:
|
| 682 |
+
percent = score
|
| 683 |
+
display_score = score
|
| 684 |
+
|
| 685 |
+
# For scores above 95, adjust to 98~100% so the bar appears almost full
|
| 686 |
+
if percent >= 95:
|
| 687 |
+
percent = 98 + (min(percent, 100) - 95) * 0.4 # 95=98%, 100=100%
|
| 688 |
+
# Clip to 0~100
|
| 689 |
+
percent = min(max(percent, 0), 100)
|
| 690 |
+
display_score = min(max(display_score, 0), 100)
|
| 691 |
+
|
| 692 |
+
return f'''
|
| 693 |
+
<div class="score-bar" style="margin: 0.5em 0;">
|
| 694 |
+
<div class="score-bar-track">
|
| 695 |
+
<div class="score-bar-fill" style="width: {percent}%;"></div>
|
| 696 |
+
</div>
|
| 697 |
+
<span class="score-bar-value">{display_score:.3f}</span>
|
| 698 |
+
</div>
|
| 699 |
+
'''
|
| 700 |
+
|
| 701 |
+
from src.display.formatting import get_score_stars
|
| 702 |
+
|
| 703 |
+
def get_leaderboard_table_html(df, key="Category") -> str:
    """
    Render a leaderboard DataFrame as a pretty HTML table.

    Every column of ``df`` is displayed.  Special columns get dedicated
    rendering: Rank (medal badge), Model Name (rank highlight + optional
    link from a hidden "Link" column), Model Type / Type / Think
    (badges), Overall (star rating); everything else goes through
    ``format_leaderboard_cell``.

    Args:
        df: Leaderboard DataFrame.
        key: "Category" or "Language" -- selects the numeric-column
            formatting rules (default: "Category").

    Returns:
        HTML string: the table wrapped in a scrollable container div.
    """
    import pandas as pd
    # FIX: the original imported a nonexistent ``get_output_badge`` from
    # src.display.formatting (only ``get_think_badge`` is defined there),
    # raising ImportError on the first call.  Use get_think_badge instead.
    from src.display.formatting import get_score_stars, get_type_badge, get_model_type_badge, get_think_badge, format_leaderboard_cell, get_display_model_name
    # Build table header
    html = ['<table class="pretty-leaderboard-table">']
    html.append("<thead><tr>")
    for col in df.columns:
        html.append(f"<th>{col}</th>")
    html.append("</tr></thead>")
    html.append("<tbody>")
    for idx, row in df.iterrows():
        html.append("<tr>")
        for col in df.columns:
            cell = row[col]
            # Special cell handling
            if col == "Rank":
                badge = get_rank_badge(cell)
                html.append(f"<td>{badge}</td>")
            elif col == "Model Name":
                # Highlight top 1~3 (gold / silver / bronze text styles).
                rank = row.get("Rank", None)
                highlight_style = ""
                if rank == 1 or rank == "1":
                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2 or rank == "2":
                    highlight_style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3 or rank == "3":
                    highlight_style = "color: #cd7f32; font-weight: bold;"
                else:
                    highlight_style = "color: #fff; font-weight: 600;"
                display_name = get_display_model_name(str(cell))
                # Make the name clickable when a non-empty "Link" value exists.
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name
                html.append(f'<td><span style="{highlight_style}">{clickable_name}</span></td>')
            elif col == "Model Type":
                html.append(f"<td>{get_model_type_badge(row.get('Model Type', ''))}</td>")
            elif col == "Type":
                html.append(f"<td>{get_type_badge(row.get('Type', ''))}</td>")
            elif col == "Think":
                html.append(f"<td>{get_think_badge(row.get('Think', ''))}</td>")
            elif col == "Overall":
                # Star rating; unique_id keeps per-row SVG gradient ids distinct.
                # A missing/None Model Name raises inside the try and falls back
                # to plain text.
                try:
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                html.append(f"<td>{cell_html}</td>")
            else:
                html.append(f"<td>{format_leaderboard_cell(cell, col, key)}</td>")
        html.append("</tr>")
    html.append("</tbody></table>")
    table_html = "\n".join(html)
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{table_html}</div>'
|
src/display/formatting.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from constants import (
|
| 3 |
+
NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY,
|
| 4 |
+
NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
def format_leaderboard_cell(cell, col, key="Category"):
    """
    Format a single leaderboard cell for display.

    Integer columns are rounded to whole numbers, other numeric columns
    are shown with two decimals, and any other column is stringified.
    Missing values (NaN or whitespace-only strings) and unparseable
    numeric cells render as "".

    key: "Category" or "Language" -- selects the column lists from constants.
    """
    if key == "Language":
        numeric_cols, int_cols = NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
    else:
        numeric_cols, int_cols = NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY

    blank_string = isinstance(cell, str) and not cell.strip()
    if pd.isna(cell) or blank_string:
        return ""

    try:
        if col in int_cols:
            return str(int(round(float(cell))))
        if col in numeric_cols:
            return f"{float(cell):.2f}"
        return str(cell)
    except Exception:
        # Numeric column holding a non-numeric value -> render empty.
        return ""
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def styled_error(error):
    """Wrap *error* in a centered red HTML paragraph."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{error}</p>"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def styled_warning(warn):
    """Wrap *warn* in a centered orange HTML paragraph."""
    style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{warn}</p>"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def styled_message(message):
    """Wrap *message* in a centered green HTML paragraph."""
    style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{message}</p>"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def has_no_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with no NaN in *columns*."""
    return ~df[columns].isna().any(axis=1)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def has_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with at least one NaN in *columns*."""
    return ~df[columns].notna().all(axis=1)
|
| 49 |
+
|
| 50 |
+
def get_display_model_name(full_model_name: str) -> str:
    """
    Strip parenthesized suffixes from a model name for display.

    Example: "Model (v1)" -> "Model"
    """
    import re
    paren_suffix = re.compile(r'\s*\(.*?\)')
    return paren_suffix.sub('', full_model_name)
|
| 57 |
+
|
| 58 |
+
def get_score_stars(score, unique_id=None):
    """
    Generate HTML for a 5-star rating visualization.

    Args:
        score (float or int): Overall score, in 0~1 or 0~100 range.
            - If 0~1, it is automatically scaled to 0~100.
            - If None, NaN, unparseable, or negative, treated as 0.
            - Values above 100 are clipped to 100.
        unique_id (optional): Unique identifier mixed into the SVG
            gradient ids so multiple ratings on one page don't collide.

    Returns:
        str: HTML string with five stars, filled in proportion to the
        score, followed by the numeric score (two decimals).
    """
    # FIX: the original docstring promised this normalization but the body
    # never performed it — a None score crashed with TypeError, and a 0~1
    # score always rendered (almost) zero stars.  Mirror get_score_gauge:
    # values <= 1.0 are treated as normalized.
    try:
        score = float(score)
    except (TypeError, ValueError):
        score = 0.0
    if score != score:  # NaN check (NaN is the only value != itself)
        score = 0.0
    elif score <= 1.0:
        score = max(score, 0.0) * 100
    score = min(score, 100.0)

    max_stars = 5
    full_stars = int(score // 20)
    partial = (score % 20) / 20  # 0.0 ~ 0.999
    stars_html = ""
    star_size = 18  # px

    # If unique_id is not provided, use "default"
    uid = str(unique_id) if unique_id is not None else "default"

    def star_svg(fill_ratio, idx):
        # fill_ratio: 0.0 (empty) ~ 1.0 (full); white fill over a gray base.
        grad_id = f"star-grad-{uid}-{idx}"
        return f'''
        <svg width="{star_size}" height="{star_size}" viewBox="0 0 24 24" style="margin-right:0.5px;vertical-align:middle;">
            <defs>
                <linearGradient id="{grad_id}" x1="0" x2="1" y1="0" y2="0">
                    <stop offset="0%" stop-color="#fff"/>
                    <stop offset="{fill_ratio*100:.1f}%" stop-color="#fff"/>
                    <stop offset="{fill_ratio*100:.1f}%" stop-color="#666666"/>
                    <stop offset="100%" stop-color="#666666"/>
                </linearGradient>
            </defs>
            <polygon points="12,2 15,9 22,9.5 17,14.2 18.5,21 12,17.5 5.5,21 7,14.2 2,9.5 9,9"
                fill="url(#{grad_id})" stroke="#888" stroke-width="1"/>
        </svg>
        '''

    # Full stars
    for i in range(full_stars):
        stars_html += star_svg(1.0, i)
    # Partial star (if needed)
    if full_stars < max_stars:
        if partial > 0:
            stars_html += star_svg(partial, full_stars)
            empty_stars = max_stars - full_stars - 1
            start_empty = full_stars + 1
        else:
            empty_stars = max_stars - full_stars
            start_empty = full_stars
    else:
        empty_stars = 0
        start_empty = max_stars
    # Empty stars
    for i in range(start_empty, start_empty + empty_stars):
        stars_html += star_svg(0.0, i)

    # Score text
    score_text = f'<span style="color:#fff;font-size:16px;margin-left:8px;">{score:.2f}</span>'

    return f'''
    <div style="display:flex;align-items:center;gap:4px;">
        {stars_html}
        {score_text}
    </div>
    '''
|
| 128 |
+
|
| 129 |
+
def get_type_badge(type_value):
    """
    Render an Open/Proprietary badge for the "Type" column.

    type_value: e.g. 'Open', 'Proprietary'.  Any unrecognized value
    falls back to the proprietary style.
    """
    label = str(type_value).capitalize()
    normalized = str(type_value).lower()
    badge_class = "badge-type-open" if normalized == "open" else "badge-type-proprietary"
    return f'<span class="badge {badge_class}">{label}</span>'
| 144 |
+
|
| 145 |
+
def get_model_type_badge(model_type):
    """
    Render a Model Type badge (Instruct / Think / Hybrid).

    Unrecognized values fall back to the instruct style.
    """
    label = str(model_type).capitalize()
    class_by_type = {
        "think": "badge-modeltype-think",
        "instruct": "badge-modeltype-instruct",
        "hybrid": "badge-modeltype-hybrid",
    }
    badge_class = class_by_type.get(str(model_type).strip().lower(), "badge-modeltype-instruct")
    return f'<span class="badge {badge_class}">{label}</span>'
|
| 160 |
+
|
| 161 |
+
def get_think_badge(think_type):
    """Render a Think On/Off badge; anything other than "on" uses the off style."""
    label = str(think_type).capitalize()
    suffix = "on" if str(think_type).lower() == "on" else "off"
    return f'<span class="badge badge-think-{suffix}">{label}</span>'
|
| 169 |
+
|
| 170 |
+
import pandas as pd
|
| 171 |
+
|
| 172 |
+
def render_leaderboard_html(df, overall_col="Overall", key="Category"):
    """
    Render a leaderboard DataFrame as an HTML table.

    The ``overall_col`` column is replaced with a star-rating
    visualization, Rank gets medal emoji for the top 3, the model name
    is highlighted/linked/tooltipped, and Type / Model Type / Think
    become badges.  "Comment" and "Link" columns are consumed (tooltip,
    hyperlink) but not rendered as table columns.

    Args:
        df: Leaderboard DataFrame.
        overall_col: Name of the overall-score column (default "Overall").
        key: "Category" or "Language" -- numeric formatting rules passed
            through to ``format_leaderboard_cell``.

    Returns:
        HTML string: the table wrapped in a scrollable container div.
    """

    # Force column order
    desired_order = ["Rank", "Model Name", "Link", "Type", "Model Type", "Think", "Overall"]
    cols = list(df.columns)
    # Remaining columns
    rest = [c for c in cols if c not in desired_order]
    new_cols = []
    for c in desired_order:
        if c in cols:
            new_cols.append(c)
    new_cols += rest
    df = df[new_cols]

    # Columns to hide
    hidden_cols = ["Comment", "Link"]

    # Build table header
    def get_sort_arrow():
        # Arrow buttons removed as requested; kept so header templates
        # below don't need editing if sorting ever comes back.
        return ""

    # Extract sort state (from State or use default).
    # NOTE(review): sort_col / sort_asc are computed but never used below —
    # leftover from the removed sort-button feature.
    sort_col = getattr(df, "_sort_col", None) or (df.columns[0] if len(df.columns) > 0 else None)
    sort_asc = getattr(df, "_sort_asc", None)
    if sort_asc is None:
        sort_asc = True

    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
    for col in df.columns:
        if col in hidden_cols:
            continue
        # Info icon for Model Name, Med. Len. and Med. Resp. Len.
        if col == "Model Name":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == "Med. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of think and response for the model.">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == "Med. Resp. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of the model\'s responses (excluding think).">ⓘ</span>'
                f'{get_sort_arrow()}</th>'
            )
        elif col == overall_col:
            html += f'<th style="min-width: 120px; max-width: 300px; width: 150px;">{col}{get_sort_arrow()}</th>'
        else:
            html += f'<th>{col}{get_sort_arrow()}</th>'
    html += '</tr></thead>\n<tbody>\n'

    # Build table rows
    for _, row in df.iterrows():
        html += '<tr>'
        for col in df.columns:
            if col in hidden_cols:
                continue
            cell = row[col]
            if col == overall_col:
                try:
                    # Use "Model Name" of the row as unique_id (keeps per-row
                    # SVG gradient ids distinct; falls back to text on error).
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                html += f'<td style="min-width: 120px; max-width: 300px; width: 150px;">{cell_html}</td>'
            elif col == "Rank":
                # For 1st, 2nd, and 3rd place, emphasize with medal emoji and color
                medal = ""
                style = "color: #fff; font-weight: 600;"
                if cell == 1 or cell == "1":
                    medal = "🥇"
                    style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif cell == 2 or cell == "2":
                    medal = "🥈"
                    style = "color: #b0b0b0; font-weight: bold;"
                elif cell == 3 or cell == "3":
                    medal = "🥉"
                    style = "color: #cd7f32; font-weight: bold;"
                html += f'<td><span style="{style}">{medal if medal else cell}</span></td>'
            elif col in ["Model Name"]:
                # Only highlight top 1~3, do not apply badge
                rank = row.get("Rank", None)
                highlight_style = ""
                if rank == 1 or rank == "1":
                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2 or rank == "2":
                    highlight_style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3 or rank == "3":
                    highlight_style = "color: #cd7f32; font-weight: bold;"
                else:
                    highlight_style = "color: #fff; font-weight: 600;"
                display_name = get_display_model_name(str(cell))

                # --- Start of new logic for tooltip ---
                comment_value = ""
                # Check if 'Comment' column exists and the value is not NaN/empty
                if "Comment" in row and pd.notna(row["Comment"]) and str(row["Comment"]).strip() != "":
                    comment_value = str(row["Comment"]).strip()

                # NOTE(review): comment_value is interpolated into the title
                # attribute unescaped — a comment containing `"` would break
                # the attribute (and could inject markup); consider html.escape.
                title_attribute = f' title="{comment_value}"' if comment_value else ""
                # --- End of new logic for tooltip ---

                # Link logic
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name

                html += f'<td><span style="{highlight_style}"{title_attribute}>{clickable_name}</span></td>'
            elif col == "Type":
                html += f'<td>{get_type_badge(row.get("Type", ""))}</td>'
            elif col == "Model Type":
                html += f'<td>{get_model_type_badge(row.get("Model Type", ""))}</td>'
            elif col == "Think":
                html += f'<td>{get_think_badge(row.get("Think", ""))}</td>'
            else:
                html += f'<td>{format_leaderboard_cell(cell, col, key)}</td>'
        html += '</tr>\n'
    html += '</tbody></table>'
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{html}</div>'
|
src/display/utils.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from enum import Enum
|
| 3 |
+
|
| 4 |
+
def fields(raw_class):
    """Return the values of a class's non-dunder attributes, in definition order."""
    collected = []
    for attr_name, value in raw_class.__dict__.items():
        if not (attr_name.startswith("__") or attr_name.endswith("__")):
            collected.append(value)
    return collected
|
| 6 |
+
|
| 7 |
+
## All the model information that we might need
|
| 8 |
+
@dataclass
class ModelDetails:
    """Display metadata for a model; used as the payload of Precision members."""
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class Precision(Enum):
    """Supported numeric precisions for submitted models."""
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    fp8 = ModelDetails("fp8")
    int4 = ModelDetails("int4")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """
        Map a precision string (e.g. "torch.float16", "fp8") to a member.

        Unrecognized strings map to Precision.Unknown.
        """
        # FIX: the original lacked @staticmethod.  Class-level calls
        # (Precision.from_str("fp8")) happened to work, but calling it on a
        # member bound the member as `precision` and silently returned
        # Unknown.  @staticmethod makes both call forms behave identically.
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        if precision == "fp8":
            return Precision.fp8
        if precision == "int4":
            return Precision.int4
        return Precision.Unknown
|
src/envs.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

OWNER = "SamsungResearch"
# ----------------------------------

# Space that hosts the leaderboard itself (restarted on download failures).
REPO_ID = f"{OWNER}/TRUEBench"
# Dataset repo holding pending evaluation requests.
QUEUE_REPO = f"{OWNER}/SR_Leaderboard_Requests"
# Dataset repo holding submissions that were rejected at validation time.
FAILED_QUEUE_REPO = f"{OWNER}/SR_Leaderboard_Failed_Requests"
# Dataset repo holding finished evaluation results.
RESULTS_REPO = f"{OWNER}/SR_Leaderboard_Results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches (mirrors of the repos above, synced via snapshot_download).
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
FAILED_EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "failed-eval-queue")
# NOTE(review): the "-bk" variants are presumably used by a backend worker;
# they are not referenced in this part of the codebase — confirm before removing.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
FAILED_EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "failed-eval-queue-bk")

# Shared hub client authenticated with the org token.
API = HfApi(token=TOKEN)
src/submission/check_validity.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
from datetime import datetime, timedelta, timezone
|
| 6 |
+
|
| 7 |
+
import huggingface_hub
|
| 8 |
+
from huggingface_hub import ModelCard
|
| 9 |
+
from huggingface_hub.hf_api import ModelInfo
|
| 10 |
+
from transformers import AutoConfig
|
| 11 |
+
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
| 12 |
+
|
| 13 |
+
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Validate that a repo's model card exists, declares a license, and has content.

    Returns a ``(passed, message)`` pair; ``message`` is an empty string on success
    and a user-facing explanation otherwise.
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # A license may be declared either via the `license` metadata field or as a
    # `license_name`/`license_link` pair.
    has_license_pair = "license_name" in card.data and "license_link" in card.data
    if card.data.license is None and not has_license_pair:
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # Reject near-empty model cards.
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
| 33 |
+
|
| 34 |
+
def get_model_size(model_info: ModelInfo, precision: str):
    """Return the model's parameter count in billions, rounded to 3 decimals.

    Reads the safetensors metadata from the hub `model_info`; GPTQ-quantised
    checkpoints (by precision string or repo name) are scaled by 8.
    """
    try:
        billions = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError):
        # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
        return 0

    is_gptq = precision == "GPTQ" or "gptq" in model_info.modelId.lower()
    if is_gptq:
        return 8 * billions
    return billions
| 44 |
+
|
| 45 |
+
def get_model_arch(model_info: ModelInfo):
    """Return the `architectures` entry of the hub config, or "Unknown" if absent."""
    config = model_info.config
    return config.get("architectures", "Unknown")
| 48 |
+
|
| 49 |
+
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], defaultdict]:
    """Scan the local request queue and gather previously submitted models.

    Walks `requested_models_dir` exactly two levels deep (layout:
    ``<org>/<benchmark>_<model>/<timestamp>.json``) and parses every request
    JSON found there.

    FIX: the return annotation previously claimed ``set[str]`` although the
    function has always returned a 2-tuple; it now matches the actual value.

    Returns:
        A tuple of
        - the set of ``"<benchmark>_<org>/<model>"`` identifiers already submitted,
        - a mapping organisation -> list of its submission records, each a dict
          with ``benchmark``, ``model`` and ``submitted_time`` keys.
    """
    depth = 2
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)

                file_names.append(f"{info['benchmark']}_{info['model']}")

                # Requests without an org prefix or a timestamp cannot be
                # attributed to an organisation for rate limiting.
                if info["model"].count("/") == 0 or "submitted_time" not in info:
                    continue
                # Take only the first path segment so ids with extra slashes
                # no longer raise ValueError (was a strict 2-way unpack).
                organisation = info["model"].split("/")[0]
                users_to_submission_dates[organisation].append(
                    {"benchmark": info["benchmark"], "model": info["model"], "submitted_time": info["submitted_time"]}
                )

    return set(file_names), users_to_submission_dates
src/submission/submit.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import yaml
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
from datetime import datetime, timezone, timedelta
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 8 |
+
from src.envs import API, EVAL_REQUESTS_PATH, FAILED_EVAL_REQUESTS_PATH, TOKEN, FAILED_QUEUE_REPO, QUEUE_REPO, REPO_ID
|
| 9 |
+
from src.submission.check_validity import (
|
| 10 |
+
already_submitted_models,
|
| 11 |
+
check_model_card,
|
| 12 |
+
get_model_size
|
| 13 |
+
)
|
| 14 |
+
import gradio as gr
|
| 15 |
+
from utils import download_with_restart
|
| 16 |
+
from huggingface_hub import snapshot_download
|
| 17 |
+
|
| 18 |
+
REQUESTED_MODELS = None
|
| 19 |
+
USERS_TO_SUBMISSION_DATES = None
|
| 20 |
+
|
| 21 |
+
def restart_space():
    """Restart this Space via the hub API (used to recover from failed snapshot downloads)."""
    API.restart_space(repo_id=REPO_ID)
| 23 |
+
|
| 24 |
+
def add_new_eval_option(
    contact_email: str,
    model: str,
    model_type: str,
    think_type: str,
    precision: str,
    response_prefix: str,
    requirements: str,
    user_state: str,
    organization_list: list,
    yml_textbox: str,
    upbox,
):
    """Validate a leaderboard submission and enqueue it on the hub.

    Runs every validation check (email, submitter rights, rate limits, model
    existence/card/license, think prefix, YAML config) while recording only the
    FIRST failure in ``ERROR_MESSAGE``. A request JSON is then written either to
    the pending queue (``QUEUE_REPO``) or, when validation failed, to the failed
    queue (``FAILED_QUEUE_REPO``), and a styled HTML message is returned for the
    Gradio UI.

    Fixes vs. the previous revision:
    - guard the allowed-keys check when the parsed config is not a dict
      (previously ``set(None.keys())`` raised AttributeError and the failed
      request was never recorded);
    - the success/failure ``eval_entry`` dicts are merged (identical except for
      ``status`` and the trailing ``error_message``).
    """
    ERROR_MESSAGE = None

    # Validate email format
    email_regex = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
    if not re.match(email_regex, contact_email):
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "Please provide a valid email address."

    # Synchronize: Just before submission, copy the latest QUEUE_REPO to EVAL_REQUESTS_PATH
    download_with_restart(
        snapshot_download,
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )

    # Synchronize: Just before submission, copy the latest FAILED_QUEUE_REPO to FAILED_EVAL_REQUESTS_PATH
    download_with_restart(
        snapshot_download,
        repo_id=FAILED_QUEUE_REPO,
        local_dir=FAILED_EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space
    )

    # NOTE: these are locals (the module-level globals of the same name are not
    # updated) — the freshly synced queue is only needed within this call.
    REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    # Split "<org>/<model>" into its parts; bare names have an empty org.
    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]
    # Submission timestamps are recorded in KST (UTC+9).
    KST = timezone(timedelta(hours=9))
    current_time = datetime.now(KST).strftime("%Y-%m-%dT%H:%M:%S %z")

    # Remove space in benchmark name
    benchmark = "TRUEBench"

    # Check submitter qualification
    if user_name != user_state and user_name not in organization_list:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The submitter does not have submission rights for this model."

    # Does the organization submit more than three times in a day?
    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
    submission_cnt = 0
    for i in range(len(submission_times)):
        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
        if hours_diff <= 24:
            submission_cnt += 1
    if submission_cnt >= 3:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The organization already submitted three times for this benchmark today."

    # Does the model actually exist?
    revision = "main"

    # Is the model info correctly filled?
    model_info = None
    model_size = "Unknown"
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
        model_size = get_model_size(model_info=model_info, precision=precision)
    except Exception:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "Could not get your model information. Please fill it up properly."

    # Were the model card and license filled?
    license = "Unknown"
    if model_info is not None:
        try:
            license = model_info.cardData["license"]
        except Exception:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please select a license for your model."

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = error_msg

    # Response prefix check: a think-mode model must declare the prefix that
    # precedes its final answer; for non-think models the prefix is cleared.
    if think_type == "On":
        if response_prefix == "":
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "It is required to fill in the response prefix when 'Think' is 'On'."
    else:
        response_prefix = ""

    # Handle YAML config input (file or textbox)
    config_dict = None

    # Case 1: File uploaded
    if upbox is not None and getattr(upbox, "name", ""):
        file_name = upbox.name
        if not file_name.lower().endswith(".yaml") and not file_name.lower().endswith(".yml"):
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please submit a .yaml or .yml file."
        try:
            with open(file_name, 'r', encoding='utf-8') as f:
                config_dict = yaml.safe_load(f)
        except yaml.YAMLError:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "The file is not a valid YAML format."
        except Exception as e:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = f"An error occurred while reading the file. {e}"
        if config_dict is None:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "The YAML file is empty or invalid."
    else:
        # Case 2: No file uploaded
        if not yml_textbox or not yml_textbox.strip():
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please fill in the configuration box or submit a YAML file."
        try:
            config_dict = yaml.safe_load(yml_textbox)
        except yaml.YAMLError:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please provide a valid configuration."
        if config_dict is None:
            if ERROR_MESSAGE is None:
                ERROR_MESSAGE = "Please provide a valid configuration."

    # Restrict config keys
    allowed_keys = {"llm_serve_args", "sampling_params", "extra_body"}
    if not isinstance(config_dict, dict):
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "The configuration must be a YAML dictionary at the top level."
    # FIX: only inspect keys when we actually have a dict; previously a None /
    # non-dict config crashed here with AttributeError before the failed
    # request could be written.
    extra_keys = set(config_dict.keys()) - allowed_keys if isinstance(config_dict, dict) else set()
    if extra_keys:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = f"Only the following keys are allowed in the configuration: llm_serve_args, sampling_params, extra_body. Found invalid keys: {', '.join(sorted(extra_keys))}."

    configs = json.dumps(config_dict, indent=4, ensure_ascii=False)

    # Check for duplicate submission of this exact model.
    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
    submission_cnt = 0
    submission_total_cnt = 0
    for i in range(len(submission_times)):
        submission_total_cnt += 1
        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
        if hours_diff <= 24:
            submission_cnt += 1
    if submission_cnt >= 1:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "This model has been already submitted within 24 hours."
    if submission_total_cnt >= 3:
        if ERROR_MESSAGE is None:
            ERROR_MESSAGE = "This model has been already submitted three times for this benchmark."

    print("Creating eval file")
    # Valid requests go to the pending queue; invalid ones are archived in the
    # failed queue together with the first error message.
    if ERROR_MESSAGE is None:
        OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}/{benchmark}_{model_path}"
    else:
        OUT_DIR = f"{FAILED_EVAL_REQUESTS_PATH}/{user_name}/{benchmark}_{model_path}"
    os.makedirs(OUT_DIR, exist_ok=True)
    current_time_replaced = current_time.replace("-", "").replace(":", "").replace("T", "_").split()[0]
    out_path = f"{OUT_DIR}/{current_time_replaced}.json"

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "benchmark": benchmark,
        "contact_email": contact_email,
        "model": model,
        "type": "open",
        "model_type": model_type,
        "think_type": think_type,
        "precision": precision,
        "response_prefix": response_prefix,
        "requirements": requirements,
        "status": "PENDING" if ERROR_MESSAGE is None else "Failed",
        "submitted_time": current_time,
        "likes": getattr(model_info, "likes", -1),
        "params": model_size,
        "license": license,
        "private": False,
        "configs": configs,
    }
    if ERROR_MESSAGE is not None:
        eval_entry["error_message"] = ERROR_MESSAGE

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    if ERROR_MESSAGE is None:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    else:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("failed-eval-queue/")[1],
            repo_id=FAILED_QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to failed eval queue",
        )

    # Remove the local file
    os.remove(out_path)

    if ERROR_MESSAGE is None:
        return styled_message(
            "Your request has been submitted to the evaluation queue!"
        )
    else:
        return styled_error(
            ERROR_MESSAGE
        )
|
ui.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from src.display.formatting import render_leaderboard_html, get_display_model_name
|
| 3 |
+
from src.data_utils import get_length_category_list, get_length_category_df
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
def render_length_category_html(df, med_len_map=None):
    """
    Render the length category table with Model Name colored by Rank (gold/silver/bronze), no Rank column.
    Model Name cell includes Think, Model Type badges. Overall column is always right after Model Name.
    Optionally, inserts Med. Len. column after Overall if med_len_map is provided.

    Args:
        df: DataFrame with at least "Model Name" and "Overall" columns; may also
            carry "Type", "Model Type", "Think", "Comment", "Link" and length
            statistics columns.
        med_len_map: optional mapping of model name -> median length, merged in
            as a "Med. Len." column.

    Returns:
        An HTML string: the table wrapped in a scrollable container div.
    """
    if df is None or df.empty:
        return "<div>No data available.</div>"

    # Compute Rank based on Overall (descending)
    df = df.copy()
    # 1. Sort so that empty strings come to the top first
    df = df.sort_values("Overall", key=lambda x: (x == "").astype(int))
    # 2. Then sort the actual values in descending order (empty strings are already at the top, so no effect)
    # mergesort is stable, so the ordering from step 1 is preserved among ties.
    df = df.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
    df["Rank_Internal"] = df["Overall"].rank(method="min", ascending=False).astype(int)

    # Ensure Think and Model Type columns exist for badge rendering
    # Rename columns to ensure exact match
    if "Type" not in df.columns:
        df["Type"] = "unknown"
    if "Model Type" not in df.columns:
        df["Model Type"] = "unknown"
    if "Think" not in df.columns:
        df["Think"] = "unknown"

    # Optionally add Med. Len. column
    if med_len_map is not None:
        df["Med. Len."] = df["Model Name"].map(med_len_map)

    # Candidate display columns; internal/metadata columns are dropped.
    # (Model Type and Think stay in base_cols and are therefore rendered as
    # regular columns at the end of the table.)
    base_cols = [col for col in df.columns if col not in ["Rank_Internal", "Comment", "Group", "Link"]]

    # Find the dynamic category column (e.g., "Short", "Long", etc.)
    from src.data_utils import get_length_category_list
    category_cols = [col for col in get_length_category_list() if col in base_cols]
    category_col = category_cols[0] if category_cols else None

    # Build display_cols: Model Name, Overall, Med. Len., {Category}, (rest)
    display_cols = []
    if "Model Name" in base_cols:
        display_cols.append("Model Name")
    if "Overall" in base_cols:
        display_cols.append("Overall")
    if "Med. Len." in base_cols:
        display_cols.append("Med. Len.")
    if "Med. Resp. Len." in base_cols:
        display_cols.append("Med. Resp. Len.")
    if category_col:
        display_cols.append(category_col)
    for col in base_cols:
        if col not in display_cols:
            display_cols.append(col)

    # Build HTML table header row.
    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
    for col in display_cols:
        # Info icon for Model Name, Med. Len. and Med. Resp. Len.
        if col == "Model Name":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Hovering the mouse displays additional details, and clicking the model name navigates to the corresponding page.">ⓘ</span>'
                '</th>'
            )
        elif col == "Med. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of think and response for the model.">ⓘ</span>'
                '</th>'
            )
        elif col == "Med. Resp. Len.":
            html += (
                f'<th>{col}'
                '<span class="info-icon" title="Median token length of the model\'s responses (excluding think).">ⓘ</span>'
                '</th>'
            )
        else:
            html += f'<th>{col}</th>'
    html += '</tr></thead>\n<tbody>\n'

    # --- Define number formatting function ---
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    def format_leaderboard_cell(cell, col):
        # Formats one cell: ints for NUMERIC_INT_COLS_CATEGORY, two decimals
        # for NUMERIC_COLS_CATEGORY, str() otherwise; falls back to str() on
        # any conversion error.
        # Handle NaN/empty strings
        if pd.isna(cell) or (isinstance(cell, str) and cell.strip() == ""):
            return cell
        try:
            if col in NUMERIC_INT_COLS_CATEGORY:
                # Integer (rounded)
                return str(int(round(float(cell))))
            elif col in NUMERIC_COLS_CATEGORY:
                # Two decimal places
                return "{:.2f}".format(float(cell))
            else:
                return str(cell)
        except Exception:
            return str(cell)

    for idx, row in df.iterrows():
        html += '<tr>'
        for col in display_cols:
            cell = row[col]
            if col == "Model Name":
                # Gold/Silver/Bronze for 1/2/3
                rank = row["Rank_Internal"]
                if rank == 1:
                    style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
                elif rank == 2:
                    style = "color: #b0b0b0; font-weight: bold;"
                elif rank == 3:
                    style = "color: #cd7f32; font-weight: bold;"
                else:
                    style = "color: #fff; font-weight: 600;"

                # Badge HTML
                model_type = row["Model Type"] if "Model Type" in row else "unknown"
                think_type = row["Think"] if "Think" in row else "unknown"
                type_value = row["Type"] if "Type" in row else "unknown"
                from src.display.formatting import get_type_badge, get_think_badge, get_model_type_badge
                badge_html = (
                    get_type_badge(type_value)
                    + get_model_type_badge(model_type)
                    + get_think_badge(think_type)
                )

                display_name = get_display_model_name(str(cell))

                # --- Start of new logic for tooltip ---
                comment_value = ""
                # Check if 'Comment' column exists and the value is not NaN/empty
                if "Comment" in row and pd.notna(row["Comment"]) and str(row["Comment"]).strip() != "":
                    comment_value = str(row["Comment"]).strip()
                title_attribute = f' title="{comment_value}"' if comment_value else ""
                # --- End of new logic for tooltip ---

                # Link logic: wrap the name in an anchor only when a non-empty
                # Link value is present for this row.
                link_value = row["Link"] if "Link" in row and pd.notna(row["Link"]) and str(row["Link"]).strip() != "" else None
                if link_value:
                    clickable_name = f'<a href="{link_value}" target="_blank" style="color:inherit;">{display_name}</a>'
                else:
                    clickable_name = display_name

                html += f'<td><span style="{style}"{title_attribute}>{clickable_name}</span>{badge_html}</td>'
            elif col == "Overall":
                # Show stars
                from src.display.formatting import get_score_stars
                try:
                    # unique_id feeds the star widget; sanitize characters that
                    # would be invalid in an HTML id.
                    unique_id = row.get("Model Name", None)
                    unique_id = unique_id.replace(" ", "_").replace("-", "_").replace("(", "_").replace(")", "_")
                    cell_html = get_score_stars(float(cell), unique_id=unique_id)
                except Exception:
                    cell_html = str(cell)
                html += f'<td>{cell_html}</td>'
            else:
                html += f'<td>{format_leaderboard_cell(cell, col)}</td>'
        html += '</tr>\n'
    html += '</tbody></table>'
    # Wrap in scrollable div for sticky header
    return f'<div class="leaderboard-table-container" style="max-height:900px;overflow-y:auto;">{html}</div>'
| 166 |
+
|
| 167 |
+
def render_length_category_table(leaderboard_df=None):
    """
    Renders a Category selector and a table showing length stats for the selected category.
    Uses Overall from leaderboard_df for ranking, coloring, and stars.

    Args:
        leaderboard_df: optional leaderboard DataFrame used to merge Overall,
            per-category scores, and badge columns into the length table.

    Returns:
        Dict with the created gradio components:
        {"category_selector": Dropdown, "table_html": HTML}.
    """
    import gradio as gr

    categories = get_length_category_list()
    default_category = categories[0] if categories else ""

    # Merge Overall from leaderboard_df
    def get_merged_df(selected_category):
        # Joins the per-category length stats with leaderboard columns keyed
        # by "Model Name"; rows missing either score are dropped.
        df_cat = get_length_category_df(selected_category) if selected_category else None
        if leaderboard_df is not None and df_cat is not None:
            df_merged = df_cat.copy()
            # Use Overall and {Category} from leaderboard_df
            overall_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Overall"]))
            category_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df[selected_category]))
            df_merged["Overall"] = df_merged["Model Name"].map(overall_map)
            df_merged[selected_category] = df_merged["Model Name"].map(category_map)
            # Also map Model Type and Think
            if "Type" in leaderboard_df.columns:
                type_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Type"]))
                df_merged["Type"] = df_merged["Model Name"].map(type_map)
            if "Model Type" in leaderboard_df.columns:
                model_type_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Model Type"]))
                df_merged["Model Type"] = df_merged["Model Name"].map(model_type_map)
            if "Think" in leaderboard_df.columns:
                think_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Think"]))
                df_merged["Think"] = df_merged["Model Name"].map(think_map)
            # Remove rows with missing Overall or {Category}
            df_merged = df_merged[df_merged["Overall"].notna() & df_merged[selected_category].notna()]
            return df_merged
        return df_cat

    df = get_merged_df(default_category)

    # Prepare med_len_map if possible
    # NOTE: captured once here; the Dropdown change handler below reuses this
    # same mapping for every subsequently selected category.
    med_len_map = None
    if leaderboard_df is not None and "Med. Len." in leaderboard_df.columns:
        med_len_map = dict(zip(leaderboard_df["Model Name"], leaderboard_df["Med. Len."]))

    with gr.Column():
        category_selector = gr.Dropdown(
            choices=categories,
            value=default_category,
            label="Select Category for Length Table",
            interactive=True,
        )

        table_html = gr.HTML(
            value=render_length_category_html(df, med_len_map=med_len_map) if df is not None else "<div>No data available.</div>",
            elem_id="length-category-table"
        )

    def update_table(selected_category):
        # Re-render the HTML table when the category dropdown changes.
        df = get_merged_df(selected_category)
        html = render_length_category_html(df, med_len_map=med_len_map)
        return html

    category_selector.change(
        fn=update_table,
        inputs=[category_selector],
        outputs=[table_html]
    )

    return {
        "category_selector": category_selector,
        "table_html": table_html,
    }
| 236 |
+
|
| 237 |
+
def create_leaderboard_tab(df, key):
|
| 238 |
+
"""
|
| 239 |
+
df: DataFrame to display
|
| 240 |
+
key: "Category" or "Language"
|
| 241 |
+
column_selector_value: default columns to select
|
| 242 |
+
"""
|
| 243 |
+
# Ensure df has Model, Model Type, Think columns for filtering
|
| 244 |
+
# No need to create Model column, only use Model Name
|
| 245 |
+
# Always ensure "Overall" column exists
|
| 246 |
+
if "Overall" not in df.columns:
|
| 247 |
+
return # Or handle error appropriately
|
| 248 |
+
# No additional mapping needed since DataFrame already has columns
|
| 249 |
+
|
| 250 |
+
df_state = gr.State(df)
|
| 251 |
+
|
| 252 |
+
# Create DataFrame including badge information (for upper table)
|
| 253 |
+
df_badge = df.copy()
|
| 254 |
+
# If Overall values are in the range 0~1, convert to 0~100
|
| 255 |
+
if "Overall" in df_badge.columns and df_badge["Overall"].max() <= 1.0:
|
| 256 |
+
df_badge["Overall"] = df_badge["Overall"] * 100
|
| 257 |
+
# Remove Group column (only in display)
|
| 258 |
+
for col_to_drop in ["Group"]:
|
| 259 |
+
if col_to_drop in df_badge.columns:
|
| 260 |
+
df_badge = df_badge.drop(columns=[col_to_drop])
|
| 261 |
+
# Handle error if "Overall" column does not exist
|
| 262 |
+
if "Overall" not in df_badge.columns:
|
| 263 |
+
return # Or handle error appropriately
|
| 264 |
+
# Always sort by "Overall"
|
| 265 |
+
# 1. Sort so that empty strings come to the top first
|
| 266 |
+
df_badge = df_badge.sort_values("Overall", key=lambda x: (x == "").astype(int))
|
| 267 |
+
# 2. Then sort the actual values in descending order (empty strings are already at the top, so no effect)
|
| 268 |
+
df_badge = df_badge.sort_values("Overall", ascending=False, kind="mergesort").reset_index(drop=True)
|
| 269 |
+
df_badge["Rank"] = df_badge.index + 1
|
| 270 |
+
# Reorder "Rank" column to be right after "Model Name"
|
| 271 |
+
cols = df_badge.columns.tolist()
|
| 272 |
+
if "Model Name" in cols and "Rank" in cols:
|
| 273 |
+
model_name_idx = cols.index("Model Name")
|
| 274 |
+
cols.remove("Rank")
|
| 275 |
+
cols.insert(model_name_idx + 1, "Rank")
|
| 276 |
+
df_badge = df_badge[cols]
|
| 277 |
+
|
| 278 |
+
with gr.Row():
|
| 279 |
+
# Type Selector (Open/Proprietary)
|
| 280 |
+
type_choices = ["Open", "Proprietary"]
|
| 281 |
+
type_selector = gr.CheckboxGroup(
|
| 282 |
+
choices=type_choices,
|
| 283 |
+
value=type_choices,
|
| 284 |
+
label="Select Type (Open/Proprietary)"
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
# Model Type Selector (Instruct/Think/Hybrid)
|
| 288 |
+
model_type_choices = ["Instruct", "Think", "Hybrid"]
|
| 289 |
+
model_type_selector = gr.CheckboxGroup(
|
| 290 |
+
choices=model_type_choices,
|
| 291 |
+
value=model_type_choices,
|
| 292 |
+
label="Select Model Type (Instruct/Think/Hybrid)"
|
| 293 |
+
)
|
| 294 |
+
# Think Selector (On/Off)
|
| 295 |
+
think_choices = ["On", "Off"]
|
| 296 |
+
think_selector = gr.CheckboxGroup(
|
| 297 |
+
choices=think_choices,
|
| 298 |
+
value=think_choices,
|
| 299 |
+
label="Select Think Mode (On/Off)"
|
| 300 |
+
)
|
| 301 |
+
# Add Gradio component for selecting sort criteria (always descending)
|
| 302 |
+
# For language leaderboard, dynamically extract language columns + Avg. Len., Parameter Size (B)
|
| 303 |
+
|
| 304 |
+
if key == "Language":
|
| 305 |
+
import re
|
| 306 |
+
language_columns = [col for col in df_badge.columns if re.fullmatch(r"[A-Z]{2}", col) or col == "VI"]
|
| 307 |
+
available_sort_columns = ["Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)"] + language_columns
|
| 308 |
+
else:
|
| 309 |
+
category_columns = [
|
| 310 |
+
"Overall", "Med. Len.", "Med. Resp. Len.", "Parameter Size (B)", "Content Generation", "Editing", "Data Analysis", "Reasoning",
|
| 311 |
+
"Hallucination", "Safety", "Repetition", "Summarization", "Translation", "Multi-Turn"
|
| 312 |
+
]
|
| 313 |
+
available_sort_columns = [col for col in category_columns if col in df_badge.columns]
|
| 314 |
+
|
| 315 |
+
sort_col_dropdown = gr.Dropdown(
|
| 316 |
+
choices=available_sort_columns,
|
| 317 |
+
value="Overall",
|
| 318 |
+
label="Sort by",
|
| 319 |
+
interactive=True,
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
# Sorting function
|
| 323 |
+
leaderboard_html = render_leaderboard_html(df_badge.round(3), overall_col="Overall", key=key)
|
| 324 |
+
leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")
|
| 325 |
+
|
| 326 |
+
# Filtering logic for new selectors
|
| 327 |
+
def unified_filter(types, model_types, thinks, df, sort_col):
|
| 328 |
+
# Apply search filter first
|
| 329 |
+
filtered = df.copy()
|
| 330 |
+
if "Type" in filtered.columns and (not types or len(types) == 0):
|
| 331 |
+
types = filtered["Type"].unique().tolist()
|
| 332 |
+
if "Model Type" in filtered.columns and (not model_types or len(model_types) == 0):
|
| 333 |
+
model_types = filtered["Model Type"].unique().tolist()
|
| 334 |
+
if "Think" in filtered.columns and (not thinks or len(thinks) == 0):
|
| 335 |
+
thinks = filtered["Think"].unique().tolist()
|
| 336 |
+
# Defensive: always ensure "Overall" exists
|
| 337 |
+
if "Type" in filtered.columns:
|
| 338 |
+
filtered["Type"] = filtered["Type"].fillna("").astype(str)
|
| 339 |
+
types_norm = [v.lower().strip() for v in types]
|
| 340 |
+
filtered = filtered[filtered["Type"].str.lower().str.strip().isin(types_norm)]
|
| 341 |
+
if "Model Type" in filtered.columns:
|
| 342 |
+
filtered["Model Type"] = filtered["Model Type"].fillna("").astype(str)
|
| 343 |
+
model_types_norm = [v.lower().strip() for v in model_types]
|
| 344 |
+
filtered = filtered[filtered["Model Type"].str.lower().str.strip().isin(model_types_norm)]
|
| 345 |
+
if "Think" in filtered.columns:
|
| 346 |
+
filtered["Think"] = filtered["Think"].fillna("").astype(str)
|
| 347 |
+
thinks_norm = [v.lower().strip() for v in thinks]
|
| 348 |
+
filtered = filtered[filtered["Think"].str.lower().str.strip().isin(thinks_norm)]
|
| 349 |
+
if "Overall" not in filtered.columns:
|
| 350 |
+
html = "<div style='color:red'>No 'Overall' column found in data. Please check your input data.</div>"
|
| 351 |
+
return html, sort_col
|
| 352 |
+
# Always sort in descending order
|
| 353 |
+
# To make empty strings come to the top, replace them with np.inf and sort descending
|
| 354 |
+
sort_col_for_sort = filtered[sort_col].replace('', np.inf).astype(float)
|
| 355 |
+
filtered = filtered.assign(sort_col_tmp=sort_col_for_sort)
|
| 356 |
+
filtered = filtered.sort_values('sort_col_tmp', ascending=False, kind="mergesort").reset_index(drop=True)
|
| 357 |
+
filtered = filtered.drop(columns=['sort_col_tmp'])
|
| 358 |
+
# Add "Rank" column and reorder it to be right after "Model Name"
|
| 359 |
+
filtered["Rank"] = filtered.index + 1
|
| 360 |
+
cols = filtered.columns.tolist()
|
| 361 |
+
if "Model Name" in cols and "Rank" in cols:
|
| 362 |
+
model_name_idx = cols.index("Model Name")
|
| 363 |
+
cols.remove("Rank")
|
| 364 |
+
cols.insert(model_name_idx + 1, "Rank")
|
| 365 |
+
filtered = filtered[cols]
|
| 366 |
+
# Always remove Group column
|
| 367 |
+
for col_to_drop in ["Group"]:
|
| 368 |
+
if col_to_drop in filtered.columns:
|
| 369 |
+
filtered = filtered.drop(columns=[col_to_drop])
|
| 370 |
+
filtered._sort_col = sort_col
|
| 371 |
+
# Extract top-5 models (currently sorted in descending order)
|
| 372 |
+
top5_models = []
|
| 373 |
+
if sort_col in filtered.columns and "Model Name" in filtered.columns:
|
| 374 |
+
# 1. Sort so that empty strings come to the top first
|
| 375 |
+
sort_col_for_sort = filtered[sort_col].replace('', np.inf).astype(float)
|
| 376 |
+
filtered_df_sorted = filtered.assign(sort_col_tmp=sort_col_for_sort)
|
| 377 |
+
filtered_df_sorted = filtered_df_sorted.sort_values('sort_col_tmp', ascending=False, kind="mergesort").reset_index(drop=True)
|
| 378 |
+
top5_models = filtered_df_sorted["Model Name"].tolist()[:5]
|
| 379 |
+
return render_leaderboard_html(filtered, overall_col="Overall", key=key), sort_col, top5_models
|
| 380 |
+
|
| 381 |
+
# Download CSV function
|
| 382 |
+
def dataframe_to_csv(data):
|
| 383 |
+
import pandas as pd
|
| 384 |
+
# Convert if data is not a DataFrame
|
| 385 |
+
if isinstance(data, pd.DataFrame):
|
| 386 |
+
df = data.copy() # Create a copy to avoid modifying the original DataFrame in memory
|
| 387 |
+
else:
|
| 388 |
+
df = pd.DataFrame(data)
|
| 389 |
+
|
| 390 |
+
# Apply get_display_model_name to the "Model Name" column if it exists
|
| 391 |
+
if "Model Name" in df.columns:
|
| 392 |
+
df["Model Name"] = df["Model Name"].apply(get_display_model_name)
|
| 393 |
+
|
| 394 |
+
csv_path = f"truebench_{key}.csv"
|
| 395 |
+
df.to_csv(csv_path, index=False)
|
| 396 |
+
return csv_path
|
| 397 |
+
|
| 398 |
+
# Add DownloadButton (using CSS class)
|
| 399 |
+
with gr.Row():
|
| 400 |
+
with gr.Column(scale=1):
|
| 401 |
+
pass # Empty space
|
| 402 |
+
with gr.Column(scale=0):
|
| 403 |
+
download_btn = gr.DownloadButton(
|
| 404 |
+
label="📥 Download to CSV",
|
| 405 |
+
value=dataframe_to_csv,
|
| 406 |
+
inputs=[df_state],
|
| 407 |
+
visible=True,
|
| 408 |
+
elem_classes=["custom-download-btn"]
|
| 409 |
+
)
|
| 410 |
+
|
| 411 |
+
# Add custom CSS
|
| 412 |
+
custom_css = """
|
| 413 |
+
<style>
|
| 414 |
+
.custom-download-btn >>> a {
|
| 415 |
+
background: #e3e6f3 !important;
|
| 416 |
+
color: #222 !important;
|
| 417 |
+
border: 1px solid rgba(0, 0, 0, 0.1) !important;
|
| 418 |
+
border-radius: 6px !important;
|
| 419 |
+
padding: 1px 1px !important;
|
| 420 |
+
font-size: 13px !important;
|
| 421 |
+
font-weight: bold !important;
|
| 422 |
+
text-shadow: 0 1px 1px rgba(0,0,0,0.1) !important;
|
| 423 |
+
margin: 0 3px 3px 0 !important;
|
| 424 |
+
}
|
| 425 |
+
.custom-download-btn:hover {
|
| 426 |
+
background: #f5f6fa !important;
|
| 427 |
+
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.1) !important;
|
| 428 |
+
}
|
| 429 |
+
</style>
|
| 430 |
+
"""
|
| 431 |
+
gr.HTML(custom_css)
|
| 432 |
+
|
| 433 |
+
sort_col_dropdown.change(
|
| 434 |
+
fn=unified_filter,
|
| 435 |
+
inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
|
| 436 |
+
outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()] # Add top5_models
|
| 437 |
+
)
|
| 438 |
+
type_selector.change(
|
| 439 |
+
fn=unified_filter,
|
| 440 |
+
inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
|
| 441 |
+
outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
|
| 442 |
+
)
|
| 443 |
+
model_type_selector.change(
|
| 444 |
+
fn=unified_filter,
|
| 445 |
+
inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
|
| 446 |
+
outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
|
| 447 |
+
)
|
| 448 |
+
think_selector.change(
|
| 449 |
+
fn=unified_filter,
|
| 450 |
+
inputs=[type_selector, model_type_selector, think_selector, df_state, sort_col_dropdown],
|
| 451 |
+
outputs=[leaderboard_html_comp, sort_col_dropdown, gr.State()]
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
return {
|
| 455 |
+
"type_selector": type_selector,
|
| 456 |
+
"model_type_selector": model_type_selector,
|
| 457 |
+
"think_selector": think_selector,
|
| 458 |
+
"leaderboard_html_comp": leaderboard_html_comp,
|
| 459 |
+
"sort_col_dropdown": sort_col_dropdown,
|
| 460 |
+
"df_state": df_state,
|
| 461 |
+
"unified_filter": unified_filter # Exposed for direct external call
|
| 462 |
+
}
|
utils.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import yaml
|
| 3 |
+
import json
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from huggingface_hub import whoami
|
| 6 |
+
|
| 7 |
+
def get_profile(profile: gr.OAuthProfile | None) -> str:
|
| 8 |
+
if profile is None:
|
| 9 |
+
return "Anonymous"
|
| 10 |
+
return profile.username
|
| 11 |
+
|
| 12 |
+
def get_organizations(oauth_token: gr.OAuthToken | None) -> str:
|
| 13 |
+
if oauth_token is None:
|
| 14 |
+
return "No Organization"
|
| 15 |
+
org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
|
| 16 |
+
return org_names
|
| 17 |
+
|
| 18 |
+
def get_profile_and_organizations(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> tuple[str, str]:
|
| 19 |
+
if profile is None:
|
| 20 |
+
output_profile = "Anonymous"
|
| 21 |
+
else:
|
| 22 |
+
output_profile = profile.username
|
| 23 |
+
|
| 24 |
+
if oauth_token is None:
|
| 25 |
+
output_org = "No Organization"
|
| 26 |
+
else:
|
| 27 |
+
output_org = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
|
| 28 |
+
|
| 29 |
+
return output_profile, output_org
|
| 30 |
+
|
| 31 |
+
def download_with_restart(snapshot_download_func, repo_id, local_dir, repo_type, token, restart_func):
|
| 32 |
+
try:
|
| 33 |
+
snapshot_download_func(
|
| 34 |
+
repo_id=repo_id,
|
| 35 |
+
local_dir=local_dir,
|
| 36 |
+
repo_type=repo_type,
|
| 37 |
+
tqdm_class=None,
|
| 38 |
+
etag_timeout=30,
|
| 39 |
+
token=token
|
| 40 |
+
)
|
| 41 |
+
except Exception:
|
| 42 |
+
restart_func()
|
vis_utils.py
ADDED
|
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import plotly.graph_objects as go
|
| 4 |
+
from plotly.graph_objs._figure import Figure
|
| 5 |
+
from typing import Optional, List, Dict, Any
|
| 6 |
+
from src.display.formatting import get_display_model_name
|
| 7 |
+
|
| 8 |
+
SORT_COLUMN_MAP = {
|
| 9 |
+
"Average Accuracy": "Avg AC",
|
| 10 |
+
"Tool Selection Quality": "Avg TSQ",
|
| 11 |
+
"Session Cost": "Avg Total Cost"
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
def get_theme_colors(theme: str = "light") -> Dict[str, Any]:
|
| 15 |
+
"""Return color settings for the given theme."""
|
| 16 |
+
if theme == "dark":
|
| 17 |
+
return {
|
| 18 |
+
"paper_bg": "#181c3a", # darker blue-gray
|
| 19 |
+
"plot_bg": "#181c3a",
|
| 20 |
+
"legend_font_color": "#F5F6F7",
|
| 21 |
+
"legend_bg": 'rgba(35,36,74,0.92)', # slightly lighter than bg, but still dark
|
| 22 |
+
"annotation_color": '#F5F6F7'
|
| 23 |
+
}
|
| 24 |
+
else:
|
| 25 |
+
return {
|
| 26 |
+
"paper_bg": "#23244a", # deep blue-gray
|
| 27 |
+
"plot_bg": "#23244a",
|
| 28 |
+
"legend_font_color": "#F5F6F7",
|
| 29 |
+
"legend_bg": 'rgba(35,36,74,0.92)', # match bg for harmony
|
| 30 |
+
"annotation_color": '#F5F6F7'
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
def create_empty_radar_chart(message: str) -> Figure:
|
| 34 |
+
"""Create an empty radar chart with a message."""
|
| 35 |
+
fig = go.Figure()
|
| 36 |
+
fig.add_annotation(
|
| 37 |
+
text=f"📊 {message}",
|
| 38 |
+
xref="paper", yref="paper",
|
| 39 |
+
x=0.5, y=0.5,
|
| 40 |
+
xanchor='center', yanchor='middle',
|
| 41 |
+
font=dict(
|
| 42 |
+
size=18,
|
| 43 |
+
color="#94A3B8",
|
| 44 |
+
family="Verdana, sans-serif"
|
| 45 |
+
),
|
| 46 |
+
showarrow=False,
|
| 47 |
+
bgcolor="rgba(245, 246, 247, 0.05)",
|
| 48 |
+
bordercolor="rgba(245, 246, 247, 0.2)",
|
| 49 |
+
borderwidth=1,
|
| 50 |
+
borderpad=20
|
| 51 |
+
)
|
| 52 |
+
fig.update_layout(
|
| 53 |
+
paper_bgcolor="#01091A",
|
| 54 |
+
plot_bgcolor="rgba(245, 246, 247, 0.02)",
|
| 55 |
+
height=800,
|
| 56 |
+
width=800,
|
| 57 |
+
margin=dict(t=100, b=80, l=80, r=80),
|
| 58 |
+
title=dict(
|
| 59 |
+
text="<b>Domain Performance Chart</b>",
|
| 60 |
+
x=0.5,
|
| 61 |
+
y=0.97,
|
| 62 |
+
font=dict(
|
| 63 |
+
size=22,
|
| 64 |
+
family="Verdana, sans-serif",
|
| 65 |
+
color="#F5F6F7",
|
| 66 |
+
weight=700
|
| 67 |
+
),
|
| 68 |
+
),
|
| 69 |
+
annotations=[
|
| 70 |
+
dict(
|
| 71 |
+
text="TRUEBench",
|
| 72 |
+
xref="paper", yref="paper",
|
| 73 |
+
x=0.98, y=0.02,
|
| 74 |
+
xanchor='right', yanchor='bottom',
|
| 75 |
+
font=dict(size=10, color='#64748B'),
|
| 76 |
+
showarrow=False
|
| 77 |
+
)
|
| 78 |
+
]
|
| 79 |
+
)
|
| 80 |
+
return fig
|
| 81 |
+
|
| 82 |
+
def create_len_overall_scatter(
|
| 83 |
+
df: pd.DataFrame,
|
| 84 |
+
selected_models: Optional[List[str]] = None,
|
| 85 |
+
max_models: int = 30,
|
| 86 |
+
y_col: str = "Overall",
|
| 87 |
+
length_data: Optional[dict] = None,
|
| 88 |
+
theme: str = "light",
|
| 89 |
+
x_axis_data_source: str = "Med. Len."
|
| 90 |
+
) -> Figure:
|
| 91 |
+
"""
|
| 92 |
+
Create scatter plot showing Med. Len. vs selected y_col for up to 10 selected models.
|
| 93 |
+
Each dot is colored by Think (normal/reasoning), and the legend is by Think.
|
| 94 |
+
DataFrame must include an 'Think' column.
|
| 95 |
+
length_data: JSON data containing model length information by category
|
| 96 |
+
theme: "light" or "dark" (default: "light")
|
| 97 |
+
"""
|
| 98 |
+
import plotly.express as px
|
| 99 |
+
import json
|
| 100 |
+
|
| 101 |
+
# Defensive: check required columns
|
| 102 |
+
required_cols = ['Model Name', 'Med. Len.', 'Med. Resp. Len.', y_col]
|
| 103 |
+
for col in required_cols:
|
| 104 |
+
if col not in df.columns:
|
| 105 |
+
return create_empty_radar_chart(f"Column '{col}' not found in data")
|
| 106 |
+
# Think column check
|
| 107 |
+
think_col = None
|
| 108 |
+
for candidate in ['Think']:
|
| 109 |
+
if candidate in df.columns:
|
| 110 |
+
think_col = candidate
|
| 111 |
+
break
|
| 112 |
+
if think_col is None:
|
| 113 |
+
return create_empty_radar_chart("Column 'Think' not found in data")
|
| 114 |
+
# Filter by selected_models
|
| 115 |
+
if selected_models is not None and len(selected_models) > 0:
|
| 116 |
+
df_filtered = df[df['Model Name'].isin(selected_models)].copy()
|
| 117 |
+
else:
|
| 118 |
+
# Default: top-N by Overall
|
| 119 |
+
df_filtered = df.copy()
|
| 120 |
+
df_filtered = df_filtered.sort_values('Overall', ascending=False).head(max_models)
|
| 121 |
+
if df_filtered.empty:
|
| 122 |
+
return create_empty_radar_chart(f"No data available for {x_axis_data_source} vs {y_col} analysis")
|
| 123 |
+
|
| 124 |
+
# Determine x-axis data based on x_axis_data_source
|
| 125 |
+
x_axis_col_name = x_axis_data_source # Use this for the DataFrame column
|
| 126 |
+
length_data_key = 'Med' if x_axis_data_source == "Med. Len." else 'Med Resp'
|
| 127 |
+
|
| 128 |
+
if y_col == "Overall":
|
| 129 |
+
# For 'Overall' category, prefer direct DataFrame column reading
|
| 130 |
+
df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')
|
| 131 |
+
elif length_data:
|
| 132 |
+
# For other categories, use length_data if available
|
| 133 |
+
df_filtered[x_axis_col_name] = df_filtered['Model Name'].apply(
|
| 134 |
+
lambda x: length_data.get(x, {}).get(y_col, {}).get(length_data_key, 0)
|
| 135 |
+
)
|
| 136 |
+
else:
|
| 137 |
+
# Fallback if no length_data and not 'Overall' (though this case should ideally be handled by required_cols)
|
| 138 |
+
df_filtered[x_axis_col_name] = pd.to_numeric(df_filtered[x_axis_col_name], errors='coerce')
|
| 139 |
+
|
| 140 |
+
df_filtered[y_col] = pd.to_numeric(df_filtered[y_col], errors='coerce')
|
| 141 |
+
if 'Type' in df_filtered.columns:
|
| 142 |
+
df_filtered = df_filtered[df_filtered['Type'] != 'Proprietary']
|
| 143 |
+
if 'Parameter Size (B)' in df_filtered.columns:
|
| 144 |
+
df_filtered['Parameter Size (B)'] = pd.to_numeric(df_filtered['Parameter Size (B)'], errors='coerce')
|
| 145 |
+
min_size = 20
|
| 146 |
+
max_size = 80
|
| 147 |
+
param_sizes = df_filtered['Parameter Size (B)'].fillna(5)
|
| 148 |
+
log_sizes = np.log10(param_sizes)
|
| 149 |
+
log_min = np.log10(5)
|
| 150 |
+
log_max = np.log10(param_sizes.max())
|
| 151 |
+
marker_sizes = min_size + ((log_sizes - log_min) / (log_max - log_min)) * (max_size - min_size)
|
| 152 |
+
else:
|
| 153 |
+
marker_sizes = [30] * len(df_filtered)
|
| 154 |
+
|
| 155 |
+
legend_name_map = {
|
| 156 |
+
'On': 'Thinking',
|
| 157 |
+
'Off': 'Non-Thinking'
|
| 158 |
+
}
|
| 159 |
+
color_palette = {
|
| 160 |
+
"Thinking": "#FCE39B",
|
| 161 |
+
"Non-Thinking": "#FF9185"
|
| 162 |
+
}
|
| 163 |
+
df_filtered['MarkerType'] = df_filtered['Parameter Size (B)'].apply(
|
| 164 |
+
lambda x: 'circle' if pd.notna(x) else 'star'
|
| 165 |
+
)
|
| 166 |
+
df_filtered['ThinkDisplay'] = df_filtered['Think'].map(legend_name_map).fillna(df_filtered['Think'])
|
| 167 |
+
prefix_map = {
|
| 168 |
+
'circle': 'Open',
|
| 169 |
+
'star': 'Proprietary'
|
| 170 |
+
}
|
| 171 |
+
combinations = df_filtered[['ThinkDisplay', 'MarkerType']].drop_duplicates()
|
| 172 |
+
marker_order = {'circle': 0, 'star': 1}
|
| 173 |
+
think_order = {'Thinking': 0, 'Non-Thinking': 1}
|
| 174 |
+
combinations['sort_key'] = combinations.apply(
|
| 175 |
+
lambda row: (marker_order.get(row['MarkerType'], 99), think_order.get(row['ThinkDisplay'], 99)),
|
| 176 |
+
axis=1
|
| 177 |
+
)
|
| 178 |
+
combinations = combinations.sort_values('sort_key')
|
| 179 |
+
|
| 180 |
+
fig = go.Figure()
|
| 181 |
+
legend_shown = set()
|
| 182 |
+
median_x = df_filtered[x_axis_col_name].median()
|
| 183 |
+
median_y = df_filtered[y_col].median()
|
| 184 |
+
|
| 185 |
+
x_axis_display_name = x_axis_data_source.replace("Med.", "Median").replace("Len.", "Length")
|
| 186 |
+
|
| 187 |
+
fig.add_vline(
|
| 188 |
+
x=median_x,
|
| 189 |
+
line_dash="dash",
|
| 190 |
+
line_color="#64748B",
|
| 191 |
+
opacity=0.6,
|
| 192 |
+
line_width=1.5,
|
| 193 |
+
annotation_text=f"{x_axis_display_name}",
|
| 194 |
+
annotation_position="top right",
|
| 195 |
+
annotation_font=dict(size=10, color="#64748B")
|
| 196 |
+
)
|
| 197 |
+
fig.add_hline(
|
| 198 |
+
y=median_y,
|
| 199 |
+
line_dash="dash",
|
| 200 |
+
line_color="#64748B",
|
| 201 |
+
opacity=0.6,
|
| 202 |
+
line_width=1.5,
|
| 203 |
+
annotation_text=f"Median {y_col}",
|
| 204 |
+
annotation_position="bottom right",
|
| 205 |
+
annotation_font=dict(size=10, color="#64748B")
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
for _, row in combinations.iterrows():
|
| 209 |
+
think = row['ThinkDisplay']
|
| 210 |
+
marker_type = row['MarkerType']
|
| 211 |
+
prefix = prefix_map.get(marker_type, '')
|
| 212 |
+
legend_name = f"{prefix} {think}"
|
| 213 |
+
sub_df = df_filtered[
|
| 214 |
+
(df_filtered['ThinkDisplay'] == think) &
|
| 215 |
+
(df_filtered['MarkerType'] == marker_type)
|
| 216 |
+
]
|
| 217 |
+
color = color_palette.get(think, "#1098F7")
|
| 218 |
+
sub_marker_sizes = (
|
| 219 |
+
marker_sizes[sub_df.index]
|
| 220 |
+
if 'Parameter Size (B)' in df_filtered.columns and marker_type == 'circle'
|
| 221 |
+
else [30] * len(sub_df)
|
| 222 |
+
)
|
| 223 |
+
show_legend = legend_name not in legend_shown
|
| 224 |
+
legend_shown.add(legend_name)
|
| 225 |
+
fig.add_trace(go.Scatter(
|
| 226 |
+
x=sub_df[x_axis_col_name],
|
| 227 |
+
y=sub_df[y_col],
|
| 228 |
+
mode='markers+text',
|
| 229 |
+
name=legend_name,
|
| 230 |
+
legendgroup=legend_name,
|
| 231 |
+
showlegend=show_legend,
|
| 232 |
+
marker_symbol=marker_type,
|
| 233 |
+
marker=dict(
|
| 234 |
+
size=sub_marker_sizes,
|
| 235 |
+
color=color,
|
| 236 |
+
opacity=0.85,
|
| 237 |
+
line=dict(width=2, color='#01091A')
|
| 238 |
+
),
|
| 239 |
+
text=sub_df['Model Name'].apply(get_display_model_name),
|
| 240 |
+
textposition="top center",
|
| 241 |
+
textfont=dict(size=10, color='#94A3B8'),
|
| 242 |
+
hovertemplate="<b>%{text}</b><br>" +
|
| 243 |
+
f"{x_axis_display_name}: "+"%{x:.2f}<br>" +
|
| 244 |
+
f"{y_col}: "+"%{y:.2f}<br>" +
|
| 245 |
+
f"Think: {legend_name}<br>" +
|
| 246 |
+
("Parameter Size: %{customdata}B<br>" if marker_type == 'circle' else "") +
|
| 247 |
+
"<extra></extra>",
|
| 248 |
+
customdata=sub_df['Parameter Size (B)'].values if marker_type == 'circle' else None
|
| 249 |
+
))
|
| 250 |
+
|
| 251 |
+
# Theme colors
|
| 252 |
+
theme_colors = get_theme_colors(theme)
|
| 253 |
+
fig.update_layout(
|
| 254 |
+
title=dict(
|
| 255 |
+
text=f"<b>{y_col} {x_axis_display_name} vs Category Score</b>",
|
| 256 |
+
x=0.5,
|
| 257 |
+
y=0.97,
|
| 258 |
+
font=dict(size=22, family="Verdana, sans-serif", color=theme_colors["legend_font_color"], weight=700)
|
| 259 |
+
),
|
| 260 |
+
xaxis=dict(
|
| 261 |
+
title=dict(
|
| 262 |
+
text=f"<b>{y_col} {x_axis_display_name}</b>",
|
| 263 |
+
font=dict(size=16, color=theme_colors["legend_font_color"])
|
| 264 |
+
),
|
| 265 |
+
tickfont=dict(size=12, color="#94A3B8"),
|
| 266 |
+
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 267 |
+
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 268 |
+
),
|
| 269 |
+
yaxis=dict(
|
| 270 |
+
title=dict(
|
| 271 |
+
text=f"<b>{y_col} Score</b>",
|
| 272 |
+
font=dict(size=16, color=theme_colors["legend_font_color"])
|
| 273 |
+
),
|
| 274 |
+
tickfont=dict(size=12, color="#94A3B8"),
|
| 275 |
+
gridcolor="rgba(245, 246, 247, 0.1)",
|
| 276 |
+
zerolinecolor="rgba(245, 246, 247, 0.2)"
|
| 277 |
+
),
|
| 278 |
+
paper_bgcolor=theme_colors["paper_bg"],
|
| 279 |
+
plot_bgcolor=theme_colors["plot_bg"],
|
| 280 |
+
height=900,
|
| 281 |
+
width=1450,
|
| 282 |
+
showlegend=True,
|
| 283 |
+
legend=dict(
|
| 284 |
+
orientation="h",
|
| 285 |
+
yanchor="bottom",
|
| 286 |
+
y=1,
|
| 287 |
+
xanchor="center",
|
| 288 |
+
x=0.5,
|
| 289 |
+
font=dict(size=12, family="Verdana, sans-serif", color=theme_colors["legend_font_color"]),
|
| 290 |
+
bgcolor=theme_colors["legend_bg"],
|
| 291 |
+
bordercolor='rgba(245, 246, 247, 0.2)',
|
| 292 |
+
borderwidth=1
|
| 293 |
+
),
|
| 294 |
+
margin=dict(t=100, b=80, l=80, r=80)
|
| 295 |
+
)
|
| 296 |
+
return fig
|
| 297 |
+
|
| 298 |
+
def create_language_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across languages for the
    selected models.

    Args:
        df: Leaderboard data. Expected to contain a ``'Model Name'`` column
            and one numeric score column per language code listed in
            ``language_domains`` below.
        metric_type: Display metric name; mapped through ``SORT_COLUMN_MAP``
            to the real column used when auto-selecting top models.
        selected_models: Model names to plot. When ``None`` or empty, the top
            ``max_models`` models by the metric column are picked
            automatically (falling back to the first rows when the column is
            absent).
        max_models: Upper bound on the number of plotted models.
        theme: ``"light"`` or ``"dark"`` (default: ``"light"``).

    Returns:
        A plotly ``Figure`` with one closed, filled Scatterpolar trace per
        model, one angular axis per language code.
    """
    language_domains = ['KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI']

    # Auto-select top models when the caller did not pick any.
    if selected_models is None or len(selected_models) == 0:
        actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]
        values = []
        for lang in language_domains:
            val = model_row[lang] if lang in model_row else 0
            # Missing / blank scores are plotted as 0 so the polygon stays closed.
            if pd.isna(val) or val == '':
                val = 0
            else:
                val = float(val)
            values.append(val)
        # Repeat the first point so the trace closes back on itself.
        values_plot = values + [values[0]]
        domains_plot = language_domains + [language_domains[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    # Radial ticks: six evenly spaced gridlines from 0 to 100.
    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            domain=dict(x=[0, 1], y=[0, 1]),
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                # BUGFIX: a hard-coded `ticktext` of 10 *category* labels
                # ("📝 Content Gen", "✂️ Editing", ...) was left here from the
                # category radar chart, mislabeling the 12 *language* axes.
                # Without an override, each axis is labeled with its language
                # code (KO, EN, ...), which is correct for this chart.
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Language Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig
|
| 478 |
+
|
| 479 |
+
def load_leaderboard_data() -> pd.DataFrame:
    """Return the processed category leaderboard as a DataFrame."""
    # Imported lazily inside the function (matching the original) so that
    # importing this module does not pull in the data-loading machinery.
    from src.data_loader import get_category_dataframe

    category_df = get_category_dataframe(processed=True)
    return category_df
|
| 483 |
+
|
| 484 |
+
def load_leaderboard_language_data() -> pd.DataFrame:
    """Return the processed language leaderboard as a DataFrame."""
    # Lazy import (as in the original) keeps module import lightweight.
    from src.data_loader import get_language_dataframe

    language_df = get_language_dataframe(processed=True)
    return language_df
|
| 488 |
+
|
| 489 |
+
def create_domain_radar_chart(
    df: pd.DataFrame,
    metric_type: str,
    selected_models: Optional[List[str]] = None,
    max_models: int = 5,
    theme: str = "light"
) -> Figure:
    """
    Create a radar chart showing model performance across domains for the
    selected metric.

    Args:
        df: Leaderboard data. Expected to contain a ``'Model Name'`` column
            and one numeric column per domain listed in ``domains`` below.
        metric_type: Display metric name; mapped through ``SORT_COLUMN_MAP``
            to the real column. Unsupported metrics yield an empty chart.
        selected_models: Model names to plot. When ``None`` or empty, the top
            ``max_models`` models by the metric column are picked
            automatically (falling back to the first rows when the column is
            absent).
        max_models: Upper bound on the number of plotted models.
        theme: ``"light"`` or ``"dark"`` (default: ``"light"``).

    Returns:
        A plotly ``Figure`` with one closed, filled Scatterpolar trace per
        model, or an empty radar chart for unsupported metrics.
    """
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    # The original code kept a per-metric dict that repeated the identical
    # 10 domain keys for every supported metric, and the mapped display
    # labels were computed but never used.  Collapsed into one supported-
    # metric set plus a single domain list; the membership check below is
    # behaviorally equivalent to `actual_metric_type not in domain_mapping`.
    supported_metrics = {'Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    domains = [
        'Content Generation', 'Editing', 'Data Analysis', 'Reasoning',
        'Hallucination', 'Safety', 'Repetition', 'Summarization',
        'Translation', 'Multi-Turn'
    ]
    if actual_metric_type not in supported_metrics:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    # Auto-select top models when the caller did not pick any.
    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            selected_models = df.nlargest(max_models, actual_metric_type)['Model Name'].tolist()
        else:
            selected_models = df.head(max_models)['Model Name'].tolist()
    selected_models = selected_models[:max_models]

    harmonious_palette_light = [
        {'fill': 'rgba(79,143,198,0.25)', 'line': '#4F8FC6', 'name': 'BlueGray'},
        {'fill': 'rgba(109,213,237,0.25)', 'line': '#6DD5ED', 'name': 'SkyBlue'},
        {'fill': 'rgba(162,89,247,0.25)', 'line': '#A259F7', 'name': 'Violet'},
        {'fill': 'rgba(67,233,123,0.25)', 'line': '#43E97B', 'name': 'Mint'},
        {'fill': 'rgba(255,215,0,0.20)', 'line': '#FFD700', 'name': 'Gold'}
    ]
    harmonious_palette_dark = [
        {'fill': 'rgba(144,202,249,0.25)', 'line': '#90CAF9', 'name': 'LightBlue'},
        {'fill': 'rgba(128,203,196,0.25)', 'line': '#80CBC4', 'name': 'Mint'},
        {'fill': 'rgba(179,157,219,0.25)', 'line': '#B39DDB', 'name': 'Lavender'},
        {'fill': 'rgba(244,143,177,0.25)', 'line': '#F48FB1', 'name': 'Pink'},
        {'fill': 'rgba(255,213,79,0.20)', 'line': '#FFD54F', 'name': 'Gold'}
    ]
    palette = harmonious_palette_light if theme == "light" else harmonious_palette_dark

    fig = go.Figure()
    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model Name'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]
        values = []
        for domain in domains:
            if domain in df.columns and domain in model_row:
                val = model_row[domain]
                # Missing / blank scores are plotted as 0 so the polygon stays closed.
                if pd.isna(val) or val == '':
                    val = 0
                else:
                    val = float(val)
                values.append(val)
            else:
                values.append(0)
        # Repeat the first point so the trace closes back on itself.
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]
        colors = palette[idx % len(palette)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.5
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A' if theme == "light" else '#e3e6f3')
                ),
                name=get_display_model_name(model_name),
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<b style='font-size: 12px'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)" if theme == "dark" else "rgba(227,230,243,0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7" if theme == "dark" else "#23244a", size=12, family="Verdana, sans-serif")
                )
            )
        )

    # Radial ticks: six evenly spaced gridlines from 0 to 100.
    max_range = 100.0
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]
    theme_colors = get_theme_colors(theme)
    fig.update_layout(
        polar=dict(
            bgcolor=theme_colors["plot_bg"],
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="Verdana, sans-serif",
                    color=theme_colors["legend_font_color"],
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.15,
            xanchor="center",
            x=0.5,
            font=dict(
                size=12,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"]
            ),
            bgcolor=theme_colors["legend_bg"],
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text="<b>Category Performance</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="Verdana, sans-serif",
                color=theme_colors["legend_font_color"],
                weight=700
            ),
        ),
        paper_bgcolor=theme_colors["paper_bg"],
        plot_bgcolor=theme_colors["plot_bg"],
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=80),
        annotations=[
            dict(
                text="TRUEBench",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color=theme_colors["annotation_color"]),
                showarrow=False
            )
        ]
    )
    return fig
|