wip
Files changed:
- models.jsonl +0 -1
- run_benchmarks.py +2 -0
models.jsonl CHANGED

@@ -14,7 +14,6 @@
 {"id": "o3-mini", "name": " o3-mini", "organization": "OpenAI", "license": "Proprietary", "api_model": "o3-mini", "provider": "openai", "parameters": "N/A"}
 {"id": "gpt-4.1", "name": "GPT-4.1", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4.1", "provider": "openai", "parameters": "N/A"}
 {"id": "gpt-4o", "name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "provider": "openai", "parameters": "N/A"}
-{"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
 {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}

 {"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
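For context, a minimal sketch of how a JSONL model registry like models.jsonl is typically consumed. The path, the blank-line handling, and the provider filter below are illustrative assumptions, not code from this repo.

import json

def load_models(path="models.jsonl"):
    """Load model records from a JSONL registry, skipping blank separator lines."""
    models = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # the registry contains blank lines between groups of entries
                continue
            models.append(json.loads(line))
    return models

if __name__ == "__main__":
    registry = load_models()
    # After this commit, "gpt-4-turbo" no longer appears among the OpenAI entries.
    print([m["id"] for m in registry if m["provider"] == "openai"])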
run_benchmarks.py CHANGED

@@ -398,6 +398,8 @@ def evaluate_dataset(
     judge_results = raw_df[raw_df["judge_id"] == judge_id].copy()
     if test_type == "prompt_injections":
         gt_label = "PROMPT_INJECTION"
+    elif test_type == "grounding":
+        gt_label = "GROUNDED"
     else:
         gt_label = "UNSAFE"
 
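For context, a hedged sketch of the ground-truth label selection that this hunk extends. The select_gt_label and accuracy_for_judge helpers and the judge_label column are hypothetical names used for illustration; they are not taken from run_benchmarks.py.

import pandas as pd

def select_gt_label(test_type: str) -> str:
    """Map a test type to the expected ground-truth label."""
    if test_type == "prompt_injections":
        return "PROMPT_INJECTION"
    elif test_type == "grounding":  # new branch introduced by this commit
        return "GROUNDED"
    else:
        return "UNSAFE"

def accuracy_for_judge(raw_df: pd.DataFrame, judge_id: str, test_type: str) -> float:
    """Fraction of this judge's rows whose verdict matches the expected label."""
    judge_results = raw_df[raw_df["judge_id"] == judge_id].copy()
    gt_label = select_gt_label(test_type)
    # Hypothetical scoring step: compare the judge's recorded verdict column
    # against the ground-truth label chosen for this test type.
    return (judge_results["judge_label"] == gt_label).mean()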