dror44 committed on
Commit
a43203f
·
1 Parent(s): a0c1734
Files changed (2) hide show
  1. models.jsonl +0 -1
  2. run_benchmarks.py +2 -0
models.jsonl CHANGED
@@ -14,7 +14,6 @@
14
  {"id": "o3-mini", "name": " o3-mini", "organization": "OpenAI", "license": "Proprietary", "api_model": "o3-mini", "provider": "openai", "parameters": "N/A"}
15
  {"id": "gpt-4.1", "name": "GPT-4.1", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4.1", "provider": "openai", "parameters": "N/A"}
16
  {"id": "gpt-4o", "name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "provider": "openai", "parameters": "N/A"}
17
- {"id": "gpt-4-turbo", "name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo", "provider": "openai", "parameters": "N/A"}
18
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
19
 
20
  {"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
 
14
  {"id": "o3-mini", "name": " o3-mini", "organization": "OpenAI", "license": "Proprietary", "api_model": "o3-mini", "provider": "openai", "parameters": "N/A"}
15
  {"id": "gpt-4.1", "name": "GPT-4.1", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4.1", "provider": "openai", "parameters": "N/A"}
16
  {"id": "gpt-4o", "name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o", "provider": "openai", "parameters": "N/A"}
 
17
  {"id": "gpt-3.5-turbo", "name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo", "provider": "openai", "parameters": "N/A"}
18
 
19
  {"id": "claude-3-7-sonnet-latest", "name": "Claude 3.7 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-7-sonnet-latest", "provider": "anthropic", "parameters": "N/A"}
run_benchmarks.py CHANGED
@@ -398,6 +398,8 @@ def evaluate_dataset(
398
  judge_results = raw_df[raw_df["judge_id"] == judge_id].copy()
399
  if test_type == "prompt_injections":
400
  gt_label = "PROMPT_INJECTION"
 
 
401
  else:
402
  gt_label = "UNSAFE"
403
 
 
398
  judge_results = raw_df[raw_df["judge_id"] == judge_id].copy()
399
  if test_type == "prompt_injections":
400
  gt_label = "PROMPT_INJECTION"
401
+ elif test_type == "grounding":
402
+ gt_label = "GROUNDED"
403
  else:
404
  gt_label = "UNSAFE"
405