Spaces:

allenai
/

asta-bench-leaderboard

Running

File size: 21,195 Bytes

ee1b999

import logging
from typing import Optional, Any, Dict # Added Dict
from zoneinfo import ZoneInfo

# datasets import might not be strictly needed by LeaderboardViewer itself anymore,
# but _get_dataframe might still use types from it if EvalResult refers to them.
# For now, let's keep it if your EvalResult or SuiteConfig models have dependencies.
# If not, it can be removed from here.
import datasets # Potentially removable from this file
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import seaborn as sns
import json # For loading the local JSON file
import os # For checking file existence

from agenteval import compute_summary_statistics
from agenteval.config import SuiteConfig
from agenteval.models import EvalResult

logger = logging.getLogger(__name__)

import logging
from typing import Optional, Any, Dict, List # Added List
from zoneinfo import ZoneInfo # Assuming this might be used by SuiteConfig/EvalResult or _get_dataframe
import json
import os

# Assuming these are correctly imported from your project
from agenteval.config import SuiteConfig
from agenteval.models import EvalResult
# from agenteval import compute_summary_statistics # Used by _get_dataframe


class DataTransformer:
    """
    Load one evaluation result from a local JSON file and shape it for display.

    The JSON file is read and validated once in ``__init__``. ``view`` then
    turns it into a leaderboard table (and, optionally, Plotly scatter plots).

    Attributes:
        tag_map: Maps each formal tag display name to the sorted, de-duplicated
            list of formal task display names carrying that tag.
    """
    # Raw task/tag identifiers -> human-readable display names.
    # Names missing from this map fall back to "snake_case" -> "Snake Case".
    _INFORMAL_TO_FORMAL_NAME_MAP = {
        "lit": "Literature Understanding",
        "data": "Data Analysis",
        "code": "Code Execution",
        "discovery": "Discovery",
        "arxivdigestables_validation": "Arxivdigestables Validation",
        "sqa_dev": "Sqa Dev",
        "litqa2_validation": "Litqa2 Validation",
        "paper_finder_validation": "Paper Finder Validation",
        "discoverybench_validation": "Discoverybench Validation",
        "core_bench_validation": "Core Bench Validation",
        "ds1000_validation": "DS1000 Validation",
        "e2e_discovery_validation": "E2E Discovery Validation",
        "super_validation": "Super Validation",
        # Add any other raw names that can appear in task.name or task.tags
    }

    def __init__(
            self,
            json_file_path: str, # Mandatory: path to the local JSON file
            split: str,          # Still needed for context within the JSON's suite_config
            is_internal: bool = False
    ):
        """
        Read, parse and validate the JSON result file, then build ``tag_map``.

        Args:
            json_file_path: Path to the local JSON result file.
            split: Split name used to look up tasks in the suite config and
                forwarded to ``_get_dataframe``.
            is_internal: Forwarded to ``_get_dataframe`` when building the table.

        Raises:
            FileNotFoundError: If ``json_file_path`` does not exist.
            ValueError: If the file cannot be read or parsed, is empty, or does
                not validate against ``EvalResult``.
            TypeError: If the validated result does not carry a ``SuiteConfig``.
        """
        self._json_file_path = json_file_path
        self._split = split
        self._internal = is_internal
        self._loaded_json_data: Optional[Dict[str, Any]] = None
        self._cfg: Optional[SuiteConfig] = None

        logger.info(f"Initializing DataTransformer with local JSON file: {self._json_file_path}")

        # --- Load and validate JSON data ---
        # Checked up front so a missing file stays a FileNotFoundError instead of
        # being folded into the generic ValueError below.
        if not os.path.exists(self._json_file_path):
            raise FileNotFoundError(f"JSON file not found at path: {self._json_file_path}")
        try:
            with open(self._json_file_path, 'r', encoding='utf-8') as f:
                self._loaded_json_data = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON from local file {self._json_file_path}: {e}") from e
        except Exception as e:
            raise ValueError(f"Error reading local file {self._json_file_path}: {e}") from e

        if not self._loaded_json_data:
            raise ValueError(f"No data loaded from JSON file {self._json_file_path}.")

        try:
            eval_result = EvalResult.model_validate(self._loaded_json_data)
        except Exception as e:
            raise ValueError(
                f"Failed to validate JSON data from file '{self._json_file_path}' against EvalResult model: {e}"
            ) from e

        self._cfg = eval_result.suite_config
        if not isinstance(self._cfg, SuiteConfig):
            raise TypeError(f"self._cfg is not a SuiteConfig object after loading from '{self._json_file_path}', got {type(self._cfg)}.")

        # --- Populate tag map: formal tag name -> sorted unique formal task names ---
        self.tag_map: dict[str, list[str]] = {}
        tasks_by_tag: dict[str, set[str]] = {}

        for task in self._cfg.get_tasks(self._split):
            # Tolerate task-like objects that lack the attributes we need.
            if not hasattr(task, 'name') or not hasattr(task, 'tags'):
                logger.warning(f"Task object {task} is missing 'name' or 'tags' attribute. Skipping.")
                continue

            formal_task_name = self._get_formal_display_name_static(task.name)
            for raw_tag_name in task.tags or []:
                formal_tag_name = self._get_formal_display_name_static(raw_tag_name)
                tasks_by_tag.setdefault(formal_tag_name, set()).add(formal_task_name)

        # Tags keep first-seen order; tasks under each tag are sorted.
        self.tag_map = {tag_name: sorted(task_names) for tag_name, task_names in tasks_by_tag.items()}

    @staticmethod
    def _get_formal_display_name_static(raw_name: str) -> str:
        """
        Return the formal display name for a raw tag or task name.

        Looks the name up in ``_INFORMAL_TO_FORMAL_NAME_MAP``; unknown names fall
        back to replacing underscores with spaces and title-casing.
        """
        return DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP.get(raw_name, raw_name.replace("_", " ").title())

    def _load(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
        """
        Build the overview DataFrame from the JSON data loaded in ``__init__``.

        Returns:
            ``(overview_df, tag_map)``.

        Raises:
            RuntimeError: If the JSON data or the suite config is missing.
        """
        if self._loaded_json_data is None or self._cfg is None:
            # This should not happen if __init__ completed successfully
            raise RuntimeError("DataTransformer not properly initialized. JSON data or SuiteConfig is missing.")

        # _get_dataframe expects a list of records; the single JSON file is one record.
        records_list: list[dict] = [self._loaded_json_data]

        overview_df = _get_dataframe(
            records_list=records_list,
            split=self._split,
            is_internal=self._internal,
            suite_config=self._cfg, # Pass the SuiteConfig loaded in __init__
        )
        return overview_df, self.tag_map

    def view(
            self,
            tag: Optional[str] = None,
            with_plots: bool = False,
            use_plotly: bool = False,
    ) -> tuple[pd.DataFrame, dict[str, Any]]:
        """
        Return the leaderboard table and, optionally, score-vs-cost scatter plots.

        Args:
            tag: Formal tag name to focus on. ``None`` shows the "Overall"
                metrics plus every tag; otherwise the tag's own metrics plus the
                tasks listed under it in ``tag_map``.
            with_plots: Whether to build scatter plots.
            use_plotly: Must be True for plots to be produced; Plotly is the only
                plotting backend available in this module.

        Returns:
            ``(df_view, plots)``. ``df_view`` is sorted by "<primary> Score"
            (descending) when that column exists. ``plots`` maps
            ``"scatter_<metric>"`` to a figure for each metric that has both a
            Score and a Cost column; it is empty when no plots were built.
        """
        data, tag_map = self._load()
        if data.empty or (len(data) == 1 and data.iloc[0].get("Agent") == "No data"):
            logger.warning("No data available to view. Returning empty DataFrame and plots.")
            return data, {}

        if tag is None:
            primary = "Overall"
            group = list(tag_map.keys())
        else:
            primary = tag
            group = tag_map.get(tag, [])

        primary_score_col = f"{primary} Score"
        if primary_score_col in data.columns:
            data = data.sort_values(primary_score_col, ascending=False)
        else:
            logger.warning(f"Primary metric '{primary}' for sorting not found. Data will not be sorted by it.")

        # Candidate columns in display order: identity columns, the primary
        # metric, then each group member (bare name, cost, score).
        candidate_cols = ["Agent", "Submitter", "Date", "Logs", f"{primary} Cost", primary_score_col]
        for g_item in group:
            candidate_cols.extend((g_item, f"{g_item} Cost", f"{g_item} Score"))

        # Keep only columns that exist; dict.fromkeys drops repeats while
        # preserving first-seen order.
        final_cols_to_display = list(dict.fromkeys(c for c in candidate_cols if c in data.columns))

        df_view = data.loc[:, final_cols_to_display].reset_index(drop=True)

        plots: dict[str, Any] = {}
        if not with_plots:
            return df_view, plots

        if not use_plotly:
            # No other plotting backend exists here; skip rather than touch an
            # undefined figure.
            logger.warning("Plots were requested with use_plotly=False, but only the Plotly backend is available. No plots generated.")
            return df_view, plots

        # Columns are named "<metric> Score"/"<metric> Cost", so membership is
        # tested on the score column rather than on the bare metric name.
        plot_metric_names = [primary] + [g_item for g_item in group if f"{g_item} Score" in data.columns]
        for metric_name in plot_metric_names:
            score_col = f"{metric_name} Score"
            cost_col = f"{metric_name} Cost"
            if score_col in df_view.columns and cost_col in df_view.columns:
                plots[f"scatter_{metric_name}"] = _plot_scatter_plotly(
                    df_view, x=cost_col, y=score_col, agent_col="Agent"
                )
            else:
                logger.warning(
                    f"Skipping plot for '{metric_name}': score column '{score_col}' or cost column '{cost_col}' not found."
                )
        return df_view, plots


def _safe_round(value, digits=2):
    return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value

def _no_data_placeholder_frame() -> pd.DataFrame:
    """Return the one-row "No data" frame used when there is nothing to show."""
    placeholder_cols = ["Agent", "Submitter", "Date", "Overall Score", "Overall Cost", "Logs"]
    return pd.DataFrame({p_col: ["No data"] for p_col in placeholder_cols})


def _format_submit_date(submit_time, agent_name, timezone: str) -> Optional[str]:
    """Format a submission time as ``YYYY-MM-DD`` in ``timezone`` (None stays None)."""
    if submit_time is None:
        return None
    stamp = pd.Timestamp(submit_time)
    # Naive times (plain datetimes and pandas Timestamps alike) are taken as UTC;
    # tz_convert would raise on a naive value.
    if stamp.tzinfo is None:
        logger.debug(f"Submission time for {agent_name} is timezone-naive, assuming UTC.")
        stamp = stamp.tz_localize("UTC")
    return stamp.tz_convert(ZoneInfo(timezone)).strftime("%Y-%m-%d")


def _flatten_summary_stats(stats) -> dict:
    """Flatten summary statistics keyed like ``overall``, ``tag/<name>``, ``task/<name>`` into raw columns."""
    flat = {}
    for key, s_obj in stats.items():
        parts = key.split("/")
        if parts[0] == "overall":
            flat["overall/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
            flat["overall/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
        elif parts[0] == "tag" and len(parts) > 1:
            tag_name = parts[1]
            flat[f"tag/{tag_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
            flat[f"tag/{tag_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
        elif parts[0] == "task" and len(parts) > 1:
            task_name = parts[1]
            score_stderr = getattr(s_obj, 'score_stderr', np.nan)
            cost_stderr = getattr(s_obj, 'cost_stderr', np.nan)

            flat[f"task/{task_name}/score"] = _safe_round(getattr(s_obj, 'score', np.nan))
            # Half-width of a 95% interval: 1.96 standard errors.
            flat[f"task/{task_name}/score_ci"] = _safe_round(score_stderr * 1.96 if pd.notna(score_stderr) else np.nan)
            flat[f"task/{task_name}/cost"] = _safe_round(getattr(s_obj, 'cost', np.nan))
            flat[f"task/{task_name}/cost_ci"] = _safe_round(cost_stderr * 1.96 if pd.notna(cost_stderr) else np.nan)
        else:
            logger.debug(f"Uncommon key structure from compute_summary_statistics: '{key}'. Attempting generic add.")
            if hasattr(s_obj, 'score'):
                flat[f"{key}/score"] = _safe_round(s_obj.score)
            if hasattr(s_obj, 'cost'):
                flat[f"{key}/cost"] = _safe_round(s_obj.cost)
    return flat


def _get_dataframe(
        records_list: list[dict],
        split: str,
        is_internal: bool,
        suite_config: SuiteConfig,
        timezone: str = "US/Pacific",
) -> pd.DataFrame:
    """
    Turn raw result records into a DataFrame with display-ready column names.

    Each record is validated with ``EvalResult``; records that are not dicts,
    fail validation, or carry no results are logged and skipped. Summary
    statistics from ``compute_summary_statistics`` are flattened into columns
    and renamed through ``_pretty_column_name``.

    Args:
        records_list: Raw result records (dicts) to process.
        split: Split name passed to ``compute_summary_statistics``.
        is_internal: If True the "Logs" column uses ``logs_url``; otherwise
            ``logs_url_public``.
        suite_config: Suite configuration passed to ``compute_summary_statistics``.
        timezone: Time zone name used to render the submission date.

    Returns:
        One row per usable record, or a one-row "No data" placeholder frame
        when no record yields a row.
    """
    if not records_list:
        logger.warning(f"No records provided to _get_dataframe for split '{split}'. Returning empty DataFrame with placeholder.")
        return _no_data_placeholder_frame()

    rows = []
    for itm_idx, itm in enumerate(records_list):
        if not isinstance(itm, dict):
            logger.warning(f"Item {itm_idx} in records_list is not a dict, skipping.")
            continue
        try:
            ev = EvalResult.model_validate(itm)
        except Exception as e:
            # Best effort: one bad record must not discard the others.
            logger.error(f"Failed to validate item {itm_idx} with EvalResult: {itm}. Error: {e}")
            continue

        sub = ev.submission
        date_str = _format_submit_date(sub.submit_time, sub.agent_name, timezone)

        if not ev.results:
            logger.warning(
                f"Skipping submission {sub.agent_name} ({sub.username or 'N/A'}) "
                f"({sub.submit_time or 'N/A'}) due to no results."
            )
            continue

        stats = compute_summary_statistics(
            suite_config=suite_config, split=split, results=ev.results
        )
        flat = _flatten_summary_stats(stats)

        current_logs_url = None
        if is_internal and sub.logs_url:
            current_logs_url = str(sub.logs_url)
        elif not is_internal and sub.logs_url_public:
            current_logs_url = str(sub.logs_url_public)

        rows.append(
            {
                "agent_name": sub.agent_name or "N/A",
                "username": sub.username or "N/A",
                "submit_time": date_str,
                **flat,
                "logs_url": current_logs_url,
            }
        )

    if not rows:
        logger.warning(f"No valid rows generated from records_list for split '{split}'. Returning empty DataFrame with placeholder.")
        return _no_data_placeholder_frame()

    df = pd.DataFrame(rows)
    return df.rename(columns={c: _pretty_column_name(c) for c in df.columns})

def _pretty_column_name(col: str) -> str:
    """Map raw column name to display name."""
    # --- Step 1: Fixed, direct mappings ---
    fixed_mappings = {
        "submit_time": "Date",
        "agent_name": "Agent",
        "username": "Submitter",
        "logs_url": "Logs",
        "overall/score": "Overall Score",
        "overall/cost": "Overall Cost",
    }
    if col in fixed_mappings:
        return fixed_mappings[col]

    # --- Step 2: Define your mapping for informal names to descriptive names ---
    informal_map = DataTransformer._INFORMAL_TO_FORMAL_NAME_MAP

    # --- Step 3: Dynamic mappings for task or tag columns using the informal_to_formal_name_map ---
    parts = col.split("/")
    if len(parts) == 3:
        item_type, informal_name, metric_suffix = parts #

        formal_name = informal_map.get(informal_name)
        if formal_name is None:
            formal_name = informal_name.replace("_", " ").title()
            print(f"[DEBUG _pretty_column_name] Informal name '{informal_name}' not in map, using fallback: '{formal_name}'")

        if metric_suffix == "score":
            return f"{formal_name} Score"
        if metric_suffix == "cost":
            return f"{formal_name} Cost"
        if metric_suffix == "score_ci":
            return f"{formal_name} Score 95% CI"
        if metric_suffix == "cost_ci":
            return f"{formal_name} Cost 95% CI"

    # --- Step 4: Fallback for columns that don't match the "type/name/metric" pattern ---
    if "/" not in col:
        return col.replace("_", " ").title()
    else:
        return parts[-1].replace("_", " ").title()

# Display name of the overall score column. Presumably the default y-axis for
# plots; no use of it is visible in this part of the file -- TODO confirm.
DEFAULT_Y_COLUMN = "Overall Score"
# Constant x position given to every point when cost data is absent or unusable.
DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 # Value to use if x-axis data (costs) is missing

def _plot_scatter_plotly(
        data: pd.DataFrame,
        x: Optional[str],
        y: str,
        agent_col: str = "Agent"
) -> go.Figure:
    """
    Build a Plotly scatter figure of ``y`` against ``x`` with one trace per agent.

    Args:
        data: Source table; it is copied, never modified.
        x: Name of the x-axis (cost) column, or None. If it is missing, not
            convertible, or entirely NaN, every point is placed at
            ``DUMMY_X_VALUE_FOR_MISSING_COSTS``.
        y: Name of the y-axis column. Must exist in ``data``.
        agent_col: Column whose values define the traces.

    Returns:
        The figure. It has no traces when ``y`` or ``agent_col`` is missing,
        when ``y`` cannot be converted, or when no row has a numeric ``y``.
    """
    # Guard clauses: nothing can be drawn without the y column or the agent column.
    if y not in data.columns:
        logger.error(
            f"y-axis column '{y}' MUST exist in DataFrame. "
            f"Cannot generate plot. Available columns: {data.columns.tolist()}"
        )
        return go.Figure()

    if agent_col not in data.columns:
        logger.warning(
            f"Agent column '{agent_col}' not found in DataFrame. "
            f"Available columns: {data.columns.tolist()}. Returning empty figure."
        )
        return go.Figure()

    # Work on a copy so the caller's frame is left alone.
    frame = data.copy()
    try:
        frame[y] = pd.to_numeric(frame[y], errors='coerce')
    except Exception as e:
        logger.error(f"Error converting y-column '{y}' to numeric: {e}. Returning empty figure.")
        return go.Figure()

    axis_label = x or "Cost (Data N/A)"

    # Decide whether the x column holds at least one usable number.
    has_real_x = False
    if not x:
        logger.info("x-axis column name was not provided (is None).")
    elif x not in frame.columns:
        logger.warning(f"x-axis column '{x}' not found in DataFrame.")
    else:
        try:
            frame[x] = pd.to_numeric(frame[x], errors='coerce')
            has_real_x = bool(frame[x].notna().any())
            if not has_real_x:
                logger.info(f"x-axis column '{x}' exists but contains all NaN/None values after numeric conversion.")
        except Exception as e:
            logger.warning(f"Error converting x-column '{x}' to numeric: {e}. Will use dummy x-values.")

    if has_real_x:
        plot_x = x
    else:
        logger.info(f"Using dummy x-value '{DUMMY_X_VALUE_FOR_MISSING_COSTS}' for all data points as x-data is missing or invalid.")
        # A dedicated column name avoids clashing with real columns.
        plot_x = "__dummy_x_for_plotting__"
        frame[plot_x] = DUMMY_X_VALUE_FOR_MISSING_COSTS

    # Rows without a numeric y cannot be plotted.
    frame = frame.dropna(subset=[y])

    fig = go.Figure()
    dummy_range = [DUMMY_X_VALUE_FOR_MISSING_COSTS - 1, DUMMY_X_VALUE_FOR_MISSING_COSTS + 1]

    if frame.empty:
        logger.warning(f"No valid data to plot for y='{y}' (and x='{plot_x}') after cleaning NaNs from y.")
        # Return a labelled but trace-less figure.
        fig.update_layout(
            title=f"{y} vs. {axis_label} (No Data)",
            xaxis=dict(title=axis_label, range=None if has_real_x else dummy_range),
            yaxis=dict(title=y)
        )
        return fig

    # The hover text for x is the same for every trace.
    if has_real_x:
        hover_x = "%{x:.2f}"
    else:
        hover_x = str(DUMMY_X_VALUE_FOR_MISSING_COSTS) + " (fixed)"

    for agent, agent_rows in frame.groupby(agent_col):
        trace = go.Scatter(
            x=agent_rows[plot_x],
            y=agent_rows[y],
            mode='markers',
            name=str(agent),
            hovertemplate=f"{axis_label}: {hover_x}<br>{y}: %{{y:.2f}}<extra>{str(agent)}</extra>",
            marker=dict(size=10)
        )
        fig.add_trace(trace)

    # Real costs start the axis at zero; dummy x gets a tight fixed range with a single tick.
    if has_real_x:
        xaxis_config = dict(title=axis_label, rangemode="tozero")
    else:
        xaxis_config = dict(
            title=axis_label,
            range=dummy_range,
            tickvals=[DUMMY_X_VALUE_FOR_MISSING_COSTS],
            ticktext=[str(DUMMY_X_VALUE_FOR_MISSING_COSTS)],
        )

    fig.update_layout(
        title=f"{y} vs. {axis_label}",
        xaxis=xaxis_config,
        yaxis=dict(title=y, rangemode="tozero"),
        legend_title_text=agent_col
    )

    return fig