Spaces:

jbilcke-hf
/

SNIPED_grasp-any-region

Running on Zero

File size: 7,200 Bytes

46861c5

# --------------------------------------------------------
# Copyright (2025) Bytedance Ltd. and/or its affiliates
# Licensed under the Apache License, Version 2.0 (the "License")
# Grasp Any Region Project
# Written by Haochen Wang and Yuhao Wang
# --------------------------------------------------------

import argparse
import base64
import io
import json
import os
import re

import numpy as np
import openai
from PIL import Image
from pycocotools import mask as mask_utils
from pycocotools.coco import COCO
from tqdm import tqdm

# Define Azure OpenAI details
model_name = "gpt-4o-2024-11-20"
max_tokens = 1000  # range: [1, 4095]

# Initialize the Azure client
client = openai.AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-03-01-preview",
)

prompt_ann = """
You are a language model expert. Your task is to evaluate the following model output based on the provided images, and subject, object, and relationship.

- subject_name: {subject_name}
- object_name: {object_name}
- predicate_name: {predicate_name}
- model_output: {model_output}

Task:
1. Check if the model output describes the {subject_name}. 
2. Check if the model output conveys the relationship between {subject_name} and {object_name} related to {predicate_name}.

Note:
- The first task only requires checking if {subject_name} is mentioned in the model output.
- The second task asks if the output conveys a relationship related to {predicate_name} between {subject_name} and {object_name}, even if different words or phrases are used.
- If both tasks are successfully completed, return "True" Otherwise, return "False"
- Do not output any reasoning. Do not perform correction. Please output only just one "True" or "False".

"""


def process_questions(outputs):

    pattern = r"^```json\s*|\s*```$"
    try:
        cleaned_str = re.sub(pattern, "", outputs, flags=re.MULTILINE)
        questions_data = json.loads(cleaned_str)
    except:
        print("Error in parsing JSON")
        return []
    return questions_data


def encode_pil_image_to_base64(pil_image):
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


def mask_to_box(mask_np):
    mask_coords = np.argwhere(mask_np)
    y0, x0 = mask_coords.min(axis=0)
    y1, x1 = mask_coords.max(axis=0) + 1

    h = y1 - y0
    w = x1 - x0

    return x0, y0, w, h


def query(messages):
    # Adjusted to use the Azure OpenAI client with the specified parameters
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": content}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    message = response.choices[0].message.content
    return message


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate model outputs")
    parser.add_argument("--pred", type=str, help="Path to the model")
    parser.add_argument("--min_box_w", type=int, help="Minimum width", default=56)
    parser.add_argument("--min_box_h", type=int, help="Minimum height", default=56)
    parser.add_argument(
        "--image_folder", type=str, default="evaluation/GAR-Bench/annotations"
    )
    args = parser.parse_args()

    with open(args.pred, "r") as f:
        data = json.load(f)

    output_json = []
    total = 0
    true = 0

    for item in tqdm(data):
        total = total + 1
        model_output = item["model_output"]

        subject_name = item["subject_name"]
        object_name = item["object_name"]
        predicate_name = item["predicate_name"]
        model_output = item["model_output"]
        prompt = prompt_ann.format(
            subject_name=subject_name,
            object_name=object_name,
            predicate_name=predicate_name,
            model_output=model_output,
        )

        img = Image.open(os.path.join(args.image_folder, item["image"]))

        img_np = np.array(img)
        base64_image = encode_pil_image_to_base64(img)
        content = [
            {"type": "text", "text": "\n1. The original image:\n"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
            },
        ]

        for mask_idx, mask_rle in enumerate(item["mask_rles"]):
            mask_np = mask_utils.decode(mask_rle).astype(np.uint8)
            pil_mask = Image.fromarray((mask_np * 255).astype(np.uint8))

            assert (
                img_np.shape[:2] == mask_np.shape
            ), f"image shape mismatches with mask shape: {img_np.shape}, {mask_np.shape}"
            img_h, img_w = img_np.shape[:2]

            x0, y0, w, h = mask_to_box(mask_np)
            xc, yc = x0 + w / 2, y0 + h / 2

            # focal_crop: need to have at least min_box_w and min_box_h pixels, otherwise resizing to (384, 384) leads to artifacts that may be OOD
            w, h = max(w, args.min_box_w), max(h, args.min_box_h)
            x0, y0 = int(xc - w / 2), int(yc - h / 2)

            cropped_mask_np = mask_np[
                max(y0 - h, 0) : min(y0 + 2 * h, img_h),
                max(x0 - w, 0) : min(x0 + 2 * w, img_w),
            ]
            cropped_img_np = img_np[
                max(y0 - h, 0) : min(y0 + 2 * h, img_h),
                max(x0 - w, 0) : min(x0 + 2 * w, img_w),
            ]

            cropped_pil_img = Image.fromarray(cropped_img_np)
            cropped_pil_mask = Image.fromarray((cropped_mask_np * 255).astype(np.uint8))

            base64_cropped_image = encode_pil_image_to_base64(cropped_pil_img)
            base64_cropped_mask = encode_pil_image_to_base64(cropped_pil_mask)

            content.extend(
                [
                    {
                        "type": "text",
                        "text": f"\n{2 * mask_idx + 2}. <Prompt{mask_idx}>:\n",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_cropped_image}"
                        },
                    },
                    {
                        "type": "text",
                        "text": f"\n{2 * mask_idx + 3}. The mask of <Prompt{mask_idx}>:\n",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_cropped_mask}"
                        },
                    },
                ]
            )

        content.append({"type": "text", "text": prompt})

        messages = [{"role": "user", "content": content}]

        outputs = query(messages)
        print(outputs)
        if outputs == "True":
            true = true + 1
        item.update({"eval_result": outputs})
        output_json.append(item)

    print("Accuracy: ", true / total)
    with open(args.pred.replace(".json", "_eval.json"), "w") as f:
        json.dump(output_json, f, indent=4)