# GUI-Agent / app.py
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
from PIL import Image
from io import BytesIO
import torch
import base64
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

app = FastAPI()

# Load model and processor once at startup (module scope)
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    # bfloat16 on GPU, float32 on CPU
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    # flash_attention_2 requires the flash-attn package and a CUDA GPU;
    # None falls back to the default attention implementation.
    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
).eval()
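
# Note (assumption, not verified against the gui_actor source): the
# pred["topk_points"] returned by gui_actor.inference appears to hold (x, y)
# pairs normalized to [0, 1] relative to the screenshot size, which is why the
# endpoints below round to 4 decimal places; callers would multiply by the
# image width/height to recover pixel coordinates.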
@app.post("/click_base64")
async def predict_click_base64(
image_base64: str = Form(...),
instruction: str = Form(...)
):
    # Decode base64 image (strip an optional "data:image/...;base64," prefix)
    try:
        if "," in image_base64:
            image_base64 = image_base64.split(",")[1]
        image_data = base64.b64decode(image_base64)
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
    except Exception as e:
        return JSONResponse(status_code=400, content={"error": f"Invalid image format: {str(e)}"})
    # Prepare the chat-style conversation expected by the processor
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]
    # Inference: take the highest-scoring of the top-3 predicted points
    try:
        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
        px, py = pred["topk_points"][0]
        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
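
# Example request against /click_base64 (a sketch; assumes the app is served
# on localhost:7860, the usual Hugging Face Spaces port — adjust as needed):
#
#   import base64, requests
#   with open("screenshot.png", "rb") as f:
#       image_b64 = base64.b64encode(f.read()).decode()
#   resp = requests.post(
#       "http://localhost:7860/click_base64",
#       data={"image_base64": image_b64, "instruction": "Click the search bar"},
#   )
#   print(resp.json())  # e.g. {"x": 0.4321, "y": 0.0987}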
@app.post("/click")
async def predict_click(image: UploadFile, instruction: str = Form(...)):
# Load image
contents = await image.read()
pil_image = Image.open(BytesIO(contents)).convert("RGB")
    # Prepare the chat-style conversation expected by the processor
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]
    # Inference: return the top-1 predicted click point, mirroring the
    # error handling of /click_base64
    try:
        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
        px, py = pred["topk_points"][0]
        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
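
# Example request against /click with a multipart upload (a sketch; same
# localhost:7860 assumption as above):
#
#   import requests
#   with open("screenshot.png", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/click",
#           files={"image": f},
#           data={"instruction": "Click the search bar"},
#       )
#   print(resp.json())

if __name__ == "__main__":
    # Optional local entry point (an assumption: requires uvicorn to be
    # installed; on Spaces the server is normally launched by the runtime).
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)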