# GUI-Agent / app.py
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import JSONResponse
from PIL import Image
from io import BytesIO
import torch
import base64
from transformers import Qwen2VLProcessor
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

app = FastAPI()

# Load model and processor once at startup (module scope)
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    # bfloat16 on GPU, float32 on CPU
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    # flash_attention_2 requires the flash-attn package and a CUDA GPU;
    # None falls back to the default attention implementation.
    attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
).eval()
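
# Note (assumption, not verified against the gui_actor source): the
# pred["topk_points"] returned by gui_actor.inference appears to hold (x, y)
# pairs normalized to [0, 1] relative to the screenshot size, which is why the
# endpoints below round to 4 decimal places; callers would multiply by the
# image width/height to recover pixel coordinates.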
@app.post("/click_base64")
async def predict_click_base64(
image_base64: str = Form(...),
instruction: str = Form(...)
):
    # Decode base64 image (strip an optional "data:image/...;base64," prefix)
    try:
        if "," in image_base64:
            image_base64 = image_base64.split(",")[1]
        image_data = base64.b64decode(image_base64)
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
    except Exception as e:
        return JSONResponse(status_code=400, content={"error": f"Invalid image format: {str(e)}"})
    # Prepare the chat-style conversation expected by the processor
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]
    # Inference: take the highest-scoring of the top-3 predicted points
    try:
        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
        px, py = pred["topk_points"][0]
        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
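
# Example request against /click_base64 (a sketch; assumes the app is served
# on localhost:7860, the usual Hugging Face Spaces port — adjust as needed):
#
#   import base64, requests
#   with open("screenshot.png", "rb") as f:
#       image_b64 = base64.b64encode(f.read()).decode()
#   resp = requests.post(
#       "http://localhost:7860/click_base64",
#       data={"image_base64": image_b64, "instruction": "Click the search bar"},
#   )
#   print(resp.json())  # e.g. {"x": 0.4321, "y": 0.0987}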
@app.post("/click")
async def predict_click(image: UploadFile, instruction: str = Form(...)):
# Load image
contents = await image.read()
pil_image = Image.open(BytesIO(contents)).convert("RGB")
    # Prepare the chat-style conversation expected by the processor
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": pil_image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]
    # Inference: return the top-1 predicted click point, mirroring the
    # error handling of /click_base64
    try:
        pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
        px, py = pred["topk_points"][0]
        return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": f"Inference failed: {str(e)}"})
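
# Example request against /click with a multipart upload (a sketch; same
# localhost:7860 assumption as above):
#
#   import requests
#   with open("screenshot.png", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/click",
#           files={"image": f},
#           data={"instruction": "Click the search bar"},
#       )
#   print(resp.json())

if __name__ == "__main__":
    # Optional local entry point (an assumption: requires uvicorn to be
    # installed; on Spaces the server is normally launched by the runtime).
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)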