# Hugging Face Spaces status banner captured by extraction (not code):
# "Spaces: Running / Running"
import base64
from io import BytesIO

import torch
from fastapi import FastAPI, Form, HTTPException
from fastapi.responses import JSONResponse
from PIL import Image
from pydantic import BaseModel
from transformers import Qwen2VLProcessor

from gui_actor.inference import inference
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
app = FastAPI()

# Load the model once at import time so every request reuses the same weights.
# NOTE(review): startup blocks until the checkpoint is downloaded/loaded —
# confirm this is acceptable for the deployment target.
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer  # reuse the processor's tokenizer for inference()
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # use float32 for CPU
    device_map=None,  # don't map to cuda
    attn_implementation=None,  # use the library's default attention implementation
).eval()  # inference mode: disables dropout etc.
class Base64Request(BaseModel):
    """Request payload: a base64-encoded screenshot plus a task instruction."""

    # Base64-encoded image; a data-URI prefix ("data:image/png;base64,...")
    # is tolerated — the handler splits on "," and decodes the last segment.
    image_base64: str
    # Natural-language task description to ground on the screenshot.
    instruction: str
# NOTE(review): the extracted source had no route decorator, so this handler was
# never registered on `app` — confirm the intended HTTP method and path.
@app.post("/predict_click_base64")
async def predict_click_base64(data: Base64Request):
    """Predict a click point for a GUI instruction on a base64-encoded screenshot.

    Args:
        data: JSON body with ``image_base64`` (optionally a data URI) and
            ``instruction`` (the task to ground on the screenshot).

    Returns:
        JSONResponse ``{"x": ..., "y": ...}`` — the top-1 predicted point,
        rounded to 4 decimal places. Values come straight from
        ``pred["topk_points"][0]``; presumably normalized coordinates — verify
        against gui_actor's inference contract.

    Raises:
        HTTPException: 400 if the payload cannot be decoded into an image.
    """
    # Strip an optional "data:image/...;base64," prefix before decoding.
    # A malformed payload should be a client error (400), not a server 500.
    try:
        image_data = base64.b64decode(data.image_base64.split(",")[-1])
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
    except Exception as exc:  # bad base64 or bytes that are not a readable image
        raise HTTPException(status_code=400, detail=f"Invalid image payload: {exc}") from exc

    # Qwen2-VL chat format: system prompt, then the screenshot + instruction.
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": data.instruction},
            ],
        },
    ]

    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
    px, py = pred["topk_points"][0]  # top-1 candidate of the topk=3 predictions
    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})