Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

GUI-Agent / app.py

abiyyufahri

Install error fix attempt 6

e670b79 5 months ago

raw

history blame

3.55 kB

	from fastapi import FastAPI, Form
	from fastapi.responses import JSONResponse
	from pydantic import BaseModel
	from PIL import Image
	from io import BytesIO
	import base64
	import torch

	# Imports as described in the GUI-Actor documentation
	from qwen_vl_utils import process_vision_info
	from transformers import Qwen2VLProcessor
	from gui_actor.constants import chat_template
	from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
	from gui_actor.inference import inference

	# Application object that every route below is registered on.
	app = FastAPI()

	# Load the processor and tokenizer as described in the GUI-Actor documentation.
	model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
	data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
	tokenizer = data_processor.tokenizer

	# Pick device and dtype together: bfloat16 on CUDA, float32 on CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	    torch_dtype = torch.bfloat16
	else:
	    device = "cpu"
	    torch_dtype = torch.float32

	# `device` is either "cuda" or "cpu"; the CUDA-only options are passed as
	# None on CPU.
	model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
	    model_name_or_path,
	    torch_dtype=torch_dtype,
	    device_map=None if device == "cpu" else device,
	    attn_implementation=None if device == "cpu" else "flash_attention_2",
	)
	# Switch to evaluation mode; `eval()` hands back the model object.
	model = model.eval()

	class Base64Request(BaseModel):
	    """Request payload pairing a base64-encoded image with an instruction."""

	    # Base64 text of the image; the click handler keeps only the part after
	    # the last comma before decoding it.
	    image_base64: str
	    # Instruction text, sent to the model as the user's text message.
	    instruction: str

	def _error_response(exc, status_code):
	    """Build the JSON failure body shared by every error path of the click endpoint."""
	    return JSONResponse(
	        content={
	            "error": str(exc),
	            "success": False
	        },
	        status_code=status_code
	    )


	@app.post("/click/base64")
	async def predict_click_base64(data: Base64Request):
	    """Predict click points for an instruction on a base64-encoded image.

	    Args:
	        data: Request body. ``image_base64`` may carry a prefix ending in a
	            comma (for example a data-URL header); only the text after the
	            last comma is decoded. ``instruction`` is sent to the model as
	            the user's text message.

	    Returns:
	        A ``JSONResponse``. On success it contains ``x`` and ``y`` (the first
	        entry of the model's ``topk_points``), ``all_points`` (every returned
	        point) and ``success: True``; all coordinates are rounded to 4
	        decimals. On failure it contains ``error`` and ``success: False``,
	        with status 400 when the image payload cannot be decoded and status
	        500 for any other failure.
	    """
	    # Decode the payload separately so that malformed client input is
	    # reported as a client error (400) rather than a server error (500).
	    try:
	        image_data = base64.b64decode(data.image_base64.split(",")[-1])
	        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
	    except (ValueError, OSError) as e:
	        # binascii.Error is a ValueError; unreadable or truncated images
	        # surface from Pillow as OSError subclasses.
	        return _error_response(e, 400)
	    except Exception as e:
	        # Anything unexpected still yields the JSON error body.
	        return _error_response(e, 500)

	    conversation = [
	        {
	            "role": "system",
	            "content": [
	                {
	                    "type": "text",
	                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
	                }
	            ]
	        },
	        {
	            "role": "user",
	            "content": [
	                {
	                    "type": "image",
	                    "image": pil_image,
	                },
	                {
	                    "type": "text",
	                    "text": data.instruction,
	                },
	            ],
	        },
	    ]

	    try:
	        # Inference using the helper provided by GUI-Actor.
	        pred = inference(
	            conversation,
	            model,
	            tokenizer,
	            data_processor,
	            use_placeholder=True,
	            topk=3
	        )

	        # Coerce to built-in floats before rounding so the values are always
	        # JSON-serialisable, whatever numeric type the model code returns.
	        points = [
	            [round(float(x), 4), round(float(y), 4)]
	            for x, y in pred["topk_points"]
	        ]
	        px, py = points[0]

	        return JSONResponse(content={
	            "x": px,
	            "y": py,
	            "all_points": points,
	            "success": True
	        })
	    except Exception as e:
	        # Top-level boundary: report any inference failure as a JSON 500.
	        return _error_response(e, 500)

	@app.get("/health")
	async def health_check():
	    """Return a fixed "healthy" status plus the model name, device and dtype in use."""
	    report = dict(
	        status="healthy",
	        model=model_name_or_path,
	        device=device,
	    )
	    # The dtype object is reported as its string form.
	    report["torch_dtype"] = str(torch_dtype)
	    return report

	# Additional endpoint for testing with form data
	@app.post("/click/form")
	async def predict_click_form(
	    image_base64: str = Form(...),
	    instruction: str = Form(...),
	):
	    """Accept the same two fields as form data and delegate to the base64 handler."""
	    payload = Base64Request(
	        image_base64=image_base64,
	        instruction=instruction,
	    )
	    response = await predict_click_base64(payload)
	    return response

	if __name__ == "__main__":
	    # When executed as a script, serve the app on all interfaces, port 7860.
	    from uvicorn import run as serve

	    serve(app, host="0.0.0.0", port=7860)