GUI-Agent / app.py
abiyyufahri's picture
Install error fix attempt 6
e670b79
raw
history blame
3.55 kB
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from PIL import Image
from io import BytesIO
import base64
import torch
# Imports as described in the GUI-Actor documentation
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLProcessor
from gui_actor.constants import chat_template
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference
app = FastAPI()

# Load the processor and tokenizer as described in the GUI-Actor documentation.
model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
tokenizer = data_processor.tokenizer

# Adjusted to run on either GPU or CPU: bfloat16 on CUDA, float32 otherwise.
if torch.cuda.is_available():
    device = "cuda"
    torch_dtype = torch.bfloat16
else:
    device = "cpu"
    torch_dtype = torch.float32

# device_map and flash attention are only requested on CUDA; on CPU both
# arguments are passed as None.
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name_or_path,
    torch_dtype=torch_dtype,
    device_map=None if device == "cpu" else device,
    attn_implementation=None if device == "cpu" else "flash_attention_2",
)
model = model.eval()
class Base64Request(BaseModel):
    """Request payload pairing a base64-encoded image with an instruction."""

    # Base64 text of the image. The handler in this file keeps only the part
    # after the last comma, so a "data:...;base64," prefix is accepted.
    image_base64: str
    # Instruction text forwarded to the model as the user message.
    instruction: str
def _error_response(exc, status_code):
    """Build the JSON error payload shared by every failure path."""
    return JSONResponse(
        content={"error": str(exc), "success": False},
        status_code=status_code,
    )


def _build_conversation(pil_image, instruction):
    """Assemble the system/user message list handed to ``inference``."""
    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": instruction},
            ],
        },
    ]


@app.post("/click/base64")
async def predict_click_base64(data: Base64Request):
    """Predict click points for an instruction on a base64-encoded image.

    Args:
        data: Request body with the base64 image (an optional
            ``data:...;base64,`` prefix is stripped) and the instruction.

    Returns:
        A ``JSONResponse``. On success it contains ``x`` and ``y`` (the first
        entry of ``topk_points``), ``all_points`` (every returned point), each
        coordinate rounded to 4 decimals, and ``success: True``. When the
        image cannot be decoded the status code is 400; any other failure
        produces status code 500. Both error responses carry ``error`` (the
        exception text) and ``success: False``.
    """
    # Decode the image on its own so that malformed client input is reported
    # as a client error instead of an internal server error.
    try:
        # Keep only the text after the last comma; this drops a data-URL
        # prefix when one is present.
        image_data = base64.b64decode(data.image_base64.split(",")[-1])
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
    except (ValueError, OSError) as e:
        # binascii.Error (bad base64) is a ValueError; Pillow reports
        # unreadable image data with OSError subclasses.
        return _error_response(e, 400)
    except Exception as e:
        # Anything unexpected keeps the original 500 response shape.
        return _error_response(e, 500)

    try:
        # Inference using the helper provided by GUI-Actor.
        # NOTE(review): this call is synchronous, so the coroutine occupies
        # the event loop until the model returns.
        pred = inference(
            _build_conversation(pil_image, data.instruction),
            model,
            tokenizer,
            data_processor,
            use_placeholder=True,
            topk=3,
        )
        # Round every point once; the first one is the primary prediction.
        points = [[round(x, 4), round(y, 4)] for x, y in pred["topk_points"]]
        px, py = points[0]
        return JSONResponse(content={
            "x": px,
            "y": py,
            "all_points": points,
            "success": True,
        })
    except Exception as e:
        return _error_response(e, 500)
@app.get("/health")
async def health_check():
    """Return a fixed health marker plus the model name, device and dtype in use."""
    return dict(
        status="healthy",
        model=model_name_or_path,
        device=device,
        torch_dtype=str(torch_dtype),
    )
# Additional endpoint for testing with form data
@app.post("/click/form")
async def predict_click_form(
    image_base64: str = Form(...),
    instruction: str = Form(...),
):
    """Accept the image and instruction as form fields and delegate to the JSON handler.

    The two form values are wrapped in a ``Base64Request`` and passed to
    ``predict_click_base64``; its response is returned unchanged.
    """
    payload = Base64Request(
        image_base64=image_base64,
        instruction=instruction,
    )
    response = await predict_click_base64(payload)
    return response
if __name__ == "__main__":
    # uvicorn is imported here so it is only required when this file is run
    # directly as a script.
    from uvicorn import run as serve

    # Listen on all interfaces, port 7860.
    serve(app, host="0.0.0.0", port=7860)