# Hugging Face Spaces status banner captured by extraction (not code):
# "Spaces: Running / Running"
import base64
from io import BytesIO

import torch
from fastapi import FastAPI, Form, HTTPException
from fastapi.responses import JSONResponse
from PIL import Image
from pydantic import BaseModel
from transformers import Qwen2VLProcessor

from gui_actor.inference import inference
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
app = FastAPI()

# Load the model once at import time so every request reuses the same weights.
# NOTE(review): startup blocks until the checkpoint is downloaded/loaded —
# confirm this is acceptable for the deployment target.
model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
processor = Qwen2VLProcessor.from_pretrained(model_name)
tokenizer = processor.tokenizer  # reuse the processor's tokenizer for inference()
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # use float32 for CPU
    device_map=None,  # don't map to cuda
    attn_implementation=None,  # use the library's default attention implementation
).eval()  # inference mode: disables dropout etc.
class Base64Request(BaseModel):
    """Request payload: a base64-encoded screenshot plus a task instruction."""

    # Base64-encoded image; a data-URI prefix ("data:image/png;base64,...")
    # is tolerated — the handler splits on "," and decodes the last segment.
    image_base64: str
    # Natural-language task description to ground on the screenshot.
    instruction: str
# NOTE(review): the extracted source had no route decorator, so this handler was
# never registered on `app` — confirm the intended HTTP method and path.
@app.post("/predict_click_base64")
async def predict_click_base64(data: Base64Request):
    """Predict a click point for a GUI instruction on a base64-encoded screenshot.

    Args:
        data: JSON body with ``image_base64`` (optionally a data URI) and
            ``instruction`` (the task to ground on the screenshot).

    Returns:
        JSONResponse ``{"x": ..., "y": ...}`` — the top-1 predicted point,
        rounded to 4 decimal places. Values come straight from
        ``pred["topk_points"][0]``; presumably normalized coordinates — verify
        against gui_actor's inference contract.

    Raises:
        HTTPException: 400 if the payload cannot be decoded into an image.
    """
    # Strip an optional "data:image/...;base64," prefix before decoding.
    # A malformed payload should be a client error (400), not a server 500.
    try:
        image_data = base64.b64decode(data.image_base64.split(",")[-1])
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
    except Exception as exc:  # bad base64 or bytes that are not a readable image
        raise HTTPException(status_code=400, detail=f"Invalid image payload: {exc}") from exc

    # Qwen2-VL chat format: system prompt, then the screenshot + instruction.
    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": data.instruction},
            ],
        },
    ]

    pred = inference(conversation, model, tokenizer, processor, use_placeholder=True, topk=3)
    px, py = pred["topk_points"][0]  # top-1 candidate of the topk=3 predictions
    return JSONResponse(content={"x": round(px, 4), "y": round(py, 4)})