# Spaces: Sleeping  (status banner captured from the hosting page; not part of the program)
import base64
from io import BytesIO

import torch
from fastapi import FastAPI, Form
from fastapi.responses import JSONResponse
from PIL import Image
from pydantic import BaseModel

# Imports as given in the GUI-Actor documentation.
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLProcessor
from gui_actor.constants import chat_template
from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
from gui_actor.inference import inference
# ASGI application object; it is handed to ``uvicorn.run`` in the
# ``__main__`` guard at the bottom of the file.
app = FastAPI()

# Load the processor and model as described in the GUI-Actor documentation.
model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
tokenizer = data_processor.tokenizer

# Adapted so the service starts on CPU-only hosts as well as GPU hosts:
# bfloat16 on CUDA, float32 otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32

# NOTE(review): "flash_attention_2" presumably requires the flash-attn
# package to be installed on GPU hosts -- confirm it is part of the image.
model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
    model_name_or_path,
    torch_dtype=torch_dtype,
    # Only pass a device map / attention backend when CUDA is available.
    device_map=device if device == "cuda" else None,
    attn_implementation="flash_attention_2" if device == "cuda" else None,
).eval()  # evaluation mode: the model is only used for prediction here
class Base64Request(BaseModel):
    """JSON request body for a click prediction."""

    # Base64-encoded screenshot. A "data:...;base64," prefix is tolerated:
    # the handler keeps only the text after the last comma before decoding.
    image_base64: str
    # Natural-language task; sent to the model as the user's text message.
    instruction: str
_SYSTEM_PROMPT = (
    "You are a GUI agent. You are given a task and a screenshot of the screen. "
    "You need to perform a series of pyautogui actions to complete the task."
)


def _decode_base64_image(image_base64: str):
    """Decode a base64 string (optionally a ``data:`` URL) into an RGB PIL image.

    Raises:
        ValueError: the payload is not valid base64 (``binascii.Error`` is a
            ``ValueError`` subclass).
        OSError: Pillow cannot read the decoded bytes as an image
            (``UnidentifiedImageError`` is an ``OSError`` subclass).
    """
    # Keep only what follows the last comma so that
    # "data:image/png;base64,<payload>" and a bare "<payload>" both work.
    payload = image_base64.rpartition(",")[2]
    raw_bytes = base64.b64decode(payload)
    return Image.open(BytesIO(raw_bytes)).convert("RGB")


def _build_conversation(pil_image, instruction: str) -> list:
    """Return the system + user messages handed to GUI-Actor's ``inference``."""
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": _SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_image},
            {"type": "text", "text": instruction},
        ],
    }
    return [system_message, user_message]


def _error_response(error: Exception, status_code: int):
    """Build the JSON error payload shared by every failure path."""
    return JSONResponse(
        content={"error": str(error), "success": False},
        status_code=status_code,
    )


# NOTE(review): no route decorator is visible on this handler in this view;
# an ``@app.post(...)`` line was presumably lost in extraction. Confirm the
# path, otherwise the function is never registered on ``app``.
async def predict_click_base64(data: Base64Request):
    """Predict click coordinates for a base64-encoded screenshot.

    Args:
        data: Request body carrying the screenshot and the instruction.

    Returns:
        JSONResponse: on success ``{"x", "y", "all_points", "success": True}``
        where ``x``/``y`` are the first of the top-3 predicted points and all
        coordinates are rounded to 4 decimals. On failure
        ``{"error", "success": False}`` with status 400 when the image cannot
        be decoded and status 500 for any other error.
    """
    # Stage 1: client-supplied data. Malformed input is the caller's fault,
    # so it is reported as 400 rather than as a server error.
    try:
        pil_image = _decode_base64_image(data.image_base64)
    except (ValueError, OSError) as exc:
        return _error_response(exc, 400)
    except Exception as exc:
        return _error_response(exc, 500)

    # Stage 2: inference using the function provided by GUI-Actor.
    # NOTE(review): ``inference`` is called synchronously inside this
    # coroutine, so the event loop is busy for the duration of the call.
    try:
        pred = inference(
            _build_conversation(pil_image, data.instruction),
            model,
            tokenizer,
            data_processor,
            use_placeholder=True,
            topk=3,
        )
        # Coerce to built-in floats so JSON serialization does not depend on
        # the numeric type ``inference`` happens to return.
        points = [
            [round(float(x), 4), round(float(y), 4)]
            for x, y in pred["topk_points"]
        ]
        best_x, best_y = points[0]
        return JSONResponse(content={
            "x": best_x,
            "y": best_y,
            "all_points": points,
            "success": True,
        })
    except Exception as e:
        return _error_response(e, 500)
# NOTE(review): no route decorator is visible on this handler in this view;
# an ``@app.get(...)`` line was presumably lost in extraction. Confirm the
# path, otherwise the function is never registered on ``app``.
async def health_check():
    """Report a static "healthy" status plus the model and runtime settings.

    Returns:
        dict: the model identifier, the selected device ("cuda" or "cpu") and
        the torch dtype rendered as a string.
    """
    return {
        "status": "healthy",
        "model": model_name_or_path,
        "device": device,
        "torch_dtype": str(torch_dtype),
    }
# Additional endpoint for testing with form data.
# NOTE(review): no route decorator is visible on this handler in this view;
# an ``@app.post(...)`` line was presumably lost in extraction. Confirm the
# path, otherwise the function is never registered on ``app``.
async def predict_click_form(
    image_base64: str = Form(...),
    instruction: str = Form(...),
):
    """Form-data variant of the click prediction endpoint.

    Wraps the two required form fields in a ``Base64Request`` and delegates
    to ``predict_click_base64``, so the response is identical to the JSON
    endpoint's.
    """
    request_body = Base64Request(
        image_base64=image_base64,
        instruction=instruction,
    )
    return await predict_click_base64(request_body)
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run as a script.
    import uvicorn

    # Listen on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)