Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Community

abiyyufahri committed on Jul 24

Commit

6b36184

verified ·

1 Parent(s): 89b8ede

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -141

app.py CHANGED Viewed

@@ -1,13 +1,14 @@
-import os
-import spaces
-import gradio as gr
 from PIL import Image
 import torch
 import re
 import logging
-from typing import Tuple, List
-import base64
-from io import BytesIO
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -20,7 +21,7 @@ tokenizer = None
 model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
 model_loaded = False
-def load_model():
     """Load model with proper error handling and fallback strategies"""
     global model, processor, tokenizer, model_loaded
@@ -39,8 +40,8 @@ def load_model():
             model = Qwen2VLForConditionalGeneration.from_pretrained(
                 model_name,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None,
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             ).eval()
@@ -52,17 +53,17 @@ def load_model():
             logger.info("Trying AutoProcessor and AutoModel fallback...")
             try:
-                from transformers import AutoProcessor, AutoModelForVision2Seq
                 processor = AutoProcessor.from_pretrained(
                     model_name,
                     trust_remote_code=True
                 )
-                model = AutoModelForVision2Seq.from_pretrained(
                     model_name,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                    device_map="auto" if torch.cuda.is_available() else None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
@@ -74,7 +75,7 @@ def load_model():
                 logger.info("Trying generic transformers approach...")
                 # Last fallback - try loading as generic model
-                from transformers import AutoConfig, AutoProcessor
                 import transformers
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
@@ -96,8 +97,8 @@ def load_model():
                 model = ModelClass.from_pretrained(
                     model_name,
                     config=config,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                    device_map="auto" if torch.cuda.is_available() else None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
@@ -116,8 +117,30 @@ def load_model():
         model_loaded = False
         return False
-def extract_coordinates(text: str) -> List[Tuple[float, float]]:
-    """Extract coordinates from model output text"""
     # Pattern untuk mencari koordinat dalam berbagai format
     patterns = [
         r'click\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',  # click(x, y)
@@ -143,38 +166,11 @@ def extract_coordinates(text: str) -> List[Tuple[float, float]]:
     # Default ke center jika tidak ditemukan
     return [(0.5, 0.5)]
-@spaces.GPU  # Decorator untuk menggunakan GPU di Hugging Face Spaces
-def inference(pil_image: Image.Image, instruction: str):
-    """Inference function with Spaces GPU support"""
-    if not model_loaded:
-        return "Model not loaded properly", 0.5, 0.5
     try:
-        conversation = [
-            {
-                "role": "system",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
-                    }
-                ]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "image": pil_image,
-                    },
-                    {
-                        "type": "text",
-                        "text": instruction,
-                    },
-                ],
-            },
-        ]
         # Apply chat template
         text = processor.apply_chat_template(
             conversation,
@@ -190,15 +186,11 @@ def inference(pil_image: Image.Image, instruction: str):
             text=[text],
             images=[image],
             return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512
         )
-        # Move inputs to the same device as model
-        if torch.cuda.is_available():
-            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
         # Generate response with proper error handling
         with torch.no_grad():
             try:
@@ -226,97 +218,119 @@ def inference(pil_image: Image.Image, instruction: str):
         # Extract coordinates
         coordinates = extract_coordinates(response)
-        px, py = coordinates[0]
-        return response, round(px, 4), round(py, 4)
     except Exception as e:
         logger.error(f"Inference error: {e}")
-        return f"Error during inference: {str(e)}", 0.5, 0.5
-def process_image(image: Image.Image, instruction: str):
-    """Process the uploaded image and instruction"""
-    if image is None:
-        return "Please upload an image", 0.5, 0.5
-    if not instruction.strip():
-        return "Please provide an instruction", 0.5, 0.5
-    # Convert image to RGB if needed
-    if image.mode != "RGB":
-        image = image.convert("RGB")
-    # Run inference
-    response, x, y = inference(image, instruction)
-    return response, x, y
-# Load model on startup
-logger.info("Loading model...")
-load_model()
-# Create Gradio interface
-with gr.Blocks(title="GUI-Actor Click Prediction", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# GUI-Actor Click Prediction")
-    gr.Markdown("Upload a screenshot and provide instructions to get click coordinates prediction.")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(
-                type="pil",
-                label="Upload Screenshot",
-                height=400
-            )
-            instruction_input = gr.Textbox(
-                label="Instruction",
-                placeholder="e.g., Click on the login button",
-                lines=3
-            )
-            submit_btn = gr.Button("Predict Click Location", variant="primary")
-        with gr.Column():
-            response_output = gr.Textbox(
-                label="Model Response",
-                lines=5,
-                interactive=False
-            )
-            with gr.Row():
-                x_output = gr.Number(
-                    label="X Coordinate (normalized)",
-                    precision=4,
-                    interactive=False
-                )
-                y_output = gr.Number(
-                    label="Y Coordinate (normalized)",
-                    precision=4,
-                    interactive=False
-                )
-    # Status indicator
-    with gr.Row():
-        gr.Markdown(f"**Model Status:** {'✅ Loaded' if model_loaded else '❌ Not Loaded'}")
-        gr.Markdown(f"**Device:** {'GPU' if torch.cuda.is_available() else 'CPU'}")
-    # Examples
-    gr.Examples(
-        examples=[
-            ["Click on the search button", None],
-            ["Select the dropdown menu", None],
-            ["Click on the submit form", None],
-        ],
-        inputs=[instruction_input, image_input],
-    )
-    # Event handlers
-    submit_btn.click(
-        fn=process_image,
-        inputs=[image_input, instruction_input],
-        outputs=[response_output, x_output, y_output]
-    )
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )

+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
 from PIL import Image
+from io import BytesIO
+import base64
 import torch
 import re
 import logging
+import asyncio
+from contextlib import asynccontextmanager
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
 model_loaded = False
+async def load_model():
     """Load model with proper error handling and fallback strategies"""
     global model, processor, tokenizer, model_loaded
             model = Qwen2VLForConditionalGeneration.from_pretrained(
                 model_name,
+                torch_dtype=torch.float32,
+                device_map=None,  # CPU only
                 trust_remote_code=True,
                 low_cpu_mem_usage=True
             ).eval()
             logger.info("Trying AutoProcessor and AutoModel fallback...")
             try:
+                from transformers import AutoProcessor, AutoModel
                 processor = AutoProcessor.from_pretrained(
                     model_name,
                     trust_remote_code=True
                 )
+                model = AutoModel.from_pretrained(
                     model_name,
+                    torch_dtype=torch.float32,
+                    device_map=None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
                 logger.info("Trying generic transformers approach...")
                 # Last fallback - try loading as generic model
+                from transformers import AutoConfig, AutoTokenizer
                 import transformers
                 config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
                 model = ModelClass.from_pretrained(
                     model_name,
                     config=config,
+                    torch_dtype=torch.float32,
+                    device_map=None,
                     trust_remote_code=True,
                     low_cpu_mem_usage=True
                 ).eval()
         model_loaded = False
         return False
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup
+    logger.info("Starting up GUI-Actor API...")
+    await load_model()
+    yield
+    # Shutdown
+    logger.info("Shutting down GUI-Actor API...")
+# Initialize FastAPI app with lifespan
+app = FastAPI(
+    title="GUI-Actor API",
+    version="1.0.0",
+    lifespan=lifespan
+)
+class Base64Request(BaseModel):
+    image_base64: str
+    instruction: str
+def extract_coordinates(text):
+    """
+    Extract coordinates from model output text
+    """
     # Pattern untuk mencari koordinat dalam berbagai format
     patterns = [
         r'click\s*\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*\)',  # click(x, y)
     # Default ke center jika tidak ditemukan
     return [(0.5, 0.5)]
+def cpu_inference(conversation, model, tokenizer, processor):
+    """
+    Inference function untuk CPU with better error handling
+    """
     try:
         # Apply chat template
         text = processor.apply_chat_template(
             conversation,
             text=[text],
             images=[image],
             return_tensors="pt",
+            padding=True,  # Enable padding
+            truncation=True,  # Enable truncation for long texts
+            max_length=512  # Set reasonable max length
         )
         # Generate response with proper error handling
         with torch.no_grad():
             try:
         # Extract coordinates
         coordinates = extract_coordinates(response)
+        return {
+            "topk_points": coordinates,
+            "response": response,
+            "success": True
+        }
     except Exception as e:
         logger.error(f"Inference error: {e}")
+        return {
+            "topk_points": [(0.5, 0.5)],
+            "response": f"Error during inference: {str(e)}",
+            "success": False
+        }
+@app.get("/")
+async def root():
+    return {
+        "message": "GUI-Actor API is running",
+        "status": "healthy",
+        "model_loaded": model_loaded,
+        "model_name": model_name
+    }
+@app.post("/click/base64")
+async def predict_click_base64(data: Base64Request):
+    if not model_loaded:
+        raise HTTPException(
+            status_code=503,
+            detail="Model not loaded properly"
+        )
+    try:
+        # Decode base64 to image
+        try:
+            # Handle data URL format
+            if "," in data.image_base64:
+                image_data = base64.b64decode(data.image_base64.split(",")[-1])
+            else:
+                image_data = base64.b64decode(data.image_base64)
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Invalid base64 image: {e}")
+        try:
+            pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
+        conversation = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
+                    }
+                ]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": pil_image,
+                    },
+                    {
+                        "type": "text",
+                        "text": data.instruction,
+                    },
+                ],
+            },
+        ]
+        # Run inference
+        pred = cpu_inference(conversation, model, tokenizer, processor)
+        px, py = pred["topk_points"][0]
+        return JSONResponse(content={
+            "x": round(px, 4),
+            "y": round(py, 4),
+            "response": pred["response"],
+            "success": pred["success"]
+        })
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Prediction error: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Internal server error: {str(e)}"
+        )
+@app.get("/health")
+async def health_check():
+    return {
+        "status": "healthy" if model_loaded else "unhealthy",
+        "model": model_name,
+        "device": "cpu",
+        "torch_dtype": "float32",
+        "model_loaded": model_loaded
+    }
+@app.get("/debug")
+async def debug_info():
+    """Debug endpoint to check model loading status"""
+    import transformers
+    available_classes = [attr for attr in dir(transformers) if 'Qwen' in attr or 'VL' in attr]
+    return {
+        "model_loaded": model_loaded,
+        "processor_type": type(processor).__name__ if processor else None,
+        "model_type": type(model).__name__ if model else None,
+        "available_qwen_classes": available_classes,
+        "transformers_version": transformers.__version__
+    }