Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

abiyyufahri committed on Jul 24

Commit

0b96209

1 Parent(s): e670b79

Install error fix attempt 6

Browse files

Files changed (3) hide show

Dockerfile +12 -13
app.py +113 -37
requirements.txt +5 -4

Dockerfile CHANGED Viewed

@@ -1,30 +1,25 @@
-FROM nvidia/cuda:12.1-devel-ubuntu22.04
-# Install Python 3.10
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.10 python3.10-dev python3-pip python3.10-venv \
     git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
     build-essential curl && \
     rm -rf /var/lib/apt/lists/*
-# Create symbolic links for python
-RUN ln -s /usr/bin/python3.10 /usr/bin/python && \
-    ln -s /usr/bin/python3.10 /usr/bin/python3
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
-# Install dependencies step by step untuk menghindari konflik
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir packaging ninja wheel setuptools numpy
-# Install PyTorch dengan CUDA support
-RUN pip install --no-cache-dir torch==2.2.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
-# Install dependencies lain sebelum GUI-Actor
 RUN pip install --no-cache-dir \
     transformers \
     datasets \
@@ -35,8 +30,12 @@ RUN pip install --no-cache-dir \
     fastapi \
     "uvicorn[standard]"
-# Install GUI-Actor package terakhir (includes flash-attn)
-RUN pip install --no-cache-dir "git+https://github.com/microsoft/GUI-Actor.git"
 COPY --chown=user . .

+FROM python:3.10-slim
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git gcc g++ libglib2.0-0 libsm6 libxext6 libxrender-dev \
     build-essential curl && \
     rm -rf /var/lib/apt/lists/*
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
 WORKDIR /app
+COPY --chown=user requirements.txt ./
+# Install dependencies step by step
 RUN pip install --upgrade pip && \
     pip install --no-cache-dir packaging ninja wheel setuptools numpy
+# Install PyTorch CPU version
+RUN pip install --no-cache-dir torch==2.2.2+cpu torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+# Install core dependencies
 RUN pip install --no-cache-dir \
     transformers \
     datasets \
     fastapi \
     "uvicorn[standard]"
+# Install GUI-Actor dependencies manually (skip flash-attn)
+RUN pip install --no-cache-dir \
+    pre-commit \
+    liger-kernel==0.5.2 \
+    opencv-python-headless \
+    deepspeed==0.16.0
 COPY --chown=user . .

app.py CHANGED Viewed

@@ -5,36 +5,119 @@ from PIL import Image
 from io import BytesIO
 import base64
 import torch
-# Import sesuai dokumentasi GUI-Actor
-from qwen_vl_utils import process_vision_info
-from transformers import Qwen2VLProcessor
-from gui_actor.constants import chat_template
-from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
-from gui_actor.inference import inference
 app = FastAPI()
-# Load model sesuai dokumentasi
-model_name_or_path = "microsoft/GUI-Actor-2B-Qwen2-VL"
-data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
-tokenizer = data_processor.tokenizer
-# Modifikasi untuk CPU atau GPU
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
-model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
-    model_name_or_path,
-    torch_dtype=torch_dtype,
-    device_map=device if device == "cuda" else None,
-    attn_implementation="flash_attention_2" if device == "cuda" else None
 ).eval()
 class Base64Request(BaseModel):
     image_base64: str
     instruction: str
 @app.post("/click/base64")
 async def predict_click_base64(data: Base64Request):
     try:
@@ -48,7 +131,7 @@ async def predict_click_base64(data: Base64Request):
                 "content": [
                     {
                         "type": "text",
-                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.",
                     }
                 ]
             },
@@ -67,30 +150,24 @@ async def predict_click_base64(data: Base64Request):
             },
         ]
-        # Inference menggunakan fungsi dari GUI-Actor
-        pred = inference(
-            conversation,
-            model,
-            tokenizer,
-            data_processor,
-            use_placeholder=True,
-            topk=3
-        )
         px, py = pred["topk_points"][0]
         return JSONResponse(content={
             "x": round(px, 4),
             "y": round(py, 4),
-            "all_points": [[round(x, 4), round(y, 4)] for x, y in pred["topk_points"]],
-            "success": True
         })
     except Exception as e:
         return JSONResponse(
             content={
                 "error": str(e),
-                "success": False
             },
             status_code=500
         )
@@ -99,12 +176,11 @@ async def predict_click_base64(data: Base64Request):
 async def health_check():
     return {
         "status": "healthy",
-        "model": model_name_or_path,
-        "device": device,
-        "torch_dtype": str(torch_dtype)
     }
-# Endpoint tambahan untuk testing dengan form data
 @app.post("/click/form")
 async def predict_click_form(
     image_base64: str = Form(...),

 from io import BytesIO
 import base64
 import torch
+import re
+from transformers import AutoModelForCausalLM, AutoProcessor
 app = FastAPI()
+# Load the model for CPU
+model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
+# Load processor
+try:
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+except Exception as e:
+    print(f"Failed to load AutoProcessor: {e}")
+    from transformers import Qwen2VLProcessor
+    processor = Qwen2VLProcessor.from_pretrained(model_name)
+tokenizer = processor.tokenizer
+# Load the model with CPU support
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float32,  # float32 untuk CPU
+    device_map=None,            # CPU only
+    trust_remote_code=True,     # untuk custom model
+    attn_implementation=None    # skip flash attention
 ).eval()
 class Base64Request(BaseModel):
     image_base64: str
     instruction: str
# A non-negative integer or decimal number, captured as one group.
_NUMBER = r"(\d+(?:\.\d+)?)"

# Ordered from most to least specific: the bare "x, y" form matches almost any
# pair of comma-separated numbers, so it must be tried last or it would shadow
# the explicit forms (and pick up unrelated text such as "1,000").
_COORD_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        rf"click\s*\(\s*{_NUMBER}\s*,\s*{_NUMBER}\s*\)",   # click(x, y)
        rf"point:\s*\(\s*{_NUMBER}\s*,\s*{_NUMBER}\s*\)",  # point: (x, y)
        rf"\[\s*{_NUMBER}\s*,\s*{_NUMBER}\s*\]",           # [x, y]
        rf"{_NUMBER}\s*,\s*{_NUMBER}",                     # x, y
    )
)


def extract_coordinates(text, screen_width=1920, screen_height=1080):
    """Extract a single click coordinate from free-form model output.

    The text is scanned for the first occurrence of ``click(x, y)``,
    ``point: (x, y)``, ``[x, y]`` or a bare ``x, y`` pair, tried in that
    order (case-insensitively).

    If either component is greater than 1 the pair is treated as pixel
    coordinates and *both* components are divided by the assumed screen
    size; otherwise the pair is taken to be normalized already. The result
    is clamped to the ``[0, 1]`` range.

    Args:
        text: Model output to scan. ``None`` or an empty string yields the
            default centre point.
        screen_width: Assumed screen width in pixels used to normalize
            pixel coordinates.
        screen_height: Assumed screen height in pixels used to normalize
            pixel coordinates.

    Returns:
        A one-element list ``[(x, y)]`` of floats in ``[0, 1]``. When no
        coordinate pair is found the centre ``[(0.5, 0.5)]`` is returned.

    Raises:
        ValueError: If ``screen_width`` or ``screen_height`` is not positive.
    """
    if screen_width <= 0 or screen_height <= 0:
        raise ValueError("screen_width and screen_height must be positive")

    if not text:
        return [(0.5, 0.5)]

    for pattern in _COORD_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue

        # The regex only captures digit sequences, so float() cannot fail.
        x, y = float(match.group(1)), float(match.group(2))

        # Decide pixel-vs-normalized for the pair as a whole; normalizing the
        # axes independently would leave e.g. pixel x == 1 at the right edge.
        if x > 1 or y > 1:
            x /= screen_width
            y /= screen_height

        # Pixel values beyond the assumed resolution would otherwise leave
        # the normalized range.
        return [(min(max(x, 0.0), 1.0), min(max(y, 0.0), 1.0))]

    # Fall back to the screen centre when nothing was recognised.
    return [(0.5, 0.5)]
def _find_conversation_image(conversation):
    """Return the first ``"image"`` payload found in the conversation.

    Messages whose ``"content"`` is not a list (e.g. a plain string) are
    skipped. Raises ``ValueError`` when no content item carries an image.
    """
    for message in conversation:
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, list):
            continue
        for item in content:
            if isinstance(item, dict) and item.get("image") is not None:
                return item["image"]
    raise ValueError("No image found in conversation")


def cpu_inference(conversation, model, tokenizer, processor):
    """Run text generation on a conversation and extract click coordinates.

    The image is looked up anywhere in the conversation rather than at a
    fixed message/content index, so the call does not depend on a system
    message being present or on the image preceding the text.

    Args:
        conversation: Sequence of chat messages; each message is expected to
            be a dict whose ``"content"`` is a list of item dicts, one of
            which has an ``"image"`` entry.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Object exposing ``eos_token_id`` and ``decode``.
        processor: Object exposing ``apply_chat_template`` and callable with
            ``text=``, ``images=`` and ``return_tensors=``.

    Returns:
        A dict with keys ``"topk_points"`` (list of ``(x, y)`` tuples),
        ``"response"`` (decoded model text, or an error description) and
        ``"success"`` (bool). Any exception is reported through this dict
        with the centre point ``(0.5, 0.5)`` rather than raised.
    """
    try:
        # Validate the input before doing any expensive work.
        image = _find_conversation_image(conversation)

        # Render the conversation into a prompt string.
        text = processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt",
        )

        # No gradients are needed for generation.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.3,
                top_p=0.8,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True)

        return {
            "topk_points": extract_coordinates(response),
            "response": response,
            "success": True,
        }
    except Exception as e:
        # Best-effort contract: report the failure in the payload and fall
        # back to the screen centre instead of propagating.
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
            "success": False,
        }
 @app.post("/click/base64")
 async def predict_click_base64(data: Base64Request):
     try:
                 "content": [
                     {
                         "type": "text",
+                        "text": "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task. Please provide the click coordinates.",
                     }
                 ]
             },
             },
         ]
+        # Run inference
+        pred = cpu_inference(conversation, model, tokenizer, processor)
         px, py = pred["topk_points"][0]
         return JSONResponse(content={
             "x": round(px, 4),
             "y": round(py, 4),
+            "response": pred["response"],
+            "success": pred["success"]
         })
     except Exception as e:
         return JSONResponse(
             content={
                 "error": str(e),
+                "success": False,
+                "x": 0.5,
+                "y": 0.5
             },
             status_code=500
         )
 async def health_check():
     return {
         "status": "healthy",
+        "model": model_name,
+        "device": "cpu",
+        "torch_dtype": "float32"
     }
 @app.post("/click/form")
 async def predict_click_form(
     image_base64: str = Form(...),

requirements.txt CHANGED Viewed

@@ -2,13 +2,14 @@ packaging
 ninja
 fastapi
 uvicorn[standard]
-transformers
 datasets
 Pillow
-torch==2.2.2
 torchvision
 torchaudio
 accelerate
 scipy
-qwen-vl-utils
-git+https://github.com/microsoft/GUI-Actor.git

 ninja
 fastapi
 uvicorn[standard]
+transformers>=4.37.0
 datasets
 Pillow
+torch==2.2.2+cpu
 torchvision
 torchaudio
+--index-url https://download.pytorch.org/whl/cpu
 accelerate
 scipy
+numpy
+qwen-vl-utils