Spaces:

abiyyufahri
/

GUI-Agent

Sleeping

App Files Files Community

abiyyufahri committed on Jul 24

Commit

5599f5a

1 Parent(s): 5ef548f

Install error fix attempt 12

Browse files

Files changed (1) hide show

main.py +121 -36

main.py CHANGED Viewed

@@ -22,36 +22,93 @@ model_name = "microsoft/GUI-Actor-2B-Qwen2-VL"
 model_loaded = False
 async def load_model():
-    """Load model with proper error handling"""
     global model, processor, tokenizer, model_loaded
     try:
         logger.info("Starting model loading...")
-        # Import required modules - use specific Qwen2VL classes
-        from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration
-        logger.info("Loading processor...")
-        # Use specific Qwen2VL processor
-        processor = Qwen2VLProcessor.from_pretrained(
-            model_name,
-            trust_remote_code=True
-        )
-        logger.info("Processor loaded successfully")
         tokenizer = processor.tokenizer
-        logger.info("Loading model...")
-        # Use specific Qwen2VL model class
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name,
-            torch_dtype=torch.float32,
-            device_map=None,  # CPU only
-            trust_remote_code=True,
-            low_cpu_mem_usage=True  # For better memory management
-        ).eval()
-        logger.info("Model loaded successfully!")
         model_loaded = True
         return True
@@ -111,7 +168,7 @@ def extract_coordinates(text):
 def cpu_inference(conversation, model, tokenizer, processor):
     """
-    Inference function untuk CPU
     """
     try:
         # Apply chat template
@@ -124,23 +181,36 @@ def cpu_inference(conversation, model, tokenizer, processor):
         # Get image from conversation
         image = conversation[1]["content"][0]["image"]
-        # Process inputs
         inputs = processor(
             text=[text],
             images=[image],
-            return_tensors="pt"
         )
-        # Generate response
         with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=256,
-                do_sample=True,
-                temperature=0.3,
-                top_p=0.8,
-                pad_token_id=tokenizer.eos_token_id
-            )
         # Decode response
         generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
@@ -168,7 +238,8 @@ async def root():
     return {
         "message": "GUI-Actor API is running",
         "status": "healthy",
-        "model_loaded": model_loaded
     }
 @app.post("/click/base64")
@@ -248,4 +319,18 @@ async def health_check():
         "device": "cpu",
         "torch_dtype": "float32",
         "model_loaded": model_loaded
     }

 model_loaded = False
 async def load_model():
+    """Load model with proper error handling and fallback strategies"""
     global model, processor, tokenizer, model_loaded
     try:
         logger.info("Starting model loading...")
+        # Try specific Qwen2VL classes first
+        try:
+            logger.info("Attempting to load with Qwen2VL specific classes...")
+            from transformers import Qwen2VLProcessor, Qwen2VLForConditionalGeneration
+            processor = Qwen2VLProcessor.from_pretrained(
+                model_name,
+                trust_remote_code=True
+            )
+            model = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_name,
+                torch_dtype=torch.float32,
+                device_map=None,  # CPU only
+                trust_remote_code=True,
+                low_cpu_mem_usage=True
+            ).eval()
+            logger.info("Successfully loaded with Qwen2VL specific classes")
+        except Exception as e1:
+            logger.warning(f"Failed with Qwen2VL classes: {e1}")
+            logger.info("Trying AutoProcessor and AutoModel fallback...")
+            try:
+                from transformers import AutoProcessor, AutoModel
+                processor = AutoProcessor.from_pretrained(
+                    model_name,
+                    trust_remote_code=True
+                )
+                model = AutoModel.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float32,
+                    device_map=None,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True
+                ).eval()
+                logger.info("Successfully loaded with Auto classes")
+            except Exception as e2:
+                logger.warning(f"Failed with Auto classes: {e2}")
+                logger.info("Trying generic transformers approach...")
+                # Last fallback - try loading as generic model
+                from transformers import AutoConfig, AutoTokenizer
+                import transformers
+                config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+                logger.info(f"Model config type: {type(config)}")
+                # Try to find the right model class
+                if hasattr(transformers, 'Qwen2VLForConditionalGeneration'):
+                    ModelClass = getattr(transformers, 'Qwen2VLForConditionalGeneration')
+                elif hasattr(transformers, 'AutoModelForVision2Seq'):
+                    ModelClass = getattr(transformers, 'AutoModelForVision2Seq')
+                else:
+                    raise Exception("No suitable model class found")
+                processor = AutoProcessor.from_pretrained(
+                    model_name,
+                    trust_remote_code=True
+                )
+                model = ModelClass.from_pretrained(
+                    model_name,
+                    config=config,
+                    torch_dtype=torch.float32,
+                    device_map=None,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True
+                ).eval()
+        # Verify processor and model are loaded
+        if processor is None or model is None:
+            raise Exception("Failed to load processor or model")
         tokenizer = processor.tokenizer
+        logger.info("Model and processor loaded successfully!")
         model_loaded = True
         return True
 def cpu_inference(conversation, model, tokenizer, processor):
     """
+    Inference function untuk CPU with better error handling
     """
     try:
         # Apply chat template
         # Get image from conversation
         image = conversation[1]["content"][0]["image"]
+        # Process inputs with proper padding
         inputs = processor(
             text=[text],
             images=[image],
+            return_tensors="pt",
+            padding=True,  # Enable padding
+            truncation=True,  # Enable truncation for long texts
+            max_length=512  # Set reasonable max length
         )
+        # Generate response with proper error handling
         with torch.no_grad():
+            try:
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=256,
+                    do_sample=True,
+                    temperature=0.3,
+                    top_p=0.8,
+                    pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.pad_token_id
+                )
+            except Exception as e:
+                logger.error(f"Generation error: {e}")
+                # Try with simpler parameters
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=128,
+                    do_sample=False,
+                    pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 0
+                )
         # Decode response
         generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
     return {
         "message": "GUI-Actor API is running",
         "status": "healthy",
+        "model_loaded": model_loaded,
+        "model_name": model_name
     }
 @app.post("/click/base64")
         "device": "cpu",
         "torch_dtype": "float32",
         "model_loaded": model_loaded
+    }
+@app.get("/debug")
+async def debug_info():
+    """Debug endpoint to check model loading status"""
+    import transformers
+    available_classes = [attr for attr in dir(transformers) if 'Qwen' in attr or 'VL' in attr]
+    return {
+        "model_loaded": model_loaded,
+        "processor_type": type(processor).__name__ if processor else None,
+        "model_type": type(model).__name__ if model else None,
+        "available_qwen_classes": available_classes,
+        "transformers_version": transformers.__version__
     }