Fix GPU abort error: improve ZeroGPU decorator detection and GPU context handling
- Fix @spaces.GPU decorator application for proper ZeroGPU detection
- Preserve CUDA_VISIBLE_DEVICES in subprocess calls
- Add GPU availability checks before generation
- Enhance error handling for GPU abort scenarios
- Add GPU status logging for debugging
- app.py +87 -20
- dataset.py +42 -5
- sample.py +13 -6
- utils/training.py +3 -1
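
The first bullet is the core of the fix: ZeroGPU detects GPU-dependent entry points by looking for spaces.GPU-decorated callables when the app module is imported, so the function must exist and be wrapped at module level instead of only being defined inside a conditional branch. The sketch below distills the pattern the app.py diff applies; the SPACES_AVAILABLE flag and the _generate_image_impl helper are taken from the diff, while the try/except import guard is an assumption about how that flag is set.

    # Sketch of the module-level decorator pattern (mirrors app.py, not a drop-in replacement).
    try:
        import spaces  # present on Hugging Face Spaces; assumed source of SPACES_AVAILABLE
        SPACES_AVAILABLE = True
    except ImportError:
        SPACES_AVAILABLE = False

    def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
        ...  # the real app launches the generation subprocess here

    def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
        """Generate image from text prompt."""
        return _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path)

    # Wrap the existing function instead of re-defining it under if/else, so the
    # decorated name is visible to ZeroGPU at import time.
    if SPACES_AVAILABLE and hasattr(spaces, 'GPU'):
        generate_image = spaces.GPU(generate_image)
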
app.py
CHANGED
@@ -76,8 +76,14 @@ def get_checkpoint_path(checkpoint_file, default_local_path, repo_id=None, filen
     # Try downloading from Model Hub if configured
     if repo_id and filename and HF_HUB_AVAILABLE:
         try:
-            # Use /workspace if available (persistent), otherwise
-
+            # Use /workspace if available (HF Spaces persistent), otherwise use local cache
+            if os.path.exists("/workspace"):
+                cache_dir = "/workspace/checkpoints"
+            elif os.path.exists("/tmp"):
+                cache_dir = "/tmp/checkpoints"
+            else:
+                # Local development: use project directory or user cache
+                cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "starflow")
             os.makedirs(cache_dir, exist_ok=True)

             # Check if already downloaded
@@ -144,15 +150,15 @@ else:
     print(f" PyTorch Version: {torch.__version__}")

 # Apply @spaces.GPU decorator if available (required for ZeroGPU)
+# IMPORTANT: Decorator must be applied at module level for ZeroGPU to detect it at startup
+def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
+    """Generate image from text prompt."""
+    return _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path)
+
+# Apply decorator if spaces module is available (ZeroGPU detection happens at import time)
 if SPACES_AVAILABLE and hasattr(spaces, 'GPU'):
-
-
-        """Generate image from text prompt."""
-        return _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path)
-else:
-    def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
-        """Generate image from text prompt."""
-        return _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path)
+    generate_image = spaces.GPU(generate_image)
+    print("✅ ZeroGPU decorator applied to generate_image")

 def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
     """Generate image from text prompt (implementation)."""
@@ -222,13 +228,27 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi
     log_file = output_dir / "generation.log"
     status_msg += f"📋 Logs will be saved to: {log_file}\n"

+    # Ensure GPU environment variables are passed to subprocess
+    env = os.environ.copy()
+    # Preserve CUDA_VISIBLE_DEVICES if set (important for ZeroGPU)
+    if 'CUDA_VISIBLE_DEVICES' in env:
+        print(f"✅ CUDA_VISIBLE_DEVICES={env['CUDA_VISIBLE_DEVICES']}")
+
+    # Verify GPU is available before starting
+    if torch.cuda.is_available():
+        status_msg += f"✅ GPU available: {torch.cuda.get_device_name(0)}\n"
+        status_msg += f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\n"
+    else:
+        status_msg += "⚠️ Warning: CUDA not available, will use CPU (very slow)\n"
+
     # Run with timeout (45 minutes max - allows for download + generation)
     # Capture output and write to log file
     result = subprocess.run(
         cmd,
         capture_output=True,
         text=True,
-        cwd=os.getcwd(),
+        cwd=os.getcwd(),
+        env=env,  # Pass environment variables (including CUDA_VISIBLE_DEVICES)
         timeout=2700
     )

@@ -251,6 +271,21 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi

     if result.returncode != 0:
         error_msg = f"❌ Error during generation (return code: {result.returncode})\n\n"
+
+        # Check for GPU abort or CUDA errors
+        error_output = (result.stderr + result.stdout).lower()
+        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output:
+            error_msg += "⚠️ GPU ERROR DETECTED\n\n"
+            error_msg += "Possible causes:\n"
+            error_msg += "1. GPU timeout (ZeroGPU may have a 5-10 min limit)\n"
+            error_msg += "2. CUDA out of memory (model too large for GPU)\n"
+            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n\n"
+            error_msg += "Solutions:\n"
+            error_msg += "- Try again (GPU may have been released)\n"
+            error_msg += "- Check Space logs for detailed error\n"
+            error_msg += "- Ensure @spaces.GPU decorator is applied\n"
+            error_msg += "- Consider using paid GPU tier for longer runs\n\n"
+
         error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
         error_msg += f"=== STDOUT ===\n{result.stdout}\n\n"
         if log_content:
@@ -323,15 +358,15 @@ def _generate_image_impl(prompt, aspect_ratio, cfg, seed, checkpoint_file, confi


 # Apply @spaces.GPU decorator if available (required for ZeroGPU)
+# IMPORTANT: Decorator must be applied at module level for ZeroGPU to detect it at startup
+def generate_video(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image):
+    """Generate video from text prompt."""
+    return _generate_video_impl(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image)
+
+# Apply decorator if spaces module is available (ZeroGPU detection happens at import time)
 if SPACES_AVAILABLE and hasattr(spaces, 'GPU'):
-
-
-        """Generate video from text prompt."""
-        return _generate_video_impl(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image)
-else:
-    def generate_video(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image):
-        """Generate video from text prompt."""
-        return _generate_video_impl(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image)
+    generate_video = spaces.GPU(generate_video)
+    print("✅ ZeroGPU decorator applied to generate_video")

 def _generate_video_impl(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image):
     """Generate video from text prompt (implementation)."""
@@ -396,10 +431,42 @@ def _generate_video_impl(prompt, aspect_ratio, cfg, seed, target_length, checkpo
     else:
         cmd.extend(["--input_image", "none"])

-
+    # Ensure GPU environment variables are passed to subprocess
+    env = os.environ.copy()
+    # Preserve CUDA_VISIBLE_DEVICES if set (important for ZeroGPU)
+    if 'CUDA_VISIBLE_DEVICES' in env:
+        print(f"✅ CUDA_VISIBLE_DEVICES={env['CUDA_VISIBLE_DEVICES']}")
+
+    # Verify GPU is available before starting
+    if torch.cuda.is_available():
+        print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
+
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        cwd=os.getcwd(),
+        env=env,  # Pass environment variables (including CUDA_VISIBLE_DEVICES)
+        timeout=3600  # 60 minutes for video generation
+    )

     if result.returncode != 0:
         error_msg = f"❌ Error during video generation (return code: {result.returncode})\n\n"
+
+        # Check for GPU abort or CUDA errors
+        error_output = (result.stderr + result.stdout).lower()
+        if "gpu aborted" in error_output or "cuda" in error_output or "out of memory" in error_output:
+            error_msg += "⚠️ GPU ERROR DETECTED\n\n"
+            error_msg += "Possible causes:\n"
+            error_msg += "1. GPU timeout (ZeroGPU may have a 5-10 min limit)\n"
+            error_msg += "2. CUDA out of memory (video generation needs more GPU memory)\n"
+            error_msg += "3. GPU allocation failed (ZeroGPU not detected)\n\n"
+            error_msg += "Solutions:\n"
+            error_msg += "- Try again (GPU may have been released)\n"
+            error_msg += "- Check Space logs for detailed error\n"
+            error_msg += "- Ensure @spaces.GPU decorator is applied\n"
+            error_msg += "- Consider using paid GPU tier for longer runs\n\n"
+
         error_msg += f"=== STDERR ===\n{result.stderr}\n\n"
         error_msg += f"=== STDOUT ===\n{result.stdout}\n"
         return None, error_msg
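
For reference, the environment handling added to both subprocess calls above boils down to the sketch below. subprocess.run inherits the parent environment by default when env is omitted; copying os.environ makes that inheritance explicit and gives one place to log or adjust variables such as CUDA_VISIBLE_DEVICES before launching the generation script. The command and timeout here are placeholders.

    import os
    import subprocess

    cmd = ["python", "sample.py", "--prompt", "a photo of a cat"]  # placeholder command

    env = os.environ.copy()  # start from the parent process environment
    if "CUDA_VISIBLE_DEVICES" in env:
        print(f"CUDA_VISIBLE_DEVICES={env['CUDA_VISIBLE_DEVICES']}")  # log what the child will see

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        env=env,       # explicit environment for the child process
        timeout=2700,  # 45 minutes, matching the image path above
    )
    print(result.returncode)
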
dataset.py
CHANGED
@@ -24,15 +24,41 @@ import gc
 import threading
 import psutil
 import tempfile
-import
-
+# Optional import for video processing (not available on macOS ARM)
+try:
+    import decord
+    from decord import VideoReader
+    DECORD_AVAILABLE = True
+except ImportError:
+    DECORD_AVAILABLE = False
+    print("⚠️ decord not available. Video processing will be disabled.")
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor, TimeoutError
 from misc import print, xprint
 from misc.condition_utils import get_camera_condition, get_point_condition, get_wind_condition

-#
-
+# Lazy initialization of multiprocessing manager (only when needed, not at import time)
+# This avoids issues on macOS which uses 'spawn' instead of 'fork'
+_manager = None
+
+def get_manager():
+    """Get or create the multiprocessing manager lazily."""
+    global _manager
+    if _manager is None:
+        try:
+            # Only create manager when actually needed (not at import time)
+            # This avoids RuntimeError on macOS with spawn method
+            _manager = torch.multiprocessing.Manager()
+        except (RuntimeError, EOFError) as e:
+            # If manager creation fails (e.g., on macOS with spawn), return None
+            # The code already handles None manager gracefully
+            print(f"⚠️ Could not create multiprocessing manager: {e}")
+            print(" Continuing without multiprocessing manager (may affect some features)")
+            _manager = False  # Use False to indicate attempted but failed
+    return _manager if _manager is not False else None
+
+# For backward compatibility, but will be None until get_manager() is called
+manager = None

 # ==== helpers ==== #

@@ -91,6 +117,8 @@ def sample_clip(
     num_frames: int = 8,
     out_fps: Optional[float] = None, # ← pass an fps here
 ):
+    if not DECORD_AVAILABLE:
+        raise ImportError("decord is required for video processing but is not available. Install with: pip install decord (Note: not available on macOS ARM)")
     vr = VideoReader(video_path)
     src_fps = vr.get_avg_fps() # native fps
     total = len(vr)
@@ -353,11 +381,20 @@ class ImageTarDataset(Dataset):
 class OnlineImageTarDataset(ImageTarDataset):
     max_retry_n = 20
     max_read = 4096
-    tar_keys_lock
+    # tar_keys_lock will be initialized in __init__ to avoid import-time issues

     def __init__(self, dataset_tsv, image_size, batch_size=None, **kwargs):
         super().__init__(dataset_tsv, image_size, **kwargs)

+        # Initialize manager lazily (only when this class is instantiated)
+        manager = get_manager()
+        # Use threading.Lock as fallback if multiprocessing manager unavailable
+        if manager is not None:
+            self.tar_keys_lock = manager.Lock()
+        else:
+            # Fallback to threading lock for single-process use
+            self.tar_keys_lock = threading.Lock()
+
         self.tar_lists = defaultdict(lambda: [])
         self.tar_image_buckets = defaultdict(lambda: defaultdict(lambda: 0))
         for i, line in enumerate(self.all_lines):
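
The threading.Lock fallback in the OnlineImageTarDataset hunk works because a manager Lock proxy and a plain threading.Lock expose the same with-statement interface, so code that later takes the lock never needs to know which one it received. A hypothetical usage sketch follows; the class and its record_key method are illustrative, not part of the diff.

    import threading

    class LockedTarIndexSketch:
        """Illustrative stand-in for the locking pattern used by OnlineImageTarDataset."""

        def __init__(self, manager=None):
            # Same fallback as the __init__ change above: manager lock if available, else a thread lock.
            self.tar_keys_lock = manager.Lock() if manager is not None else threading.Lock()

        def record_key(self, key, index):
            # Works identically with a multiprocessing manager lock proxy or threading.Lock.
            with self.tar_keys_lock:
                index[key] = index.get(key, 0) + 1

When get_manager() returns None (for example on macOS with the spawn start method), the dataset silently degrades to a process-local lock, which the diff notes is sufficient for single-process use.
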
sample.py
CHANGED
@@ -60,7 +60,9 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
     dist = utils.Distributed()

     # If not running with torchrun, initialize single-process group
-
+    # This is needed because the model code uses torch.distributed.all_reduce
+    # Works for both CUDA and CPU modes
+    if not dist.distributed:
         import os
         # Initialize single-process process group for model compatibility
         if not torch.distributed.is_initialized():
@@ -69,13 +71,15 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
             os.environ['RANK'] = '0'
             os.environ['LOCAL_RANK'] = '0'
             os.environ['WORLD_SIZE'] = '1'
+            # Use 'nccl' for CUDA, 'gloo' for CPU
+            backend = 'nccl' if torch.cuda.is_available() else 'gloo'
             torch.distributed.init_process_group(
-                backend=
+                backend=backend,
                 init_method='env://',
                 world_size=1,
                 rank=0,
             )
-            print("✅ Initialized single-process distributed group for model compatibility")
+            print(f"✅ Initialized single-process distributed group (backend: {backend}) for model compatibility")

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -268,11 +272,13 @@ def main(args: argparse.Namespace) -> None:

     # Start sampling
     print(f'Starting sampling with global batch size {args.sample_batch_size}x{dist.world_size} GPUs')
-    torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
     start_time = time.time()

     with torch.no_grad():
-
+        device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
+        with torch.autocast(device_type=device_type, dtype=torch.float32):
             for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
                 # Determine aspect ratio and image shape
                 x_aspect = args.aspect_ratio if args.mix_aspect else None
@@ -367,7 +373,8 @@ def main(args: argparse.Namespace) -> None:
     )

     # Print timing statistics
-    torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
     elapsed_time = time.time() - start_time
     print(f'{model_name} cfg {args.cfg:.2f}, bsz={args.sample_batch_size}x{dist.world_size}, '
           f'time={elapsed_time:.2f}s, speed={num_samples / elapsed_time:.2f} images/s')
utils/training.py
CHANGED
@@ -81,7 +81,9 @@ class Distributed:
         else: # When running with python for debugging
             self.rank, self.local_rank, self.world_size = 0, 0, 1
             self.distributed = False
-            torch.cuda.set_device(self.local_rank)
+            # Only set CUDA device if CUDA is available
+            if torch.cuda.is_available():
+                torch.cuda.set_device(self.local_rank)
         self.barrier()

     def barrier(self) -> None: