leoeric committed
Commit 3a6a9cd · 1 Parent(s): bd6dbaf

Fix CUDA out of memory error by clearing cache between model loads and enabling expandable segments
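The commit pairs two standard PyTorch mitigations: opting the CUDA caching allocator into expandable segments (PYTORCH_CUDA_ALLOC_CONF must be set before CUDA is first initialized), and releasing cached allocator blocks between loading the text encoder, VAE, and transformer. A minimal sketch of that pattern follows; load_component and build_fn are illustrative names, not part of the app.

# Minimal sketch of the pattern used in this commit; load_component/build_fn are illustrative.
import os

# Must run before torch initializes CUDA, hence at the very top of the entry script.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import gc
import torch

def load_component(build_fn):
    """Build one model component, then release memory held only by temporaries."""
    component = build_fn()
    gc.collect()               # drop unreferenced Python objects (e.g. a consumed state_dict)
    torch.cuda.empty_cache()   # return unused cached blocks to the CUDA driver
    return component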

Files changed (2)
  1. app.py +39 -5
  2. sample.py +18 -4
app.py CHANGED
@@ -10,6 +10,8 @@ import os
 os.environ['OMP_NUM_THREADS'] = '1'
 os.environ['MKL_NUM_THREADS'] = '1'
 os.environ['NUMEXPR_NUM_THREADS'] = '1'
+# Fix CUDA memory fragmentation
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
 import warnings
 import gradio as gr
@@ -237,17 +239,49 @@ def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path
 
         # Find the generated image
         # The sample.py script saves to logdir/model_name/...
-        # We need to find the most recent output
-        output_files = list(output_dir.glob("**/*.png")) + list(output_dir.glob("**/*.jpg"))
+        # Model name is derived from checkpoint path stem
+        checkpoint_stem = Path(checkpoint_path).stem
+        model_output_dir = output_dir / checkpoint_stem
+
+        status_msg += f"Searching in: {model_output_dir}\n"
+        status_msg += f"Also searching recursively in: {output_dir}\n"
+
+        # Search in model-specific directory first, then recursively
+        search_paths = [model_output_dir, output_dir]
+        output_files = []
+
+        for search_path in search_paths:
+            if search_path.exists():
+                # Look for PNG, JPG, JPEG files
+                found = list(search_path.glob("**/*.png")) + list(search_path.glob("**/*.jpg")) + list(search_path.glob("**/*.jpeg"))
+                output_files.extend(found)
+                status_msg += f"Found {len(found)} files in {search_path}\n"
+
         if output_files:
+            # Get the most recent file
             latest_file = max(output_files, key=lambda p: p.stat().st_mtime)
+            status_msg += f"✅ Found image: {latest_file}\n"
             return str(latest_file), status_msg + "✅ Success! Image generated."
         else:
-            error_msg = status_msg + f"Error: Generated image not found in {output_dir}."
+            # Debug: list what's actually in the directory
+            debug_info = f"\n\nDebug info:\n"
+            debug_info += f"Output dir exists: {output_dir.exists()}\n"
+            if output_dir.exists():
+                debug_info += f"Contents of {output_dir}:\n"
+                for item in output_dir.iterdir():
+                    debug_info += f" - {item.name} ({'dir' if item.is_dir() else 'file'})\n"
+            if model_output_dir.exists():
+                debug_info += f"\nContents of {model_output_dir}:\n"
+                for item in model_output_dir.iterdir():
+                    debug_info += f" - {item.name} ({'dir' if item.is_dir() else 'file'})\n"
+
+            error_msg = status_msg + f"Error: Generated image not found.\n"
+            error_msg += f"Searched in: {output_dir} and {model_output_dir}\n"
+            error_msg += debug_info
             if log_content:
-                error_msg += f"\n\n📋 Check log file for details: {log_file}\nLast 1000 chars:\n{log_content[-1000:]}"
+                error_msg += f"\n\n📋 Check log file for details: {log_file}\nLast 2000 chars:\n{log_content[-2000:]}"
             else:
-                error_msg += f"\n\nCheck stdout:\n{result.stdout}"
+                error_msg += f"\n\nCheck stdout:\n{result.stdout[-1000:]}"
             return None, error_msg
 
     except Exception as e:
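For readers who only want the new lookup behaviour, the added block in generate_image amounts to the following standalone helper. The name find_latest_image and its arguments are illustrative; the search order (checkpoint-named subdirectory first, then the whole output tree) matches the diff above.

# Standalone sketch of the new image lookup; find_latest_image is an illustrative name.
from pathlib import Path
from typing import Optional

def find_latest_image(output_dir: Path, checkpoint_path: str) -> Optional[Path]:
    """Search the checkpoint-named subdirectory first, then the whole tree,
    and return the most recently modified PNG/JPG/JPEG, if any."""
    model_output_dir = output_dir / Path(checkpoint_path).stem
    candidates = []
    for search_path in (model_output_dir, output_dir):
        if search_path.exists():
            for pattern in ("**/*.png", "**/*.jpg", "**/*.jpeg"):
                candidates.extend(search_path.glob(pattern))
    return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None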
sample.py CHANGED
@@ -17,6 +17,7 @@ import argparse
 import copy
 import pathlib
 import time
+import gc
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -60,28 +61,40 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
     utils.set_random_seed(args.seed + dist.rank)
 
     # Setup text encoder
+    print("Loading text encoder...")
     tokenizer, text_encoder = utils.setup_encoder(args, dist, device)
+    torch.cuda.empty_cache()  # Clear cache after text encoder
 
     # Setup VAE if specified
     vae = None
     if args.vae is not None:
+        print("Loading VAE...")
         vae = utils.setup_vae(args, dist, device)
        args.img_size = args.img_size // vae.downsample_factor
+        torch.cuda.empty_cache()  # Clear cache after VAE
     else:
         args.finetuned_vae = 'none'
 
     # Setup main transformer model
+    print("Loading main transformer model...")
     model = utils.setup_transformer(
         args, dist,
         txt_dim=text_encoder.config.hidden_size,
         use_checkpoint=1
-    ).to(device)
+    )
 
-    # Load checkpoint
-    print(f"Loading checkpoint from local path: {args.checkpoint_path}")
+    # Load checkpoint to CPU first, then move to GPU
+    print(f"Loading checkpoint from: {args.checkpoint_path}")
     state_dict = torch.load(args.checkpoint_path, map_location='cpu')
     model.load_state_dict(state_dict, strict=False)
-    del state_dict; torch.cuda.empty_cache()
+    del state_dict
+    gc.collect()  # Force garbage collection
+    torch.cuda.empty_cache()  # Clear any GPU cache
+
+    # Move model to GPU after loading weights
+    print("Moving model to GPU...")
+    model = model.to(device)
+    torch.cuda.empty_cache()  # Clear cache after moving to GPU
 
     # Set model to eval mode and disable gradients
     for p in model.parameters():
@@ -90,6 +103,7 @@ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Modul
 
     # Parallelize model for multi-GPU sampling
     _, model = utils.parallelize_model(args, model, dist, device)
+    torch.cuda.empty_cache()  # Final cache clear
 
     return model, vae, (tokenizer, text_encoder, dist, device)
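The sample.py change reorders checkpoint loading so the transformer's weights are assembled entirely on the CPU and moved to the GPU only once, after the state_dict has been freed and the allocator cache cleared. A condensed sketch of that ordering, with build_model standing in for utils.setup_transformer:

# Condensed sketch of the new load order; build_model is a placeholder for utils.setup_transformer.
import gc
import torch

def load_checkpoint_low_peak(build_model, checkpoint_path, device):
    model = build_model()                                    # construct the model on the CPU
    state_dict = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state_dict, strict=False)
    del state_dict                                           # free the CPU copy before touching the GPU
    gc.collect()
    torch.cuda.empty_cache()
    model = model.to(device)                                 # single transfer of the final weights
    torch.cuda.empty_cache()
    return model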