# starflow/utils/training.py
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
#
"""
Training utilities for STARFlow.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed
import torch.distributed.checkpoint as dcp
from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
import datetime
import math
import os
import random
import numpy as np
import contextlib
import typing as t
from typing import Any, Dict, List, Union, Optional
from collections import defaultdict, OrderedDict
from fnmatch import fnmatch
# ==== Learning Rate Schedule ====
class CosineLRSchedule(torch.nn.Module):
counter: torch.Tensor
def __init__(self, optimizer, warmup_steps: int, total_steps: int, min_lr: float, max_lr: float):
super().__init__()
self.register_buffer('counter', torch.zeros(()))
self.warmup_steps = warmup_steps
self.total_steps = total_steps
self.optimizer = optimizer
self.min_lr = min_lr
self.start_lr = min(min_lr, 1e-6)
self.max_lr = max_lr
self.set_lr(min_lr)
    def set_lr(self, lr: float) -> float:
        # Clamp to [min_lr, max_lr] so warmup values below min_lr do not
        # undershoot the configured range, and return the lr actually applied.
        lr = min(max(lr, self.min_lr), self.max_lr)
        for pg in self.optimizer.param_groups:
            pg['lr'] = lr
        return lr
def step(self) -> float:
with torch.no_grad():
counter = self.counter.add_(1).item()
if self.counter <= self.warmup_steps:
new_lr = self.start_lr + counter / self.warmup_steps * (self.max_lr - self.start_lr)
return self.set_lr(new_lr)
t = (counter - self.warmup_steps) / (self.total_steps - self.warmup_steps)
new_lr = self.min_lr + 0.5 * (1 + math.cos(math.pi * t)) * (self.max_lr - self.min_lr)
return self.set_lr(new_lr)
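# Illustrative sketch (not used by the library itself): driving CosineLRSchedule
# alongside an optimizer. The model, data, and hyperparameter values below are
# placeholders chosen only to make the example self-contained.
def _example_cosine_lr_usage() -> float:
    """Step the schedule once per optimizer step and return the last lr."""
    model = nn.Linear(8, 8)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    schedule = CosineLRSchedule(optimizer, warmup_steps=10, total_steps=100,
                                min_lr=1e-5, max_lr=1e-4)
    lr = schedule.max_lr
    for _ in range(100):
        loss = model(torch.randn(4, 8)).pow(2).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr = schedule.step()  # warmup to max_lr, then cosine decay to min_lr
    return lr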
# ==== Distributed Training ====
class Distributed:
timeout: float = 72000
def __init__(self):
if os.environ.get('MASTER_PORT'): # When running with torchrun
self.rank = int(os.environ['RANK'])
self.local_rank = int(os.environ['LOCAL_RANK'])
self.world_size = int(os.environ['WORLD_SIZE'])
self.distributed = True
torch.distributed.init_process_group(
backend='nccl',
init_method='env://',
world_size=self.world_size,
timeout=datetime.timedelta(seconds=self.timeout),
rank=self.rank,
)
else: # When running with python for debugging
self.rank, self.local_rank, self.world_size = 0, 0, 1
self.distributed = False
# Only set CUDA device if CUDA is available
if torch.cuda.is_available():
torch.cuda.set_device(self.local_rank)
self.barrier()
def barrier(self) -> None:
if self.distributed:
torch.distributed.barrier()
def gather_concat(self, x: torch.Tensor) -> torch.Tensor:
if not self.distributed:
return x
x_list = [torch.empty_like(x) for _ in range(self.world_size)]
torch.distributed.all_gather(x_list, x)
return torch.cat(x_list)
    def reduce(self, x: torch.Tensor) -> torch.Tensor:
if not self.distributed:
return x
torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
return x
    def __del__(self):
        # Guard against tearing down a process group that is already destroyed.
        if self.distributed and torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
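# Illustrative sketch: averaging a per-rank scalar metric with Distributed.reduce.
# Assumes the tensor lives on the current CUDA device when running under torchrun
# (the NCCL backend requires GPU tensors); under a plain `python` run it is a no-op.
def _example_distributed_mean(dist: Distributed, loss: torch.Tensor) -> torch.Tensor:
    """Reduce a per-rank scalar loss to its global mean across all ranks."""
    # all_reduce sums the tensor in place across ranks; divide for the mean.
    total = dist.reduce(loss.detach().clone())
    return total / dist.world_size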
def get_local_rank() -> int:
if os.environ.get('MASTER_PORT'): # When running with torchrun
return int(os.environ['LOCAL_RANK'])
return 0
def get_device_mesh(dp_size: int, tp_size: int = 1) -> DeviceMesh:
"""Create DeviceMesh based on tensor and data parallelism configuration."""
    # TP=1 by default; the mesh is only partitioned along the data-parallel dimension.
mesh_shape = (dp_size, tp_size)
names = ("dp", "tp")
return init_device_mesh("cuda", mesh_shape=mesh_shape, mesh_dim_names=names)
def wrap_matching_layers(
model: nn.Module,
layer_patterns: t.List[str],
wrapper_fn: t.Callable[[nn.Module], nn.Module],
):
"""
Recursively wraps submodules in the order they appear in layer_patterns.
For each pattern (in order), we do a pass over the model and wrap matches.
"""
def _wrap_single_pattern(mod: nn.Module, pattern: str):
"""
Recurse over mod, wrapping submodules that match `pattern`.
We do a post-order traversal so children get wrapped before the parent.
"""
for child_name, child_module in list(mod.named_children()):
# Wrap grandchildren first.
_wrap_single_pattern(child_module, pattern)
# Check if the child's class name matches the pattern.
if fnmatch(child_module.__class__.__name__, pattern):
# Replace the child in the parent.
wrapped = wrapper_fn(child_module)
setattr(mod, child_name, wrapped)
# We do a pass for each pattern in order
for pattern in layer_patterns:
_wrap_single_pattern(model, pattern)
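# Illustrative sketch: wrap_matching_layers applies any module-to-module transform
# to every submodule whose class name matches a glob pattern. The pattern name and
# the choice of activation checkpointing as the transform are placeholders; the
# same mechanism is used below to apply fully_shard for FSDP.
def _example_checkpoint_attention_blocks(model: nn.Module) -> nn.Module:
    """Wrap modules matching 'AttentionBlock*' with activation checkpointing."""
    from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
        checkpoint_wrapper,
    )
    wrap_matching_layers(model, ["AttentionBlock*"], checkpoint_wrapper)
    return model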
def parallelize_model(args, model: nn.Module, dist: Distributed, device='cuda', block_names=['AttentionBlock']) -> nn.Module:
if not getattr(args, "fsdp", False): # use standard DDP
model = model.to(device=device)
if dist.distributed:
            print("Using DDP")
model_ddp = torch.nn.parallel.DistributedDataParallel(model, device_ids=[dist.local_rank])
else:
model_ddp = model # compatible with DDP
return model, model_ddp
# Instantiate mixed precision policy from config
mp_policy = MixedPrecisionPolicy(
param_dtype=torch.bfloat16,
reduce_dtype=torch.bfloat16,
output_dtype=torch.bfloat16,
cast_forward_inputs=True
)
print(f"Using FSDP2 with: {mp_policy}")
# Apply FSDP wrapping based on specified parallel dimensions
dp_mesh = get_device_mesh(dist.world_size)["dp"]
# Configure core FSDP parameters
fsdp_config = {"mp_policy": mp_policy, "mesh": dp_mesh, "reshard_after_forward": True}
# Wrap specified layer patterns with FSDP
wrap_matching_layers(model, block_names, lambda m: fully_shard(m, **fsdp_config))
# Then wrap full model (remaining modules are captured with this)
model = fully_shard(model, **fsdp_config)
model = model.to(device=device)
return model, model # for compatibility with DDP
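# Illustrative sketch: selecting DDP vs. FSDP2 wrapping through the `fsdp` flag that
# parallelize_model reads from `args`. The argparse.Namespace stand-in and the CUDA
# device are placeholder assumptions; FSDP additionally requires a torchrun launch.
def _example_parallelize(model: nn.Module, dist: Distributed, use_fsdp: bool = False):
    """Return (raw model, wrapped model); train with the wrapped one."""
    import argparse
    args = argparse.Namespace(fsdp=use_fsdp)  # stand-in for the real training args
    model, model_wrapped = parallelize_model(args, model, dist, device='cuda')
    # `model_wrapped` is used for forward/backward; `model` is kept for checkpointing.
    return model, model_wrapped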
def save_model(args, dist, model, model_ckpt_file):
states = model.state_dict()
if not getattr(args, "fsdp", False): # save DDP checkpoints
if dist.local_rank == 0:
torch.save(states, model_ckpt_file)
else: # save FSDP checkpoints
dcp.save(states, checkpoint_id=str(model_ckpt_file))
def save_optimizer(args, dist, optimizer, lr_schedule, opt_ckpt_file):
optim_states, lr_states = optimizer.state_dict(), lr_schedule.state_dict()
if not getattr(args, "fsdp", False): # save DDP checkpoints
if dist.local_rank == 0:
torch.save({"optimizer": optim_states, "lr_schedule": lr_states}, opt_ckpt_file)
else:
filename = str(opt_ckpt_file)
dcp.save(optim_states, checkpoint_id=f"{filename}/optimizer")
        torch.save(lr_states, f"{filename}/lr_schedule.bin")  # the lr schedule is not sharded, so plain torch.save suffices
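# Illustrative sketch of a loading counterpart (a hypothetical helper, not defined
# elsewhere in this file): DDP checkpoints are single torch.save files, while FSDP2
# checkpoints are DCP directories loaded in place into the sharded state dict.
def _example_load_model(args, model: nn.Module, model_ckpt_file) -> nn.Module:
    """Restore weights saved by save_model above."""
    if not getattr(args, "fsdp", False):
        # DDP path: one file holding the full state dict.
        states = torch.load(str(model_ckpt_file), map_location='cpu')
        model.load_state_dict(states)
    else:
        # FSDP2 path: DCP fills the (sharded) state dict in place.
        states = model.state_dict()
        dcp.load(states, checkpoint_id=str(model_ckpt_file))
        model.load_state_dict(states)
    return model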
@contextlib.contextmanager
def _fsdp2_no_sync(module, sync):
    # FSDP2: toggle gradient synchronization on the sharded module for this step.
module.set_requires_gradient_sync(sync, recurse=True)
try:
yield
finally:
module.set_requires_gradient_sync(True, recurse=True)
def sync_ctx(model, sync=True):
if hasattr(model, 'set_requires_gradient_sync'):
return _fsdp2_no_sync(model, sync)
elif not sync and hasattr(model, 'no_sync'):
return model.no_sync()
return contextlib.nullcontext()
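# Illustrative sketch: sync_ctx is the hook for gradient accumulation - skip gradient
# synchronization on intermediate micro-batches and sync only on the last one. The
# `micro_batches` iterable and `loss_fn` callable are placeholders.
def _example_grad_accumulation(model_wrapped: nn.Module, optimizer, micro_batches, loss_fn):
    """Accumulate gradients over micro-batches, synchronizing once at the end."""
    n = len(micro_batches)
    for i, batch in enumerate(micro_batches):
        last = (i == n - 1)
        # Only the final micro-batch triggers DDP/FSDP gradient synchronization.
        with sync_ctx(model_wrapped, sync=last):
            loss = loss_fn(model_wrapped(batch)) / n
            loss.backward()
    optimizer.step()
    optimizer.zero_grad()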
# ==== Utility Functions ====
def set_random_seed(seed: int) -> None:
"""Set random seed for reproducibility."""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
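# Illustrative sketch: one common convention (an assumption, not prescribed by this
# file) is to offset the base seed by the rank so data augmentation differs per worker.
def _example_seed_per_rank(dist: Distributed, base_seed: int = 42) -> None:
    """Seed each rank deterministically but differently."""
    set_random_seed(base_seed + dist.rank)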