import gradio as gr
import os
import numpy as np
import cv2
import time
import shutil
from pathlib import Path
from einops import rearrange
from typing import Union

try:
    import spaces
except ImportError:
    # Fallback: when the Hugging Face `spaces` package is unavailable, use a no-op decorator.
    def spaces(func):
        return func

import torch
import torchvision.transforms as T
import logging
from concurrent.futures import ThreadPoolExecutor
import atexit
import uuid
import decord

from models.SpaTrackV2.models.vggt4track.models.vggt_moe import VGGT4Track
from models.SpaTrackV2.models.vggt4track.utils.load_fn import preprocess_image
from models.SpaTrackV2.models.predictor import Predictor
from models.SpaTrackV2.models.utils import get_points_on_a_grid
from diffusers.utils import export_to_video, load_image
from pipelines.wan_pipeline import WanImageToVideoTTMPipeline
from pipelines.utils import compute_hw_from_area

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
MAX_FRAMES = 81
OUTPUT_FPS = 24
RENDER_WIDTH = 512
RENDER_HEIGHT = 384
WAN_MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

CAMERA_MOVEMENTS = [
    "static",
    "move_forward",
    "move_backward",
    "move_left",
    "move_right",
    "move_up",
    "move_down",
]

thread_pool_executor = ThreadPoolExecutor(max_workers=2)
def delete_later(path: Union[str, os.PathLike], delay: int = 600):
    def _delete():
        try:
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
        except Exception as e:
            logger.warning(f"Failed to delete {path}: {e}")

    def _wait_and_delete():
        time.sleep(delay)
        _delete()

    thread_pool_executor.submit(_wait_and_delete)
    atexit.register(_delete)


def create_user_temp_dir():
    session_id = str(uuid.uuid4())[:8]
    temp_dir = os.path.join("temp_local", f"session_{session_id}")
    os.makedirs(temp_dir, exist_ok=True)
    delete_later(temp_dir, delay=600)
    return temp_dir
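# Every request gets its own scratch directory under temp_local/session_<8-hex-chars>/;
# delete_later() schedules its removal after `delay` seconds on the shared executor and also
# registers the cleanup at interpreter exit, so stale session folders do not pile up.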
print("🚀 Initializing tracking models...")
vggt4track_model = VGGT4Track.from_pretrained("Yuxihenry/SpatialTrackerV2_Front")
vggt4track_model.eval()
if not hasattr(vggt4track_model, 'infer'):
    vggt4track_model.infer = vggt4track_model.forward

tracker_model = Predictor.from_pretrained("Yuxihenry/SpatialTrackerV2-Offline")
tracker_model.eval()

wan_pipeline = WanImageToVideoTTMPipeline.from_pretrained(
    WAN_MODEL_ID,
    torch_dtype=torch.bfloat16,
)
wan_pipeline.vae.enable_tiling()
wan_pipeline.vae.enable_slicing()
print("✅ Tracking models loaded successfully!")

gr.set_static_paths(paths=[Path.cwd().absolute() / "_viz"])
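# The synthetic trajectory below is a per-frame rigid transform that render_from_pointcloud
# applies on top of the first frame's camera-to-world pose. Translation speed is tied to the
# scene scale (median valid depth): for example, with scene_scale = 2.0 the camera shifts
# 2.0 * 0.02 = 0.04 units per frame, i.e. about 3.2 units across an 81-frame clip.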
def generate_camera_trajectory(
    num_frames: int,
    movement_type: str,
    base_intrinsics: np.ndarray,
    scene_scale: float = 1.0,
) -> np.ndarray:
    speed = scene_scale * 0.02
    extrinsics = np.zeros((num_frames, 4, 4), dtype=np.float32)
    for t in range(num_frames):
        ext = np.eye(4, dtype=np.float32)
        if movement_type == "move_forward":
            ext[2, 3] = -speed * t
        elif movement_type == "move_backward":
            ext[2, 3] = speed * t
        elif movement_type == "move_left":
            ext[0, 3] = -speed * t
        elif movement_type == "move_right":
            ext[0, 3] = speed * t
        elif movement_type == "move_up":
            ext[1, 3] = -speed * t
        elif movement_type == "move_down":
            ext[1, 3] = speed * t
        extrinsics[t] = ext
    return extrinsics
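# render_from_pointcloud re-renders every frame from the synthetic camera path:
#   1. unproject each pixel with K^-1 and its depth into the original camera frame,
#   2. lift the points to world space via the original camera-to-world pose,
#   3. reproject into the new camera and splat with a z-buffer (nearest surface wins),
#   4. fill disocclusion holes by repeated 3x3 dilation to produce the TTM motion signal,
#      while the mask video records which pixels were actually covered by splats.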
def render_from_pointcloud(rgb_frames, depth_frames, intrinsics, original_extrinsics,
                           new_extrinsics, output_path, fps=24, generate_ttm_inputs=False):
    num_frames, H, W, _ = rgb_frames.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (W, H))

    motion_signal_path = mask_path = out_motion_signal = out_mask = None
    if generate_ttm_inputs:
        base_dir = os.path.dirname(output_path)
        motion_signal_path = os.path.join(base_dir, "motion_signal.mp4")
        mask_path = os.path.join(base_dir, "mask.mp4")
        out_motion_signal = cv2.VideoWriter(motion_signal_path, fourcc, fps, (W, H))
        out_mask = cv2.VideoWriter(mask_path, fourcc, fps, (W, H))

    u, v = np.meshgrid(np.arange(W), np.arange(H))
    for t in range(num_frames):
        rgb, depth, K = rgb_frames[t], depth_frames[t], intrinsics[t]
        orig_c2w = np.linalg.inv(original_extrinsics[t])
        if t == 0:
            base_c2w = orig_c2w.copy()
        new_c2w = base_c2w @ new_extrinsics[t]
        new_w2c = np.linalg.inv(new_c2w)

        # Unproject pixels to 3D, move them to world space, then into the new camera.
        K_inv = np.linalg.inv(K)
        pixels = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3)
        rays_cam = (K_inv @ pixels.T).T
        points_cam = rays_cam * depth.reshape(-1, 1)
        points_world = (orig_c2w[:3, :3] @ points_cam.T).T + orig_c2w[:3, 3]
        points_new_cam = (new_w2c[:3, :3] @ points_world.T).T + new_w2c[:3, 3]
        points_proj = (K @ points_new_cam.T).T
        z = np.clip(points_proj[:, 2:3], 1e-6, None)
        uv_new = points_proj[:, :2] / z

        # Z-buffered point splatting into the new view.
        rendered = np.zeros((H, W, 3), dtype=np.uint8)
        z_buffer = np.full((H, W), np.inf, dtype=np.float32)
        colors, depths_new = rgb.reshape(-1, 3), points_new_cam[:, 2]
        for i in range(len(uv_new)):
            uu, vv = int(round(uv_new[i, 0])), int(round(uv_new[i, 1]))
            if 0 <= uu < W and 0 <= vv < H and depths_new[i] > 0:
                if depths_new[i] < z_buffer[vv, uu]:
                    z_buffer[vv, uu] = depths_new[i]
                    rendered[vv, uu] = colors[i]

        valid_mask = (rendered.sum(axis=-1) > 0).astype(np.uint8) * 255
        motion_signal_frame = rendered.copy()
        hole_mask = (motion_signal_frame.sum(axis=-1) == 0).astype(np.uint8)
        if hole_mask.sum() > 0:
            kernel = np.ones((3, 3), np.uint8)
            for _ in range(10):  # Iterative fill
                if hole_mask.sum() == 0:
                    break
                dilated = cv2.dilate(motion_signal_frame, kernel)
                motion_signal_frame = np.where(hole_mask[:, :, None] > 0, dilated, motion_signal_frame)
                hole_mask = (motion_signal_frame.sum(axis=-1) == 0).astype(np.uint8)

        if generate_ttm_inputs:
            out_motion_signal.write(cv2.cvtColor(motion_signal_frame, cv2.COLOR_RGB2BGR))
            out_mask.write(np.stack([valid_mask] * 3, axis=-1))
        out.write(cv2.cvtColor(motion_signal_frame, cv2.COLOR_RGB2BGR))

    out.release()
    if generate_ttm_inputs:
        out_motion_signal.release()
        out_mask.release()
    return {'rendered': output_path, 'motion_signal': motion_signal_path, 'mask': mask_path}
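# run_spatial_tracker performs two GPU stages: VGGT4Track first predicts per-frame camera poses,
# intrinsics, a dense depth map and a depth-confidence map; SpatialTrackerV2 then tracks a
# 30x30 grid of query points (track_num = 512) to refine the camera trajectory and point maps.
# Outputs are downscaled so the longer side is at most 384 px, with intrinsics rescaled to match.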
def run_spatial_tracker(video_tensor: torch.Tensor):
    """
    GPU-intensive spatial tracking function.

    Args:
        video_tensor: Preprocessed video tensor (T, C, H, W)

    Returns:
        Dictionary containing tracking results
    """
    global vggt4track_model
    global tracker_model

    video_input = preprocess_image(video_tensor)[None].cuda()
    vggt4track_model = vggt4track_model.to("cuda")

    # Stage 1: VGGT4Track predicts camera poses, intrinsics, depth and depth confidence.
    with torch.no_grad():
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            predictions = vggt4track_model(video_input / 255)
            extrinsic = predictions["poses_pred"]
            intrinsic = predictions["intrs"]
            depth_map = predictions["points_map"][..., 2]
            depth_conf = predictions["unc_metric"]

    depth_tensor = depth_map.squeeze().cpu().numpy()
    extrs = extrinsic.squeeze().cpu().numpy()
    intrs = intrinsic.squeeze().cpu().numpy()
    video_tensor_gpu = video_input.squeeze()
    unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5

    # Stage 2: SpatialTrackerV2 tracks a grid of query points and refines the trajectory.
    tracker_model.spatrack.track_num = 512
    tracker_model.to("cuda")
    frame_H, frame_W = video_tensor_gpu.shape[2:]
    grid_pts = get_points_on_a_grid(30, (frame_H, frame_W), device="cpu")
    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()

    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        (
            c2w_traj, intrs_out, point_map, conf_depth,
            track3d_pred, track2d_pred, vis_pred, conf_pred, video_out
        ) = tracker_model.forward(
            video_tensor_gpu, depth=depth_tensor,
            intrs=intrs, extrs=extrs,
            queries=query_xyt,
            fps=1, full_point=False, iters_track=4,
            query_no_BA=True, fixed_cam=False, stage=1,
            unc_metric=unc_metric,
            support_frame=len(video_tensor_gpu) - 1, replace_ratio=0.2
        )

    # Downscale outputs so the longer side is at most 384 px; scale intrinsics to match.
    max_size = 384
    h, w = video_out.shape[2:]
    scale = min(max_size / h, max_size / w)
    if scale < 1:
        new_h, new_w = int(h * scale), int(w * scale)
        video_out = T.Resize((new_h, new_w))(video_out)
        point_map = T.Resize((new_h, new_w))(point_map)
        conf_depth = T.Resize((new_h, new_w))(conf_depth)
        intrs_out[:, :2, :] = intrs_out[:, :2, :] * scale

    return {
        'video_out': video_out.cpu(),
        'point_map': point_map.cpu(),
        'conf_depth': conf_depth.cpu(),
        'intrs_out': intrs_out.cpu(),
        'c2w_traj': c2w_traj.cpu(),
    }
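# run_wan_ttm_generation consumes three artifacts from the tracking stage: the first RGB frame,
# the hole-filled motion-signal video and the validity-mask video. The frame count is snapped to
# the 4k + 1 pattern computed below (e.g. 60 tracked frames -> ((60 - 1) // 4) * 4 + 1 = 57),
# which lines up with Wan's temporally compressed latents; tweak_index and tstrong_index are
# passed straight through to WanImageToVideoTTMPipeline.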
def run_wan_ttm_generation(prompt, tweak_index, tstrong_index, first_frame_path,
                           motion_video_path, mask_video_path, progress=gr.Progress()):
    if not first_frame_path or not motion_video_path or not mask_video_path:
        return None, "❌ TTM inputs missing. Please run 3D tracking first."

    progress(0, desc="Loading Wan TTM Pipeline...")
    vr = decord.VideoReader(motion_video_path)
    actual_frame_count = len(vr)
    # Snap to the 4k + 1 frame counts Wan works with.
    target_num_frames = ((actual_frame_count - 1) // 4) * 4 + 1
    if target_num_frames < 5:
        return None, f"❌ Video too short. Only {actual_frame_count} frames tracked."
    logger.info(f"Setting Wan num_frames to {target_num_frames} based on tracking output.")

    progress(0.2, desc="Preparing inputs...")
    image = load_image(first_frame_path)
    # Standard Chinese negative prompt for Wan, roughly: "gaudy colors, overexposed, static,
    # blurry details, subtitles, style, artwork, painting, still frame, overall grayish,
    # worst quality, low quality, JPEG compression artifacts, ugly, mutilated, extra fingers,
    # poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs,
    # fused fingers, motionless frame, cluttered background, three legs, crowded background,
    # walking backwards".
    negative_prompt = (
        "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,"
        "低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,"
        "毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
    )

    wan_pipeline.to("cuda")
    max_area = 480 * 832
    mod_value = wan_pipeline.vae_scale_factor_spatial * wan_pipeline.transformer.config.patch_size[1]
    height, width = compute_hw_from_area(image.height, image.width, max_area, mod_value)
    image = image.resize((width, height))

    progress(0.4, desc=f"Generating {target_num_frames} frames (this may take a few minutes)...")
    generator = torch.Generator(device="cuda").manual_seed(0)
    with torch.inference_mode():
        result = wan_pipeline(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_frames=target_num_frames,
            guidance_scale=3.5,
            num_inference_steps=50,
            generator=generator,
            motion_signal_video_path=motion_video_path,
            motion_signal_mask_path=mask_video_path,
            tweak_index=int(tweak_index),
            tstrong_index=int(tstrong_index),
        )

    output_path = os.path.join(os.path.dirname(first_frame_path), "wan_ttm_output.mp4")
    export_to_video(result.frames[0], output_path, fps=16)
    return output_path, f"✅ TTM Video ({target_num_frames} frames) generated successfully!"
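# process_video ties the stages together: load and subsample the clip to at most MAX_FRAMES,
# run the spatial tracker, derive the scene scale from the median valid depth, synthesize the
# requested camera trajectory, and render the draft video plus the TTM motion-signal and mask
# inputs. It returns file paths so the Gradio UI can both display them and stash them in State.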
def process_video(video_path, camera_movement, generate_ttm=True, progress=gr.Progress()):
    if video_path is None:
        return None, None, None, None, "❌ Please upload a video first"

    progress(0, desc="Initializing...")
    temp_dir = create_user_temp_dir()
    out_dir = os.path.join(temp_dir, "results")
    os.makedirs(out_dir, exist_ok=True)

    try:
        progress(0.1, desc="Loading video...")
        video_reader = decord.VideoReader(video_path)
        video_tensor = torch.from_numpy(
            video_reader.get_batch(range(len(video_reader))).asnumpy()
        ).permute(0, 3, 1, 2).float()
        # Temporally subsample to at most MAX_FRAMES, then cap the spatial size.
        video_tensor = video_tensor[::max(1, len(video_tensor) // MAX_FRAMES)][:MAX_FRAMES]
        h, w = video_tensor.shape[2:]
        scale = 336 / min(h, w)
        if scale < 1:
            video_tensor = T.Resize((int(h * scale) // 2 * 2, int(w * scale) // 2 * 2))(video_tensor)

        progress(0.4, desc="Running 3D tracking...")
        tracking_results = run_spatial_tracker(video_tensor)
        rgb_frames = rearrange(
            tracking_results['video_out'].numpy(), "T C H W -> T H W C").astype(np.uint8)
        depth_frames = tracking_results['point_map'][:, 2].numpy()
        depth_frames[tracking_results['conf_depth'].numpy() < 0.5] = 0
        scene_scale = np.median(depth_frames[depth_frames > 0]) if np.any(depth_frames > 0) else 1.0
        new_exts = generate_camera_trajectory(
            len(rgb_frames), camera_movement, tracking_results['intrs_out'].numpy(), scene_scale)

        progress(0.8, desc="Rendering viewpoint...")
        output_video_path = os.path.join(out_dir, "rendered_video.mp4")
        render_results = render_from_pointcloud(
            rgb_frames, depth_frames, tracking_results['intrs_out'].numpy(),
            torch.inverse(tracking_results['c2w_traj']).numpy(),
            new_exts, output_video_path, fps=OUTPUT_FPS, generate_ttm_inputs=generate_ttm)

        first_frame_path = os.path.join(out_dir, "first_frame.png")
        cv2.imwrite(first_frame_path, cv2.cvtColor(rgb_frames[0], cv2.COLOR_RGB2BGR))

        status_msg = "✅ 3D results ready! You can now use the prompt below to generate a high-quality TTM video."
        return render_results['rendered'], render_results['motion_signal'], render_results['mask'], first_frame_path, status_msg
    except Exception as e:
        logger.error(f"Error: {e}")
        return None, None, None, None, f"❌ Error: {str(e)}"
# --- GRADIO INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft(), title="🎬 TTM Wan Video Generator") as demo:
    gr.Markdown("# 🎬 Video to Point Cloud & TTM Wan Generator")
    gr.Markdown(
        "Transform standard videos into 3D-aware motion signals for Time-to-Move (TTM) generation.")

    first_frame_file = gr.State("")
    motion_signal_file = gr.State("")
    mask_file = gr.State("")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Tracking & Viewpoint")
            video_input = gr.Video(label="Upload Video")
            camera_movement = gr.Dropdown(
                choices=CAMERA_MOVEMENTS,
                value="static",
                label="Camera Movement"
            )
            generate_btn = gr.Button("🚀 1. Run Spatial Tracker", variant="primary")
            output_video = gr.Video(label="Point Cloud Render (Draft)")
            status_text = gr.Markdown("Ready...")
        with gr.Column(scale=1):
            gr.Markdown("### 2. Time-to-Move (Wan 2.2)")
            ttm_prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the scene (e.g., 'A monkey walking in the forest, high quality')"
            )
            with gr.Row():
                tweak_idx = gr.Number(label="Tweak Index", value=3, precision=0)
                tstrong_idx = gr.Number(label="Tstrong Index", value=6, precision=0)
            wan_generate_btn = gr.Button("✨ 2. Generate TTM Video (Wan)", variant="secondary")
            wan_output_video = gr.Video(label="Final High-Quality TTM Video")
            wan_status = gr.Markdown("Awaiting 3D inputs...")

    with gr.Accordion("Debug: TTM Intermediate Inputs", open=False):
        with gr.Row():
            motion_signal_output = gr.Video(label="motion_signal.mp4")
            mask_output = gr.Video(label="mask.mp4")
            first_frame_output = gr.Image(label="first_frame.png", type="filepath")
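    # Event wiring: process_video fans its outputs out to the visible components, then the
    # .then() step copies the motion-signal, mask and first-frame file paths into gr.State
    # so the Wan generation button can consume them without re-running the tracker.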
    generate_btn.click(
        fn=process_video,
        inputs=[video_input, camera_movement],
        outputs=[
            output_video,
            motion_signal_output,
            mask_output,
            first_frame_output,
            status_text
        ]
    ).then(
        fn=lambda a, b, c, d, e: (b, c, d),
        inputs=[
            output_video,
            motion_signal_output,
            mask_output,
            first_frame_output,
            status_text
        ],
        outputs=[motion_signal_file, mask_file, first_frame_file]
    )

    wan_generate_btn.click(
        fn=run_wan_ttm_generation,
        inputs=[
            ttm_prompt,
            tweak_idx,
            tstrong_idx,
            first_frame_file,
            motion_signal_file,
            mask_file
        ],
        outputs=[wan_output_video, wan_status]
    )

if __name__ == "__main__":
    demo.launch(share=False)