Commit 48571f9
1 Parent(s): 1e533fd
Upload folder using huggingface_hub
Browse files:
- data/data.ipynb +0 -0
- data/output.csv +19 -20
- data/output/18.mp4 +0 -0
- data/output/19.mp4 +0 -0
- data/output/20.mp4 +0 -0
- data/output/21.mp4 +0 -0
- data/output/22.mp4 +0 -0
- data/output/23.mp4 +0 -0
- data/output/24.mp4 +0 -0
- data/output/25.mp4 +0 -0
- data/output/26.mp4 +0 -0
- data/output/27.mp4 +0 -0
- data/output/28.mp4 +0 -0
- data/output/29.mp4 +0 -0
- data/output/30.mp4 +0 -0
- data/output/31.mp4 +0 -0
- data/output/32.mp4 +0 -0
- data/output/33.mp4 +0 -0
- data/output/34.mp4 +0 -0
- handler.py +98 -21
- models/Motion_Module/test/config.yaml +4 -4
data/data.ipynb CHANGED
The diff for this file is too large to render. See raw diff.

data/output.csv CHANGED
@@ -12,26 +12,25 @@ videoid,name,page_dir
 10,"camera panning right to left, a bird's - eye view of a row of buildings in a city with trees in the foreground",raw/3dDS1RL
 11,"camera panning left to right, a computer generated image of a blue diamond in the middle of a green and leafy area",raw/3dDS20
 12,"camera panning left to right, a computer generated image of a landscape with trees, rocks, and a path through a forest",raw/3dDS21
-13,"camera panning
+13,"camera panning right to left, a computer generated image of a building with a blue roof and a blue roof on top of it",raw/3dDS22RL
 14,"camera panning left to right, a map of a small town with a lot of trees and bushes in the middle of it",raw/3dDS23
-15,"camera panning
+15,"camera panning right to left, a computer generated image of a building with a blue roof and a green lawn in front of it",raw/3dDS24RL
 16,"camera panning left to right, an artist's rendering of a floating island in the middle of a large body of water",raw/3dDS25
 17,"camera panning left to right, a couple of white birds floating on top of a lake filled with waterlily green leaves",raw/3dDS26
-18,"camera panning left to right, a
-19,"camera panning
-20,"camera panning
-21,"camera panning
-22,"camera panning left to right, an
-23,"camera panning left to right, an
-24,"camera panning left to right,
-25,"camera panning
-26,"camera panning
-27,"camera panning
-28,"camera panning left to right, a
-29,"camera panning left to right, a bird's
-30,"camera panning left to right, a bird's eye view of a
-31,"camera panning left to right, a
-32,"camera panning
-33,"camera panning right to left, a
-34,"camera panning
-35,"camera panning left to right, a computer generated image of a city street with tall buildings and a clock tower in the distance",raw/3dDS9
+18,"camera panning left to right, a 3d rendering of a jail cell with bars and bars on each side of the cell wall",raw/3dDS28
+19,"camera panning right to left, an artist's rendering of a bridge over a body of water with a building in the background",raw/3dDS29RL
+20,"camera panning right to left, a bird's eye view of a building with a lot of trees in front of it",raw/3dDS2RL
+21,"camera panning left to right, an artist's rendering of a highway with a bridge in the middle and trees on both sides",raw/3dDS30
+22,"camera panning left to right, an aerial view of a freeway with multiple lanes and a bridge in the foreground and a cityscape in the background",raw/3dDS31
+23,"camera panning left to right, an artist's rendering of a bridge over a body of water with a clock tower in the background",raw/3dDS32
+24,"camera panning left to right, a set of stairs leading up to the top of a set of stairs in a dimly lit area",raw/3dDS33
+25,"camera panning right to left, a 3d rendering of a subway station with red and yellow striped barriers and a red and white sign",raw/3dDS34RL
+26,"camera panning right to left, a screenshot of a subway station with a man in a suit and a woman in a dress",raw/3dDS35RL
+27,"camera panning right to left, a screenshot of a hallway in a building with a gate and a sign on the wall",raw/3dDS36RL
+28,"camera panning left to right, a bird's - eye view of a row of houses in the suburbs of a city",raw/3dDS3LR
+29,"camera panning left to right, a bird's eye view of a house with solar panels on the top of the roof",raw/3dDS4
+30,"camera panning left to right, a bird's eye view of a row of houses with solar panels on top of them",raw/3dDS5
+31,"camera panning left to right, a screenshot of a city street with a bench on one side and a building on the other",raw/3dDS6
+32,"camera panning right to left, a computer generated image of a computer generated image of a construction site with a truck in the background",raw/3dDS7RL
+33,"camera panning right to left, a screenshot of a city with a bunch of green objects in the middle of the street",raw/3dDS8RL
+34,"camera panning left to right, a computer generated image of a city street with tall buildings and a clock tower in the distance",raw/3dDS9

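The rewritten rows follow the manifest schema videoid,name,page_dir, and the recaptioned ids 18 through 34 line up with the data/output/*.mp4 clips replaced below. A minimal sketch of reading the manifest, assuming pandas is available and that clips are named data/output/<videoid>.mp4 (an inference from the files touched by this commit, not something the repository code guarantees):

import os
import pandas as pd

# Load the caption manifest; columns are videoid, name, page_dir.
df = pd.read_csv("data/output.csv")
for _, row in df.iterrows():
    # Hypothetical clip naming inferred from the replaced files below.
    clip = os.path.join("data/output", f"{row['videoid']}.mp4")
    if os.path.exists(clip):
        print(row["videoid"], row["name"], clip)
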
data/output/18.mp4 CHANGED
Binary files a/data/output/18.mp4 and b/data/output/18.mp4 differ

data/output/19.mp4 CHANGED
Binary files a/data/output/19.mp4 and b/data/output/19.mp4 differ

data/output/20.mp4 CHANGED
Binary files a/data/output/20.mp4 and b/data/output/20.mp4 differ

data/output/21.mp4 CHANGED
Binary files a/data/output/21.mp4 and b/data/output/21.mp4 differ

data/output/22.mp4 CHANGED
Binary files a/data/output/22.mp4 and b/data/output/22.mp4 differ

data/output/23.mp4 CHANGED
Binary files a/data/output/23.mp4 and b/data/output/23.mp4 differ

data/output/24.mp4 CHANGED
Binary files a/data/output/24.mp4 and b/data/output/24.mp4 differ

data/output/25.mp4 CHANGED
Binary files a/data/output/25.mp4 and b/data/output/25.mp4 differ

data/output/26.mp4 CHANGED
Binary files a/data/output/26.mp4 and b/data/output/26.mp4 differ

data/output/27.mp4 CHANGED
Binary files a/data/output/27.mp4 and b/data/output/27.mp4 differ

data/output/28.mp4 CHANGED
Binary files a/data/output/28.mp4 and b/data/output/28.mp4 differ

data/output/29.mp4 CHANGED
Binary files a/data/output/29.mp4 and b/data/output/29.mp4 differ

data/output/30.mp4 CHANGED
Binary files a/data/output/30.mp4 and b/data/output/30.mp4 differ

data/output/31.mp4 CHANGED
Binary files a/data/output/31.mp4 and b/data/output/31.mp4 differ

data/output/32.mp4 CHANGED
Binary files a/data/output/32.mp4 and b/data/output/32.mp4 differ

data/output/33.mp4 CHANGED
Binary files a/data/output/33.mp4 and b/data/output/33.mp4 differ

data/output/34.mp4 CHANGED
Binary files a/data/output/34.mp4 and b/data/output/34.mp4 differ

handler.py CHANGED
@@ -9,6 +9,8 @@ import os
 import json
 import base64
 
+from safetensors import safe_open
+
 from diffusers.utils.import_utils import is_xformers_available
 from typing import Any
 import torch
@@ -21,7 +23,10 @@ from animatediff.models.unet import UNet3DConditionModel
 from animatediff.pipelines.pipeline_animation import AnimationPipeline
 from animatediff.utils.util import save_videos_grid
 from animatediff.utils.util import load_weights
+from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
+from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora
 
+current_model = "backup"
 
 class EndpointHandler():
     def __init__(self, model_path: str = "bluestarburst/AnimateDiff-SceneFusion"):
@@ -46,6 +51,15 @@ class EndpointHandler():
 
         unet = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path=unet_model_path, unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs), config_path=unet_config_path)
 
+        # inv_latent_path = f"{OUTPUT_DIR}/inv_latents/ddim_latent-1.pt"
+        inv_latent_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/inv_latents/ddim_latent-1.pt")
+        self.latents = torch.load(inv_latent_path).to(torch.float)
+        print(self.latents.shape, self.latents.dtype)
+
+        # torch.backends.cuda.enable_mem_efficient_sdp(True)
+        torch.backends.cuda.enable_flash_sdp(True)
+        torch.backends.cuda.enable_math_sdp(True)
+
         if is_xformers_available(): unet.enable_xformers_memory_efficient_attention()
         else: assert False
 
@@ -56,18 +70,67 @@ class EndpointHandler():
 
         # huggingface download motion module from bluestarburst/AnimateDiff-SceneFusion/models/Motion_Module/mm_sd_v15.ckpt
 
-        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
-
-
-
-
-
-
-
-
-
-
+        # motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
+        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/mm.pth")
+        # LORA_DREAMBOOTH_PATH="models/DreamBooth_LoRA/toonyou_beta3.safetensors"
+
+        LORA_DREAMBOOTH_PATH = None
+        LORA_DREAMBOOTH_PATH = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/DreamBooth_LoRA/toonyou_beta3.safetensors")
+
+        # self.pipeline = load_weights(
+        #     self.pipeline,
+        #     # motion module
+        #     motion_module_path = motion_module,
+        #     motion_module_lora_configs = [],
+        #     # image layers
+        #     dreambooth_model_path = "",
+        #     lora_model_path = "",
+        #     lora_alpha = 0.8,
+        # ).to("cuda")
+
+        motion_module_state_dict = torch.load(motion_module, map_location="cpu")
+        missing, unexpected = self.pipeline.unet.load_state_dict(motion_module_state_dict, strict=False)
+        assert len(unexpected) == 0
+
+
+        # FIX THIS
+        if LORA_DREAMBOOTH_PATH != "":
+            if LORA_DREAMBOOTH_PATH.endswith(".ckpt"):
+                state_dict = torch.load(LORA_DREAMBOOTH_PATH)
+                self.pipeline.unet.load_state_dict(state_dict)
+
+            elif LORA_DREAMBOOTH_PATH.endswith(".safetensors"):
+                state_dict = {}
+                with safe_open(LORA_DREAMBOOTH_PATH, framework="pt", device="cpu") as f:
+                    for key in f.keys():
+                        state_dict[key] = f.get_tensor(key)
+
+                is_lora = all("lora" in k for k in state_dict.keys())
+                if not is_lora:
+                    base_state_dict = state_dict
+                else:
+                    base_state_dict = {}
+                    with safe_open("", framework="pt", device="cpu") as f:
+                        for key in f.keys():
+                            base_state_dict[key] = f.get_tensor(key)
+
+                # vae
+                converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_state_dict, self.pipeline.vae.config)
+                self.pipeline.vae.load_state_dict(converted_vae_checkpoint)
+                # unet
+                converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_state_dict, self.pipeline.unet.config)
+                self.pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
+                # text_model (TODO: problem here)
+                # converted_test_encoder_checkpoint = convert_ldm_clip_checkpoint(base_state_dict)
+                # pipeline.text_encoder = converted_test_encoder_checkpoint
+
+                # import pdb
+                # pdb.set_trace()
+                if is_lora:
+                    self.pipeline = convert_lora(self.pipeline, state_dict)
+                    # self.pipeline = convert_lora(self.pipeline, state_dict, alpha=model_config.lora_alpha)
+
+        self.pipeline.to("cuda")
 
     def __call__(self, data : Any):
         """
@@ -76,19 +139,33 @@
         """
 
         prompt = data.pop("prompt", "")
-        negative_prompt = data.pop("negative_prompt", "
+        negative_prompt = data.pop("negative_prompt", "")
+        negative_prompt += ",easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality"
         steps = data.pop("steps", 25)
         guidance_scale = data.pop("guidance_scale", 12.5)
 
+        print(f"current seed: {torch.initial_seed()}")
+        print(f"sampling {prompt} ...")
         vids = self.pipeline(
-            prompt
-            negative_prompt=negative_prompt,
-            num_inference_steps=steps,
-            guidance_scale=guidance_scale,
-            width= 256,
-            height= 256,
-            video_length= 5,
-
+            prompt,
+            negative_prompt = negative_prompt,
+            num_inference_steps = steps,
+            guidance_scale = guidance_scale,
+            width = 256,
+            height = 256,
+            video_length = 5,
+            latents = self.latents,
+        ).videos
+
+        # vids = self.pipeline(
+        #     prompt=prompt,
+        #     negative_prompt=negative_prompt,
+        #     num_inference_steps=steps,
+        #     guidance_scale=guidance_scale,
+        #     width= 256,
+        #     height= 256,
+        #     video_length= 5,
+        # ).videos
 
         videos = rearrange(vids, "b c t h w -> t b c h w")
         n_rows=6

models/Motion_Module/test/config.yaml CHANGED
@@ -1,5 +1,5 @@
-pretrained_model_path:
-output_dir:
+pretrained_model_path: models/StableDiffusion/
+output_dir: models/Motion_Module/test
 train_data:
   video_folder: data/output
   csv_path: data/output.csv
@@ -23,7 +23,7 @@ train_whole_module: false
 trainable_modules:
   - to_q
 train_batch_size: 1
-max_train_steps:
+max_train_steps: 1
 learning_rate: 0.0003
 scale_lr: false
 lr_scheduler: constant
@@ -42,7 +42,7 @@ mixed_precision: fp16
 use_8bit_adam: false
 enable_xformers_memory_efficient_attention: true
 seed: 33
-motion_module:
+motion_module: models/Motion_Module/mm_sd_v15.ckpt
 inference_config_path: configs/inference/inference-v3.yaml
 motion_module_pe_multiplier: 1
 dataset_class: MultiTuneAVideoDataset
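
Since handler.py already parses its inference config with OmegaConf, the newly filled-in training values can be sanity-checked the same way. A minimal sketch, assuming omegaconf is installed and the path is resolved from the repository root:

from omegaconf import OmegaConf

# Load the training config touched by this commit and echo the values
# that were previously empty.
cfg = OmegaConf.load("models/Motion_Module/test/config.yaml")
print(cfg.pretrained_model_path)  # models/StableDiffusion/
print(cfg.output_dir)             # models/Motion_Module/test
print(cfg.max_train_steps)        # 1
print(cfg.motion_module)          # models/Motion_Module/mm_sd_v15.ckpt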