Commit 48571f9
1 Parent(s): 1e533fd
Upload folder using huggingface_hub
Browse files:
- data/data.ipynb +0 -0
- data/output.csv +19 -20
- data/output/18.mp4 +0 -0
- data/output/19.mp4 +0 -0
- data/output/20.mp4 +0 -0
- data/output/21.mp4 +0 -0
- data/output/22.mp4 +0 -0
- data/output/23.mp4 +0 -0
- data/output/24.mp4 +0 -0
- data/output/25.mp4 +0 -0
- data/output/26.mp4 +0 -0
- data/output/27.mp4 +0 -0
- data/output/28.mp4 +0 -0
- data/output/29.mp4 +0 -0
- data/output/30.mp4 +0 -0
- data/output/31.mp4 +0 -0
- data/output/32.mp4 +0 -0
- data/output/33.mp4 +0 -0
- data/output/34.mp4 +0 -0
- handler.py +98 -21
- models/Motion_Module/test/config.yaml +4 -4
data/data.ipynb CHANGED
The diff for this file is too large to render. See raw diff.

data/output.csv CHANGED
@@ -12,26 +12,25 @@ videoid,name,page_dir
 10,"camera panning right to left, a bird's - eye view of a row of buildings in a city with trees in the foreground",raw/3dDS1RL
 11,"camera panning left to right, a computer generated image of a blue diamond in the middle of a green and leafy area",raw/3dDS20
 12,"camera panning left to right, a computer generated image of a landscape with trees, rocks, and a path through a forest",raw/3dDS21
-13,"camera panning
+13,"camera panning right to left, a computer generated image of a building with a blue roof and a blue roof on top of it",raw/3dDS22RL
 14,"camera panning left to right, a map of a small town with a lot of trees and bushes in the middle of it",raw/3dDS23
-15,"camera panning
+15,"camera panning right to left, a computer generated image of a building with a blue roof and a green lawn in front of it",raw/3dDS24RL
 16,"camera panning left to right, an artist's rendering of a floating island in the middle of a large body of water",raw/3dDS25
 17,"camera panning left to right, a couple of white birds floating on top of a lake filled with waterlily green leaves",raw/3dDS26
-18,"camera panning left to right, a
-19,"camera panning
-20,"camera panning
-21,"camera panning
-22,"camera panning left to right, an
-23,"camera panning left to right, an
-24,"camera panning left to right,
-25,"camera panning
-26,"camera panning
-27,"camera panning
-28,"camera panning left to right, a
-29,"camera panning left to right, a bird's
-30,"camera panning left to right, a bird's eye view of a
-31,"camera panning left to right, a
-32,"camera panning
-33,"camera panning right to left, a
-34,"camera panning
-35,"camera panning left to right, a computer generated image of a city street with tall buildings and a clock tower in the distance",raw/3dDS9
+18,"camera panning left to right, a 3d rendering of a jail cell with bars and bars on each side of the cell wall",raw/3dDS28
+19,"camera panning right to left, an artist's rendering of a bridge over a body of water with a building in the background",raw/3dDS29RL
+20,"camera panning right to left, a bird's eye view of a building with a lot of trees in front of it",raw/3dDS2RL
+21,"camera panning left to right, an artist's rendering of a highway with a bridge in the middle and trees on both sides",raw/3dDS30
+22,"camera panning left to right, an aerial view of a freeway with multiple lanes and a bridge in the foreground and a cityscape in the background",raw/3dDS31
+23,"camera panning left to right, an artist's rendering of a bridge over a body of water with a clock tower in the background",raw/3dDS32
+24,"camera panning left to right, a set of stairs leading up to the top of a set of stairs in a dimly lit area",raw/3dDS33
+25,"camera panning right to left, a 3d rendering of a subway station with red and yellow striped barriers and a red and white sign",raw/3dDS34RL
+26,"camera panning right to left, a screenshot of a subway station with a man in a suit and a woman in a dress",raw/3dDS35RL
+27,"camera panning right to left, a screenshot of a hallway in a building with a gate and a sign on the wall",raw/3dDS36RL
+28,"camera panning left to right, a bird's - eye view of a row of houses in the suburbs of a city",raw/3dDS3LR
+29,"camera panning left to right, a bird's eye view of a house with solar panels on the top of the roof",raw/3dDS4
+30,"camera panning left to right, a bird's eye view of a row of houses with solar panels on top of them",raw/3dDS5
+31,"camera panning left to right, a screenshot of a city street with a bench on one side and a building on the other",raw/3dDS6
+32,"camera panning right to left, a computer generated image of a computer generated image of a construction site with a truck in the background",raw/3dDS7RL
+33,"camera panning right to left, a screenshot of a city with a bunch of green objects in the middle of the street",raw/3dDS8RL
+34,"camera panning left to right, a computer generated image of a city street with tall buildings and a clock tower in the distance",raw/3dDS9

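The rewritten rows follow the manifest schema videoid,name,page_dir, and the recaptioned ids 18 through 34 line up with the data/output/*.mp4 clips replaced below. A minimal sketch of reading the manifest, assuming pandas is available and that clips are named data/output/<videoid>.mp4 (an inference from the files touched by this commit, not something the repository code guarantees):

import os
import pandas as pd

# Load the caption manifest; columns are videoid, name, page_dir.
df = pd.read_csv("data/output.csv")
for _, row in df.iterrows():
    # Hypothetical clip naming inferred from the replaced files below.
    clip = os.path.join("data/output", f"{row['videoid']}.mp4")
    if os.path.exists(clip):
        print(row["videoid"], row["name"], clip)
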
data/output/18.mp4 CHANGED
Binary files a/data/output/18.mp4 and b/data/output/18.mp4 differ

data/output/19.mp4 CHANGED
Binary files a/data/output/19.mp4 and b/data/output/19.mp4 differ

data/output/20.mp4 CHANGED
Binary files a/data/output/20.mp4 and b/data/output/20.mp4 differ

data/output/21.mp4 CHANGED
Binary files a/data/output/21.mp4 and b/data/output/21.mp4 differ

data/output/22.mp4 CHANGED
Binary files a/data/output/22.mp4 and b/data/output/22.mp4 differ

data/output/23.mp4 CHANGED
Binary files a/data/output/23.mp4 and b/data/output/23.mp4 differ

data/output/24.mp4 CHANGED
Binary files a/data/output/24.mp4 and b/data/output/24.mp4 differ

data/output/25.mp4 CHANGED
Binary files a/data/output/25.mp4 and b/data/output/25.mp4 differ

data/output/26.mp4 CHANGED
Binary files a/data/output/26.mp4 and b/data/output/26.mp4 differ

data/output/27.mp4 CHANGED
Binary files a/data/output/27.mp4 and b/data/output/27.mp4 differ

data/output/28.mp4 CHANGED
Binary files a/data/output/28.mp4 and b/data/output/28.mp4 differ

data/output/29.mp4 CHANGED
Binary files a/data/output/29.mp4 and b/data/output/29.mp4 differ

data/output/30.mp4 CHANGED
Binary files a/data/output/30.mp4 and b/data/output/30.mp4 differ

data/output/31.mp4 CHANGED
Binary files a/data/output/31.mp4 and b/data/output/31.mp4 differ

data/output/32.mp4 CHANGED
Binary files a/data/output/32.mp4 and b/data/output/32.mp4 differ

data/output/33.mp4 CHANGED
Binary files a/data/output/33.mp4 and b/data/output/33.mp4 differ

data/output/34.mp4 CHANGED
Binary files a/data/output/34.mp4 and b/data/output/34.mp4 differ

handler.py CHANGED
@@ -9,6 +9,8 @@ import os
 import json
 import base64
 
+from safetensors import safe_open
+
 from diffusers.utils.import_utils import is_xformers_available
 from typing import Any
 import torch
@@ -21,7 +23,10 @@ from animatediff.models.unet import UNet3DConditionModel
 from animatediff.pipelines.pipeline_animation import AnimationPipeline
 from animatediff.utils.util import save_videos_grid
 from animatediff.utils.util import load_weights
+from animatediff.utils.convert_from_ckpt import convert_ldm_unet_checkpoint, convert_ldm_clip_checkpoint, convert_ldm_vae_checkpoint
+from animatediff.utils.convert_lora_safetensor_to_diffusers import convert_lora
 
+current_model = "backup"
 
 class EndpointHandler():
     def __init__(self, model_path: str = "bluestarburst/AnimateDiff-SceneFusion"):
@@ -46,6 +51,15 @@ class EndpointHandler():
 
         unet = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path=unet_model_path, unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs), config_path=unet_config_path)
 
+        # inv_latent_path = f"{OUTPUT_DIR}/inv_latents/ddim_latent-1.pt"
+        inv_latent_path = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/inv_latents/ddim_latent-1.pt")
+        self.latents = torch.load(inv_latent_path).to(torch.float)
+        print(self.latents.shape, self.latents.dtype)
+
+        # torch.backends.cuda.enable_mem_efficient_sdp(True)
+        torch.backends.cuda.enable_flash_sdp(True)
+        torch.backends.cuda.enable_math_sdp(True)
+
         if is_xformers_available(): unet.enable_xformers_memory_efficient_attention()
         else: assert False
 
@@ -56,18 +70,67 @@ class EndpointHandler():
 
         # huggingface download motion module from bluestarburst/AnimateDiff-SceneFusion/models/Motion_Module/mm_sd_v15.ckpt
 
-        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
-
-
-
-
-
-
-
-
-
-
+        # motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/Motion_Module/mm_sd_v15.ckpt")
+        motion_module = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename=f"models/Motion_Module/{current_model}/mm.pth")
+        # LORA_DREAMBOOTH_PATH="models/DreamBooth_LoRA/toonyou_beta3.safetensors"
+
+        LORA_DREAMBOOTH_PATH = None
+        LORA_DREAMBOOTH_PATH = hf_hub_download(repo_id="bluestarburst/AnimateDiff-SceneFusion", filename="models/DreamBooth_LoRA/toonyou_beta3.safetensors")
+
+        # self.pipeline = load_weights(
+        #     self.pipeline,
+        #     # motion module
+        #     motion_module_path = motion_module,
+        #     motion_module_lora_configs = [],
+        #     # image layers
+        #     dreambooth_model_path = "",
+        #     lora_model_path = "",
+        #     lora_alpha = 0.8,
+        # ).to("cuda")
+
+        motion_module_state_dict = torch.load(motion_module, map_location="cpu")
+        missing, unexpected = self.pipeline.unet.load_state_dict(motion_module_state_dict, strict=False)
+        assert len(unexpected) == 0
+
+
+        # FIX THIS
+        if LORA_DREAMBOOTH_PATH != "":
+            if LORA_DREAMBOOTH_PATH.endswith(".ckpt"):
+                state_dict = torch.load(LORA_DREAMBOOTH_PATH)
+                self.pipeline.unet.load_state_dict(state_dict)
+
+            elif LORA_DREAMBOOTH_PATH.endswith(".safetensors"):
+                state_dict = {}
+                with safe_open(LORA_DREAMBOOTH_PATH, framework="pt", device="cpu") as f:
+                    for key in f.keys():
+                        state_dict[key] = f.get_tensor(key)
+
+                is_lora = all("lora" in k for k in state_dict.keys())
+                if not is_lora:
+                    base_state_dict = state_dict
+                else:
+                    base_state_dict = {}
+                    with safe_open("", framework="pt", device="cpu") as f:
+                        for key in f.keys():
+                            base_state_dict[key] = f.get_tensor(key)
+
+                # vae
+                converted_vae_checkpoint = convert_ldm_vae_checkpoint(base_state_dict, self.pipeline.vae.config)
+                self.pipeline.vae.load_state_dict(converted_vae_checkpoint)
+                # unet
+                converted_unet_checkpoint = convert_ldm_unet_checkpoint(base_state_dict, self.pipeline.unet.config)
+                self.pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
+                # text_model (TODO: problem here)
+                # converted_test_encoder_checkpoint = convert_ldm_clip_checkpoint(base_state_dict)
+                # pipeline.text_encoder = converted_test_encoder_checkpoint
+
+                # import pdb
+                # pdb.set_trace()
+                if is_lora:
+                    self.pipeline = convert_lora(self.pipeline, state_dict)
+                    # self.pipeline = convert_lora(self.pipeline, state_dict, alpha=model_config.lora_alpha)
+
+        self.pipeline.to("cuda")
 
     def __call__(self, data : Any):
         """
@@ -76,19 +139,33 @@
         """
 
         prompt = data.pop("prompt", "")
-        negative_prompt = data.pop("negative_prompt", "
+        negative_prompt = data.pop("negative_prompt", "")
+        negative_prompt += ",easynegative,bad_construction,bad_structure,bad_wail,bad_windows,blurry,cloned_window,cropped,deformed,disfigured,error,extra_windows,extra_chimney,extra_door,extra_structure,extra_frame,fewer_digits,fused_structure,gross_proportions,jpeg_artifacts,long_roof,low_quality,structure_limbs,missing_windows,missing_doors,missing_roofs,mutated_structure,mutation,normal_quality,out_of_frame,owres,poorly_drawn_structure,poorly_drawn_house,signature,text,too_many_windows,ugly,username,uta,watermark,worst_quality"
         steps = data.pop("steps", 25)
         guidance_scale = data.pop("guidance_scale", 12.5)
 
+        print(f"current seed: {torch.initial_seed()}")
+        print(f"sampling {prompt} ...")
         vids = self.pipeline(
-            prompt
-            negative_prompt=negative_prompt,
-            num_inference_steps=steps,
-            guidance_scale=guidance_scale,
-            width= 256,
-            height= 256,
-            video_length= 5,
-
+            prompt,
+            negative_prompt = negative_prompt,
+            num_inference_steps = steps,
+            guidance_scale = guidance_scale,
+            width = 256,
+            height = 256,
+            video_length = 5,
+            latents = self.latents,
+        ).videos
+
+        # vids = self.pipeline(
+        #     prompt=prompt,
+        #     negative_prompt=negative_prompt,
+        #     num_inference_steps=steps,
+        #     guidance_scale=guidance_scale,
+        #     width= 256,
+        #     height= 256,
+        #     video_length= 5,
+        # ).videos
 
         videos = rearrange(vids, "b c t h w -> t b c h w")
         n_rows=6

models/Motion_Module/test/config.yaml CHANGED
@@ -1,5 +1,5 @@
-pretrained_model_path:
-output_dir:
+pretrained_model_path: models/StableDiffusion/
+output_dir: models/Motion_Module/test
 train_data:
   video_folder: data/output
   csv_path: data/output.csv
@@ -23,7 +23,7 @@ train_whole_module: false
 trainable_modules:
   - to_q
 train_batch_size: 1
-max_train_steps:
+max_train_steps: 1
 learning_rate: 0.0003
 scale_lr: false
 lr_scheduler: constant
@@ -42,7 +42,7 @@ mixed_precision: fp16
 use_8bit_adam: false
 enable_xformers_memory_efficient_attention: true
 seed: 33
-motion_module:
+motion_module: models/Motion_Module/mm_sd_v15.ckpt
 inference_config_path: configs/inference/inference-v3.yaml
 motion_module_pe_multiplier: 1
 dataset_class: MultiTuneAVideoDataset
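
Since handler.py already parses its inference config with OmegaConf, the newly filled-in training values can be sanity-checked the same way. A minimal sketch, assuming omegaconf is installed and the path is resolved from the repository root:

from omegaconf import OmegaConf

# Load the training config touched by this commit and echo the values
# that were previously empty.
cfg = OmegaConf.load("models/Motion_Module/test/config.yaml")
print(cfg.pretrained_model_path)  # models/StableDiffusion/
print(cfg.output_dir)             # models/Motion_Module/test
print(cfg.max_train_steps)        # 1
print(cfg.motion_module)          # models/Motion_Module/mm_sd_v15.ckpt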