Commit b3ee019
Parent(s): 381e596

Fix pipeline

Changed files:
- .gitignore (+3 -1)
- bapp.py (+26 -13)
- requirements.txt (+6 -0)
- test.py (+12 -0)
- test_video.py (+11 -0)
- animate.py → visual_anagrams/animate.py (+15 -6)
- visual_anagrams/samplers.py (+21 -10)
.gitignore CHANGED

@@ -1,3 +1,5 @@
 env/
 __pycache__/
-assets/
+assets/
+*.png
+*.mp4
bapp.py CHANGED

@@ -4,10 +4,12 @@ from pathlib import Path
 import gradio as gr
 import torch
 from diffusers import DiffusionPipeline
+from icecream import ic
 
 from visual_anagrams.views import get_views, VIEW_MAP_NAMES
 from visual_anagrams.samplers import sample_stage_1, sample_stage_2
 from visual_anagrams.utils import add_args, save_illusion, save_metadata
+from visual_anagrams.animate import animate_two_view
 
 stage_1 = DiffusionPipeline.from_pretrained(
     "DeepFloyd/IF-I-M-v1.0",
@@ -31,23 +33,26 @@ def generate_content(
     num_inference_steps,
     seed
 ):
-    prompts = [prompt_for_original, prompt_for_transformed]
-    prompt_embeds = [stage_1.encode_prompt(
+    prompts = [f'{style} {p}'.strip() for p in [prompt_for_original, prompt_for_transformed]]
+    prompt_embeds = [stage_1.encode_prompt(p) for p in prompts]
     prompt_embeds, negative_prompt_embeds = zip(*prompt_embeds)
     prompt_embeds = torch.cat(prompt_embeds)
     negative_prompt_embeds = torch.cat(negative_prompt_embeds)
 
-    views = ['identity', transformation]
+    views = ['identity', VIEW_MAP_NAMES[transformation]]
     views = get_views(views)
 
     generator = torch.manual_seed(seed)
+
+    print("Sample stage 1")
     image = sample_stage_1(stage_1,
-
-
-
-
-
+                           prompt_embeds,
+                           negative_prompt_embeds,
+                           views,
+                           num_inference_steps=num_inference_steps,
+                           generator=generator)
 
+    print("Sample stage 2")
     image = sample_stage_2(stage_2,
                            image,
                            prompt_embeds,
@@ -55,8 +60,16 @@
                            views,
                            num_inference_steps=num_inference_steps,
                            generator=generator)
+    save_illusion(image, views, Path(""))
 
-
+    size = image.shape[-1]
+    animate_two_view(
+        f"sample_{size}.png",
+        views[1],
+        prompts[0],
+        prompts[1],
+    )
+    return 'tmp.mp4', f"sample_{size}.png", f"sample_{size}.views.png"
 
 
 choices = list(VIEW_MAP_NAMES.keys())
@@ -64,13 +77,13 @@ gradio_app = gr.Interface(
     fn=generate_content,
     inputs=[
         gr.Textbox(label="Style", placeholder="an oil painting of"),
-        gr.Textbox(label="Prompt for original view", placeholder="a
-        gr.Textbox(label="Prompt for transformed view", placeholder="
+        gr.Textbox(label="Prompt for original view", placeholder="a dress"),
+        gr.Textbox(label="Prompt for transformed view", placeholder="an old man"),
         gr.Dropdown(label="View transformation", choices=choices, value=choices[0]),
-        gr.Number(label="Number of diffusion steps", value=
+        gr.Number(label="Number of diffusion steps", value=100, step=1, minimum=1, maximum=300),
         gr.Number(label="Random seed", value=0, step=1, minimum=0, maximum=100000)
     ],
-    outputs=[gr.
+    outputs=[gr.Video(label="Illusion"), gr.Image(label="Original"), gr.Image(label="Transformed")],
 )
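Note (not part of the commit): for local testing, the updated interface can be started in the usual Gradio way. A minimal sketch, assuming bapp.py exposes gradio_app exactly as in the diff above; the queue()/launch() call itself is not shown in this commit:

    from bapp import gradio_app

    if __name__ == "__main__":
        # queue() serializes GPU-bound requests; launch() starts the local server.
        gradio_app.queue().launch()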
|
requirements.txt CHANGED

@@ -2,9 +2,15 @@ accelerate
 diffusers
 einops
 gradio
+icecream
+imageio
+imageio[ffmpeg]
+imageio[pyav]
+opencv-python
 safetensors
 sentencepiece
 transformers
 torch
 torchvision
+tqdm
 xformers
test.py ADDED

@@ -0,0 +1,12 @@
+from bapp import generate_content
+
+if __name__ == "__main__":
+    print(generate_content(
+        "a painting of",
+        "vases",
+        "a sloth",
+        "Flip",
+        1,
+        0
+    ))
+
test_video.py ADDED

@@ -0,0 +1,11 @@
+from visual_anagrams.animate import animate_two_view
+from visual_anagrams.views import get_views
+
+if __name__ == "__main__":
+    animate_two_view(
+        "sample_256.png",
+        get_views(["identity", "flip"])[1],
+        "a painting of vases",
+        "a painting of a sloth",
+        save_video_path="tmp3.mp4",
+    )
animate.py → visual_anagrams/animate.py RENAMED

@@ -1,3 +1,4 @@
+import cv2
 from tqdm import tqdm
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -13,12 +14,11 @@ def draw_text(image, text, fill=(0,0,0), frame_size=384, im_size=256):
     image = image.copy()
 
     # Font info
-    font_path = get_courier_font_path()
     font_size = 16
 
     # Make PIL objects
     draw = ImageDraw.Draw(image)
-    font = ImageFont.
+    font = ImageFont.load_default()
 
     # Center text horizontally, and vertically between
     # illusion bottom and frame bottom
@@ -117,10 +117,19 @@ def animate_two_view(
 
     # Convert PIL images to numpy arrays
     image_array = [imageio.core.asarray(frame) for frame in frames]
-
-
-    print(
-
+    f = image_array[0]
+    print(f.dtype)
+    print(f.shape)
+    print(frame_size)
+    print(np.min(f), np.max(f))
+    print(len(image_array))
+
+    # Save as video using opencv
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video = cv2.VideoWriter(save_video_path, fourcc, 30, (frame_size, frame_size))
+    for frame in image_array:
+        video.write(frame)
+    video.release()
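Note (an observation, not part of the commit): cv2.VideoWriter expects BGR uint8 frames, while the arrays produced from PIL/imageio above are RGB, so the written video's red and blue channels may come out swapped. A minimal sketch of a per-frame conversion, reusing the image_array, save_video_path, and frame_size names from the diff:

    import cv2
    import numpy as np

    def write_video_bgr(image_array, save_video_path, frame_size, fps=30):
        # Convert each RGB frame to BGR before handing it to OpenCV's writer.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video = cv2.VideoWriter(save_video_path, fourcc, fps, (frame_size, frame_size))
        for frame in image_array:
            video.write(cv2.cvtColor(np.asarray(frame), cv2.COLOR_RGB2BGR))
        video.release()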
|
visual_anagrams/samplers.py CHANGED

@@ -1,4 +1,5 @@
 from tqdm import tqdm
+from icecream import ic
 
 import torch
 import torch.nn.functional as F
@@ -42,8 +43,9 @@ def sample_stage_1(model,
         device,
         generator,
     )
+    # ic(noisy_images.shape)
 
-    for i, t in enumerate(
+    for i, t in tqdm(enumerate(timesteps)):
         # Apply views to noisy_image
         viewed_noisy_images = []
         for view_fn in views:
@@ -56,6 +58,7 @@ def sample_stage_1(model,
         model_input = model.scheduler.scale_model_input(model_input, t)
 
         # Predict noise estimate
+        # print("Predicting noise estimate")
         noise_pred = model.unet(
             model_input,
             t,
@@ -63,9 +66,11 @@ def sample_stage_1(model,
             cross_attention_kwargs=None,
             return_dict=False,
         )[0]
+        # ic(noise_pred.shape)
 
         # Extract uncond (neg) and cond noise estimates
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        # ic(noise_pred_uncond.shape)
 
         # Invert the unconditional (negative) estimates
         inverted_preds = []
@@ -73,6 +78,7 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_uncond = torch.stack(inverted_preds)
+        # ic(noise_pred_uncond.shape)
 
         # Invert the conditional estimates
         inverted_preds = []
@@ -80,11 +86,13 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_text = torch.stack(inverted_preds)
+        # ic(noise_pred_text.shape)
 
         # Split into noise estimate and variance estimates
         noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
         noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        # ic(noise_pred.shape)
 
         # Reduce predicted noise and variances
         noise_pred = noise_pred.view(-1,num_prompts,3,64,64)
@@ -98,11 +106,14 @@ def sample_stage_1(model,
         else:
             raise ValueError('Reduction must be either `mean` or `alternate`')
         noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+        # ic(noise_pred.shape)
 
+        # ic(t.shape)
         # compute the previous noisy sample x_t -> x_t-1
         noisy_images = model.scheduler.step(
-            noise_pred, t, noisy_images, generator=generator, return_dict=False
+            noise_pred.to('cuda'), t, noisy_images.to('cuda'), generator=generator, return_dict=False
         )[0]
+        # ic(noisy_images.shape)
 
     # Return denoised images
     return noisy_images
@@ -149,34 +160,34 @@ def sample_stage_2(model,
         prompt_embeds.dtype,
         device,
         generator,
-    )
+    ).to('cuda')
 
     # Prepare upscaled image and noise level
     image = model.preprocess_image(image, num_images_per_prompt, device)
-    upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+    upscaled = F.interpolate(image.to('cuda'), (height, width), mode="bilinear", align_corners=True).to('cuda')
 
     noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
     noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
-    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level).to('cuda')
 
     # Condition on noise level, for each model input
-    noise_level = torch.cat([noise_level] * num_prompts * 2)
+    noise_level = torch.cat([noise_level] * num_prompts * 2).to('cuda')
 
     # Denoising Loop
     for i, t in enumerate(tqdm(timesteps)):
         # Cat noisy image with upscaled conditioning image
-        model_input = torch.cat([noisy_images, upscaled], dim=1)
+        model_input = torch.cat([noisy_images, upscaled], dim=1).to('cuda')
 
         # Apply views to noisy_image
         viewed_inputs = []
         for view_fn in views:
             viewed_inputs.append(view_fn.view(model_input[0]))
-        viewed_inputs = torch.stack(viewed_inputs)
+        viewed_inputs = torch.stack(viewed_inputs).to('cuda')
 
         # Duplicate inputs for CFG
         # Model input is: [ neg_0, neg_1, ..., pos_0, pos_1, ... ]
-        model_input = torch.cat([viewed_inputs] * 2)
-        model_input = model.scheduler.scale_model_input(model_input, t)
+        model_input = torch.cat([viewed_inputs] * 2).to('cuda')
+        model_input = model.scheduler.scale_model_input(model_input, t).to('cuda')
 
         # predict the noise residual
         noise_pred = model.unet(