leoeric committed
Commit 0b4562b · 0 Parent(s)

Initial commit for HF Space - code files only
app.py ADDED
@@ -0,0 +1,249 @@
+ """
+ Hugging Face Space for STARFlow
+ Text-to-Image and Text-to-Video Generation
+
+ This app allows you to run STARFlow inference on Hugging Face GPU infrastructure.
+ """
+
+ import os
+ import gradio as gr
+ import torch
+ import subprocess
+ import pathlib
+ from pathlib import Path
+
+ # Check if running on Hugging Face Spaces
+ HF_SPACE = os.environ.get("SPACE_ID") is not None
+
+ # Verify CUDA availability (will be True on HF Spaces with GPU hardware)
+ if torch.cuda.is_available():
+     print(f"✅ CUDA available! Device: {torch.cuda.get_device_name(0)}")
+     print(f" CUDA Version: {torch.version.cuda}")
+     print(f" PyTorch Version: {torch.__version__}")
+ else:
+     print("⚠️ CUDA not available. Make sure GPU hardware is selected in Space settings.")
+
+ def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
+     """Generate image from text prompt."""
+     if checkpoint_file is None:
+         return None, "Error: Please upload a checkpoint file."
+
+     # Handle Gradio file object
+     if hasattr(checkpoint_file, 'name'):
+         checkpoint_path = checkpoint_file.name
+     else:
+         checkpoint_path = str(checkpoint_file)
+
+     if not os.path.exists(checkpoint_path):
+         return None, f"Error: Checkpoint file not found at {checkpoint_path}."
+
+     if not config_path or not os.path.exists(config_path):
+         return None, "Error: Config file not found. Please ensure config file exists."
+
+     try:
+         # Create output directory
+         output_dir = Path("outputs")
+         output_dir.mkdir(exist_ok=True)
+
+         # Run sampling command
+         cmd = [
+             "python", "sample.py",
+             "--model_config_path", config_path,
+             "--checkpoint_path", checkpoint_path,
+             "--caption", prompt,
+             "--sample_batch_size", "1",
+             "--cfg", str(cfg),
+             "--aspect_ratio", aspect_ratio,
+             "--seed", str(seed),
+             "--save_folder", "1",
+             "--finetuned_vae", "none",
+             "--jacobi", "1",
+             "--jacobi_th", "0.001",
+             "--jacobi_block_size", "16"
+         ]
+
+         result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+         if result.returncode != 0:
+             return None, f"Error: {result.stderr}"
+
+         # Find the generated image
+         # The sample.py script saves to logdir/model_name/...
+         # We need to find the most recent output
+         output_files = list(output_dir.glob("**/*.png")) + list(output_dir.glob("**/*.jpg"))
+         if output_files:
+             latest_file = max(output_files, key=lambda p: p.stat().st_mtime)
+             return str(latest_file), "Success! Image generated."
+         else:
+             return None, "Error: Generated image not found."
+
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ def generate_video(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image):
+     """Generate video from text prompt."""
+     if checkpoint_file is None:
+         return None, "Error: Please upload a checkpoint file."
+
+     # Handle Gradio file object
+     if hasattr(checkpoint_file, 'name'):
+         checkpoint_path = checkpoint_file.name
+     else:
+         checkpoint_path = str(checkpoint_file)
+
+     if not os.path.exists(checkpoint_path):
+         return None, f"Error: Checkpoint file not found at {checkpoint_path}."
+
+     if not config_path or not os.path.exists(config_path):
+         return None, "Error: Config file not found. Please ensure config file exists."
+
+     # Handle input image
+     input_image_path = None
+     if input_image is not None:
+         if hasattr(input_image, 'name'):
+             input_image_path = input_image.name
+         else:
+             input_image_path = str(input_image)
+
+     try:
+         # Create output directory
+         output_dir = Path("outputs")
+         output_dir.mkdir(exist_ok=True)
+
+         # Run sampling command
+         cmd = [
+             "python", "sample.py",
+             "--model_config_path", config_path,
+             "--checkpoint_path", checkpoint_path,
+             "--caption", prompt,
+             "--sample_batch_size", "1",
+             "--cfg", str(cfg),
+             "--aspect_ratio", aspect_ratio,
+             "--seed", str(seed),
+             "--out_fps", "16",
+             "--save_folder", "1",
+             "--jacobi", "1",
+             "--jacobi_th", "0.001",
+             "--finetuned_vae", "none",
+             "--disable_learnable_denoiser", "0",
+             "--jacobi_block_size", "32",
+             "--target_length", str(target_length)
+         ]
+
+         if input_image_path and os.path.exists(input_image_path):
+             cmd.extend(["--input_image", input_image_path])
+         else:
+             cmd.extend(["--input_image", "none"])
+
+         result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+         if result.returncode != 0:
+             return None, f"Error: {result.stderr}"
+
+         # Find the generated video
+         output_files = list(output_dir.glob("**/*.mp4")) + list(output_dir.glob("**/*.gif"))
+         if output_files:
+             latest_file = max(output_files, key=lambda p: p.stat().st_mtime)
+             return str(latest_file), "Success! Video generated."
+         else:
+             return None, "Error: Generated video not found."
+
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="STARFlow - Text-to-Image & Video Generation") as demo:
+     gr.Markdown("""
+     # STARFlow: Scalable Transformer Auto-Regressive Flow
+
+     Generate high-quality images and videos from text prompts using STARFlow models.
+
+     **Note**: You'll need to upload model checkpoints. Check the README for model download links.
+     """)
+
+     with gr.Tabs():
+         with gr.Tab("Text-to-Image"):
+             with gr.Row():
+                 with gr.Column():
+                     image_prompt = gr.Textbox(
+                         label="Prompt",
+                         placeholder="a film still of a cat playing piano",
+                         lines=3
+                     )
+                     image_checkpoint = gr.File(
+                         label="Model Checkpoint (.pth file)",
+                         file_types=[".pth"]
+                     )
+                     image_config = gr.Textbox(
+                         label="Config Path",
+                         value="configs/starflow_3B_t2i_256x256.yaml",
+                         placeholder="configs/starflow_3B_t2i_256x256.yaml"
+                     )
+                     image_aspect = gr.Dropdown(
+                         choices=["1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4"],
+                         value="1:1",
+                         label="Aspect Ratio"
+                     )
+                     image_cfg = gr.Slider(1.0, 10.0, value=3.6, step=0.1, label="CFG Scale")
+                     image_seed = gr.Number(value=999, label="Seed", precision=0)
+                     image_btn = gr.Button("Generate Image", variant="primary")
+
+                 with gr.Column():
+                     image_output = gr.Image(label="Generated Image")
+                     image_status = gr.Textbox(label="Status", interactive=False)
+
+             image_btn.click(
+                 fn=generate_image,
+                 inputs=[image_prompt, image_aspect, image_cfg, image_seed, image_checkpoint, image_config],
+                 outputs=[image_output, image_status],
+                 show_progress=True
+             )
+
+         with gr.Tab("Text-to-Video"):
+             with gr.Row():
+                 with gr.Column():
+                     video_prompt = gr.Textbox(
+                         label="Prompt",
+                         placeholder="a corgi dog looks at the camera",
+                         lines=3
+                     )
+                     video_checkpoint = gr.File(
+                         label="Model Checkpoint (.pth file)",
+                         file_types=[".pth"]
+                     )
+                     video_config = gr.Textbox(
+                         label="Config Path",
+                         value="configs/starflow-v_7B_t2v_caus_480p.yaml",
+                         placeholder="configs/starflow-v_7B_t2v_caus_480p.yaml"
+                     )
+                     video_aspect = gr.Dropdown(
+                         choices=["16:9", "1:1", "4:3"],
+                         value="16:9",
+                         label="Aspect Ratio"
+                     )
+                     video_cfg = gr.Slider(1.0, 10.0, value=3.5, step=0.1, label="CFG Scale")
+                     video_seed = gr.Number(value=99, label="Seed", precision=0)
+                     video_length = gr.Slider(81, 481, value=81, step=80, label="Target Length (frames)")
+                     video_input_image = gr.File(
+                         label="Input Image (optional, for image-to-video)",
+                         file_types=["image"]
+                     )
+                     video_btn = gr.Button("Generate Video", variant="primary")
+
+                 with gr.Column():
+                     video_output = gr.Video(label="Generated Video")
+                     video_status = gr.Textbox(label="Status", interactive=False)
+
+             video_btn.click(
+                 fn=generate_video,
+                 inputs=[video_prompt, video_aspect, video_cfg, video_seed, video_length,
+                         video_checkpoint, video_config, video_input_image],
+                 outputs=[video_output, video_status],
+                 show_progress=True
+             )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
+
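Note on checkpoints: the UI above expects the user to upload a `.pth` file by hand. A minimal, hedged sketch of an alternative is shown below — fetching the weights from the Hugging Face Hub at startup instead. The repo id and filename are hypothetical placeholders, not confirmed locations of the STARFlow weights; see the README for the real download links.

# Sketch only: fetch a checkpoint from the Hub instead of a manual upload.
# repo_id and filename below are hypothetical placeholders.
from huggingface_hub import hf_hub_download

def fetch_checkpoint(repo_id: str, filename: str) -> str:
    """Download (or reuse a cached copy of) a checkpoint and return its local path."""
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Example (hypothetical identifiers):
# checkpoint_path = fetch_checkpoint("some-org/starflow-weights", "starflow_3B_t2i_256x256.pth")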
configs/captions/ar_video_prompts_original.txt ADDED
@@ -0,0 +1,48 @@
+ A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Historical footage of California during the gold rush.
+ A close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ An extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Step‑printing scene of a person running, cinematic film shot in 35 mm.
+ An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
+ A Chinese man sits at a table and eats noodles with chopsticks.
+ Carefully pouring the milk into the cup, the milk flows smoothly and the cup gradually fills with milky‑white color.
+ A rally car taking a fast turn on a track.
+ Close‑up of a bright blue parrot's feathers glittering in the light, showing vibrant colors.
+ A white and orange tabby cat happily darts through a dense garden from a ground‑level perspective, cinematic warm tones and grain.
+ A FPV drone shot through a castle on a cliff.
+ Over‑the‑shoulder shot of a woman running and watching a rocket in the distance.
+ A pink pig running fast toward the camera in an alley in Tokyo.
+ In a still frame, a tranquil pond fringed by weeping cherry trees, blossoms drifting onto the glassy surface.
+ In a still frame, the Parthenon's majestic Doric columns stand atop the Acropolis, framed by the Athenian landscape.
+ A person drinking coffee in a cafe.
+ A motorcycle accelerating to gain speed.
+ A train speeding down the tracks.
+ Panda playing the guitar.
+ A slow cinematic push‑in on an ostrich standing in a 1980s kitchen.
+ A cyclone of broken glass in an urban alleyway, dynamic movement.
+ A man standing in front of a burning building giving the thumbs‑up sign.
+ A woman singing on a concert stage with a bright backlight.
+ Dragon‑toucan walking through the Serengeti.
+ Aerial view of Santorini during the blue hour, showcasing white Cycladic buildings with blue domes and the caldera.
+ Animated scene: a short fluffy monster kneels beside a melting red candle, gazing at the flame with wonder.
+ A cyclist powering up a steep hill in a road race.
+ A trio of seahorses holding onto seagrass with their tails.
+ A seal eagerly catching tossed fish from a trainer.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: historical footage of California during the gold rush.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: the camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: step‑printing scene of a person running, cinematic film shot in 35 mm.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
configs/captions/ar_video_prompts_videogen.txt ADDED
@@ -0,0 +1,36 @@
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: historical footage of California during the gold rush.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: the camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: step‑printing scene of a person running, cinematic film shot in 35 mm.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a Chinese man sits at a table and eats noodles with chopsticks.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: carefully pouring the milk into the cup, the milk flows smoothly and the cup gradually fills with milky‑white color.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a rally car taking a fast turn on a track.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: close‑up of a bright blue parrot's feathers glittering in the light, showing vibrant colors.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a white and orange tabby cat happily darts through a dense garden from a ground‑level perspective, cinematic warm tones and grain.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a FPV drone shot through a castle on a cliff.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: over‑the‑shoulder shot of a woman running and watching a rocket in the distance.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a pink pig running fast toward the camera in an alley in Tokyo.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: in a still frame, a tranquil pond fringed by weeping cherry trees, blossoms drifting onto the glassy surface.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: in a still frame, the Parthenon's majestic Doric columns stand atop the Acropolis, framed by the Athenian landscape.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a person drinking coffee in a cafe.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a motorcycle accelerating to gain speed.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a train speeding down the tracks.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: panda playing the guitar.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a slow cinematic push‑in on an ostrich standing in a 1980s kitchen.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a cyclone of broken glass in an urban alleyway, dynamic movement.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a man standing in front of a burning building giving the thumbs‑up sign.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a woman singing on a concert stage with a bright backlight.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: dragon‑toucan walking through the Serengeti.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: aerial view of Santorini during the blue hour, showcasing white Cycladic buildings with blue domes and the caldera.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: animated scene: a short fluffy monster kneels beside a melting red candle, gazing at the flame with wonder.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a cyclist powering up a steep hill in a road race.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a trio of seahorses holding onto seagrass with their tails.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a seal eagerly catching tossed fish from a trainer.
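The caption files above hold one prompt per line. A minimal sketch of how such a file could be swept through the same `sample.py` command line that `app.py` assembles is shown below; it assumes `sample.py` accepts one `--caption` per invocation (the script may well support a caption-file flag directly, which is not shown here), and the config/checkpoint paths are just examples.

# Sketch: run sample.py once per prompt in a caption file, mirroring the flags used in app.py.
import subprocess
from pathlib import Path

def sample_from_caption_file(caption_file: str, config_path: str, checkpoint_path: str) -> None:
    for prompt in Path(caption_file).read_text(encoding="utf-8").splitlines():
        prompt = prompt.strip()
        if not prompt:
            continue  # skip blank lines
        cmd = [
            "python", "sample.py",
            "--model_config_path", config_path,
            "--checkpoint_path", checkpoint_path,
            "--caption", prompt,
            "--sample_batch_size", "1",
        ]
        subprocess.run(cmd, check=True)

# Example (paths are illustrative):
# sample_from_caption_file("configs/captions/ar_video_prompts_videogen.txt",
#                          "configs/starflow-v_7B_t2v_caus_480p.yaml", "checkpoint.pth")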
configs/captions/editing.txt ADDED
@@ -0,0 +1,118 @@
+ A red vintage bicycle leaning against a brick wall [edit] change the bicycle’s color to teal
+ A calico cat sleeping on a sunny windowsill [edit] add a small potted cactus beside the cat
+ A steaming mug of coffee on a marble countertop [edit] replace the coffee with green matcha tea
+ A wooden rowboat floating on a crystal lake [edit] change the time of day to golden sunset
+ A rustic bookshelf filled with old novels [edit] insert a glowing crystal on the middle shelf
+ A bowl of fresh strawberries on a white tablecloth [edit] remove three strawberries from the bowl
+ A snow-covered cabin in a pine forest [edit] turn the scene into midsummer with lush greenery
+ A street artist painting a mural on a brick wall [edit] make the mural feature abstract geometric shapes
+ A golden retriever running across a beach [edit] add a colorful kite flying in the sky
+ A glass terrarium containing succulents [edit] replace the succulents with blooming orchids
+ A ceramic teapot beside two porcelain cups [edit] change the teapot design to blue willow pattern
+ A mountain landscape under clear blue sky [edit] add dramatic storm clouds rolling in
+ A violin resting on velvet fabric [edit] apply a vintage sepia tone to the entire image
+ A busy city street at night with neon signs [edit] remove all people from the scene
+ A marble statue in a museum gallery [edit] add a soft spotlight highlighting the statue’s face
+ A plate of sushi arranged neatly on slate [edit] replace tuna pieces with avocado slices
+ A lighthouse overlooking a rocky shore [edit] change the lighthouse stripes to red and white
+ A cyclist on a winding countryside road [edit] turn the season to autumn with orange foliage
+ A steaming bowl of ramen on a wooden table [edit] add extra slices of boiled egg on top
+ A young sapling planted in fertile soil [edit] transform the sapling into a full-grown oak
+ An astronaut floating in outer space [edit] replace the Earth in background with Saturn
+ A quaint café terrace with empty chairs [edit] add a black cat sitting on one chair
+ A desert scene with a single cactus [edit] introduce a small oasis with palm trees in the distance
+ A handwritten love letter on parchment [edit] change the ink color to royal blue
+ A crystal chandelier hanging in a ballroom [edit] increase brightness to make it sparkle intensely
+ A surfer riding a large ocean wave [edit] add a dolphin jumping alongside the surfer
+ A clay pot spinning on a pottery wheel [edit] apply a rainbow glaze pattern to the pot
+ A quiet library with tall wooden shelves [edit] add floating dust motes in sunbeams
+ A steaming cup of herbal tea with lemon slice [edit] remove the lemon and add a sprig of mint
+ A sleek black sports car on a mountain road [edit] change the car color to bright yellow
+ A picnic blanket spread on green grass [edit] add a wicker basket filled with fruit
+ A snowy owl perched on a pine branch [edit] make the owl’s eyes a vibrant amber color
+ A cobblestone alley lined with lanterns [edit] turn the scene into daytime with clear sky
+ A chef plating gourmet pasta in a kitchen [edit] replace the pasta sauce with pesto
+ A koi pond with lily pads and flowers [edit] add five more colorful koi fish
+ A rustic wooden door with iron hinges [edit] paint the door a weathered turquoise shade
+ A bouquet of wildflowers in a glass vase [edit] remove two tallest flowers to shorten arrangement
+ A stack of pancakes topped with syrup [edit] add fresh blueberries and powdered sugar
+ A commuter train passing through countryside [edit] change the season to winter with snowfall
+ A puppy wearing a red bandana [edit] change the bandana color to pastel green
+ An artist’s desk cluttered with brushes [edit] neatly organize the brushes into a cup
+ A glowing full moon over a calm lake [edit] replace the moon with a crescent shape
+ A bamboo steamer filled with dumplings [edit] add steam rising more visibly from the dumplings
+ A violinist performing on a small stage [edit] dim the background lights for a spotlight effect
+ A vineyard at harvest time [edit] change grapes from green to deep purple
+ A rustic windmill in a sunflower field [edit] remove the windmill blades entirely
+ A glass of red wine on a wooden barrel [edit] switch the wine color to white wine
+ A paperback book open on beach sand [edit] add gentle ocean waves reaching the book edge
+ A hallway lined with ornate mirrors [edit] turn the flooring into black-and-white checkerboard
+ A basket of fresh oranges [edit] add a peeled orange with segments showing
+ A vintage typewriter on a desk [edit] change the paper to display the words “Hello World”
+ A fisherman casting a line at dawn [edit] move the sun position higher into late morning
+ A city skyline at twilight [edit] increase building window lights for a vibrant look
+ A steaming bowl of tomato soup [edit] garnish with a swirl of cream on top
+ A classic motorcycle parked by roadside [edit] add raindrops for a freshly rained-on effect
+ A painter mixing colors on a palette [edit] replace one primary color with metallic gold paint
+ A terrier wearing a raincoat [edit] change the raincoat pattern to polka dots
+ A cozy fireplace with burning logs [edit] intensify the flames for a warmer glow
+ A cappuccino with latte art heart [edit] change the latte art to a leaf design
+ A chessboard mid-game [edit] remove the black queen from the board
+ A vintage camera resting on a map [edit] add a passport beside the camera
+ A plate of macarons in pastel colors [edit] swap pink macarons to lavender hue
+ A child flying a paper airplane [edit] turn the background into a starry night sky
+ A bakery display of croissants [edit] add powdered sugar dusting on croissants
+ A hummingbird hovering near a flower [edit] freeze the wings for sharp detail
+ A busy farmer’s market stall [edit] remove price tags from all produce
+ A kayak on a tranquil river [edit] change water color to emerald green
+ A stack of vinyl records beside a turntable [edit] add a glowing neon sign saying “Now Playing”
+ A spiral staircase in an old tower [edit] brighten ambient light for clearer details
+ A latte served in a glass cup [edit] make the froth height taller by 20%
+ A soccer ball on a grassy field [edit] add morning dew drops on grass blades
+ A bookshelf with color-coded novels [edit] shuffle books into random order
+ A geisha holding a parasol [edit] change parasol pattern to cherry blossoms
+ A mountain biker on forest trail [edit] add motion blur to background for action feel
+ A garden gnome standing among tulips [edit] paint the gnome’s hat bright yellow
+ A street food cart serving tacos [edit] replace tacos with sushi rolls
+ A ceramic mug with chipped rim [edit] restore the rim to perfect condition
+ A violin bow resting beside sheet music [edit] add handwritten annotations on the sheet
+ A classic lamp post beside a foggy road [edit] turn fog into light snowfall
+ A bowl of green apples [edit] change two apples to bright red
+ A skyline reflected in calm river [edit] add gentle ripples to distort reflection
+ A barista pouring latte art [edit] slow shutter effect to create milk swirl trails
+ A polaroid photo pinned to corkboard [edit] fade colors for a vintage look
+ A snowboarder on a snowy slope [edit] increase snow spray for dynamic action
+ A rustic barn in golden wheat field [edit] turn wheat color to early spring green
+ An open sketchbook with pencil drawing [edit] add watercolor wash to the sketch
+ A chef sharpening a kitchen knife [edit] replace knife with a wooden spoon
+ A sunflower facing the sun [edit] rotate flower to face viewer instead
+ A crystal perfume bottle on silken fabric [edit] scatter light rainbow refractions around bottle
+ A laptop on a café table [edit] change screen content to code editor view
+ A violin case lying open on stage floor [edit] add velvet lining texture detail
+ A swimmer diving into a pool [edit] add underwater bubbles trailing the swimmer
+ A campfire under starry sky [edit] increase star density for a Milky Way effect
+ A gardener watering roses [edit] change watering can to a copper finish
+ A bowl of ramen with chopsticks resting [edit] replace chopsticks with a wooden spoon
+ A gothic archway in ancient ruins [edit] add creeping ivy on stone walls
+ A bicycle leaning against tree [edit] convert scene to black-and-white photograph
+ A plate of grilled salmon [edit] add lemon wedges on side
+ A pianist’s hands on grand piano keys [edit] add sheet music on stand illuminated softly
+ A street dog lying on warm pavement [edit] change pavement to cobblestone texture
+ A cratered lunar landscape [edit] add Earth rise on horizon
+ A pearl necklace on velvet pillow [edit] increase pearl luster for extra shine
+ A mason jar filled with fireflies [edit] add soft glowing light escaping lid gaps
+ A shuttle bus arriving at snowy station [edit] remove snow for a springtime setting
+ A woman practicing yoga on cliff edge [edit] shift sunrise colors to sunset palette
+ A stack of colorful gift boxes [edit] replace top box ribbon with a bow
+ A sailboat on calm turquoise sea [edit] add a distant island silhouette
+ A stack of chocolate chip cookies [edit] add melting chocolate drizzle
+ A vintage radio on windowsill [edit] change radio color to mint green
+ A sculpture of abstract metal shapes [edit] convert metal texture to brushed gold
+ A firefighter spraying water on flames [edit] increase water spray for dramatic arc
+ A desert highway stretching to horizon [edit] add tumbleweed crossing the road
+ A fluffy kitten playing with yarn ball [edit] replace yarn color with multicolor rainbow threads
+ An ancient stone bridge over river [edit] add lanterns hanging under arches
+ A scientist examining test tubes [edit] change liquid color to bright purple
+ A row of wind turbines on rolling hills [edit] turn scene into twilight with orange sky
+ A scuba diver exploring coral reef [edit] add a school of colorful fish surrounding diver
+ A cafe latte on a wooden tray [edit] stencil a star pattern in cocoa on froth
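Each line in the editing caption files pairs a source description with an edit instruction, separated by the literal token `[edit]`. A small illustrative sketch of splitting such a line for downstream use (the function name is just an example, not part of the repo):

# Sketch: split "<source caption> [edit] <instruction>" into its two parts.
def parse_edit_prompt(line: str) -> tuple[str, str]:
    source, _, instruction = line.partition("[edit]")
    return source.strip(), instruction.strip()

# parse_edit_prompt("A red vintage bicycle leaning against a brick wall [edit] change the bicycle’s color to teal")
# -> ("A red vintage bicycle leaning against a brick wall", "change the bicycle’s color to teal")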
configs/captions/editing2.txt ADDED
@@ -0,0 +1,158 @@
1
+ A jade teapot on a bamboo mat [edit] change the teapot to polished copper
2
+ A marble bust lit by soft window light [edit] rotate the bust 30° to the left
3
+ A commuter train entering a tunnel [edit] replace the train with a freight locomotive
4
+ A medieval stone bridge over misty river [edit] add flower boxes along the railing
5
+ A neon food truck at midnight market [edit] switch neon sign color to violet
6
+ A bonsai pine on a wooden shelf [edit] enlarge the bonsai roots onto a rock slab
7
+ A cyclist on a snow-dusted trail [edit] remove all snow for summer scenery
8
+ A row of lab flasks with blue liquid [edit] change liquid to gradient rainbow
9
+ A grand piano on a concert hall stage [edit] lower the key cover halfway
10
+ A lighthouse beam cutting through fog [edit] intensify light to double brightness
11
+ An astronaut planting a flag on Mars [edit] swap Mars sand for lunar regolith
12
+ A vintage typewriter beside ink bottle [edit] add dried lavender sprig on keys
13
+ A plate of macarons in pastel hues [edit] convert entire scene to monochrome
14
+ A sunflower field at golden hour [edit] turn one sunflower to face the viewer
15
+ A violin resting on velvet cloth [edit] change violin varnish from brown to deep red
16
+ A mountain climber reaching the summit [edit] add celebratory confetti burst
17
+ A glass of iced lemonade on patio table [edit] remove ice cubes completely
18
+ A ceramic mug with hand-painted flowers [edit] replace floral pattern with stripes
19
+ A koi pond with rippling water [edit] freeze the water surface perfectly still
20
+ A bookshelf of worn leather tomes [edit] insert a glowing holographic book
21
+ A city skyline reflected on wet pavement [edit] change reflection to fragmented shards
22
+ A chef garnishing sushi rolls [edit] replace sushi with colorful mochi
23
+ A freight ship in calm harbor [edit] add gentle morning fog around hull
24
+ A ballerina mid-pirouette on empty stage [edit] add motion trail behind tutu
25
+ A rustic barn in golden wheat field [edit] turn wheat into lavender plants
26
+ A steaming bowl of pho on wooden table [edit] add fresh chili slices floating
27
+ A geodesic dome in arctic landscape [edit] shift lighting to sunset pinks
28
+ An old pocket watch on velvet [edit] set time from 4:00 to 10:10
29
+ An oak tree with autumn leaves [edit] transition leaves to spring green
30
+ A skateboarder grinding rail [edit] add spark particles at contact point
31
+ A crystal chandelier in foyer [edit] dim ambient light for stronger contrast
32
+ A swan gliding on mirror lake [edit] create concentric ripples behind swan
33
+ A row of wind turbines on ridge [edit] turn blades into transparent glass
34
+ A latte with leaf-shaped foam art [edit] change art to a rosette pattern
35
+ A gourmet burger on slate board [edit] remove cheese slice entirely
36
+ A paper airplane against blue sky [edit] add sketched contrail behind plane
37
+ A flamenco guitarist on street corner [edit] replace guitar with a violin
38
+ A snow-covered pine forest [edit] add footprints leading into distance
39
+ A stack of vinyl records [edit] change record labels to bright cyan
40
+ An artist mixing oil paints [edit] replace palette knife with fine brush
41
+ A classic red telephone booth [edit] repaint booth matte black
42
+ A glass skyscraper at sunrise [edit] add low-lying rain clouds wrapping base
43
+ A puppy chasing butterflies [edit] swap butterflies for floating bubbles
44
+ A clay pot on pottery wheel [edit] apply crackle glaze texture
45
+ A bridge enveloped in autumn fog [edit] lift fog to reveal river below
46
+ A salad bowl of mixed greens [edit] add sliced strawberries on top
47
+ A desert road with heat haze [edit] insert distant thunderstorm on horizon
48
+ A chess set mid-game [edit] replace black queen with 3-D printed model
49
+ A coral reef teeming with fish [edit] shift palette to infrared false-color
50
+ A fountain in public square [edit] stop water flow to frozen sculptural state
51
+ A maple leaf on wet asphalt [edit] intensify reflections around leaf
52
+ A violinist practicing scales [edit] add metronome on nearby stand
53
+ A bowl of ramen with narutomaki [edit] remove narutomaki slices
54
+ A modern smartwatch on wrist [edit] change wristband to woven fabric
55
+ A candlelit dinner table [edit] extinguish leftmost candle only
56
+ A forest path carpeted with moss [edit] scatter pink cherry petals along trail
57
+ A drone view over rice terraces [edit] turn season to early harvest yellow
58
+ A plate of assorted cheeses [edit] replace blue cheese with goat cheese
59
+ A fox sitting in snowy field [edit] add softly falling snowflakes
60
+ A graffiti mural on brick wall [edit] desaturate bricks, keep mural color
61
+ A leather-bound journal and quill [edit] age journal cover with scratches
62
+ A small cabin under starry sky [edit] add faint aurora on horizon
63
+ A cup of espresso with crema [edit] lighten crema for flat white look
64
+ A bouquet of roses in vase [edit] change roses from red to white
65
+ A lighthouse on rocky coastline [edit] add crashing wave spray
66
+ A billiard table break shot [edit] freeze cue ball before impact
67
+ A hot-air balloon over vineyards [edit] change balloon envelope to stripes
68
+ A snow leopard on rocky ledge [edit] brighten ambient light for detail
69
+ A sushi chef slicing tuna [edit] replace tuna with salmon
70
+ A violin bow resting on strings [edit] slightly raise bow above strings
71
+ A stack of pancakes with butter [edit] drizzle maple syrup generously
72
+ A waterfall in tropical jungle [edit] adjust shutter to silky water effect
73
+ A painter’s easel by window [edit] add soft morning light beam
74
+ A cherry pie cooling on sill [edit] remove lattice crust for open top
75
+ A monorail passing futuristic city [edit] shift city lighting to neon pink-blue
76
+ A rustic windmill beside field [edit] stop windmill blades mid-motion
77
+ A latte art heart [edit] invert colors for photographic negative
78
+ A scuba diver photographing coral [edit] switch diver’s fins to bright yellow
79
+ A snowy owl in flight [edit] extend wingspan slightly wider
80
+ A violin case with sheet music [edit] add handwritten annotations
81
+ A garden gnome among tulips [edit] repaint gnome hat to polka dots
82
+ A stone archway leading to courtyard [edit] add creeping ivy on stone
83
+ A bowl of blueberries [edit] remove two berries for asymmetry
84
+ A photographer in desert dunes [edit] add blowing sand trail behind
85
+ A city street with puddles [edit] enhance neon reflections in puddles
86
+ A close-up of eye with makeup [edit] change iris color to amber
87
+ A vintage car parked roadside [edit] convert entire scene to sepia
88
+ A farmer holding basket of apples [edit] change apples to peaches
89
+ A plate of spaghetti carbonara [edit] sprinkle extra parmesan on top
90
+ A kayaker on serene lake [edit] add low morning mist on water
91
+ A bee on sunflower [edit] isolate bee in spotlight, darken background
92
+ A commuter bike leaning on wall [edit] switch bike frame to bamboo
93
+ A violin soloist on stage [edit] dim background to silhouette orchestra
94
+ A chess clock beside board [edit] set clock to 00:05 time left
95
+ A coffee grinder with beans [edit] reduce bean level by half
96
+ A cliffside pagoda at dawn [edit] add low-lying clouds below pagoda
97
+ A pitcher of lemonade [edit] replace ice cubes with frozen lemon slices
98
+ A forest waterfall in winter [edit] transform to autumn foliage
99
+ A crystal ball on pedestal [edit] show swirling galaxy inside
100
+ A plate of dim sum baskets [edit] stack one extra basket on left
101
+ A gondola on Venetian canal [edit] add lanterns hanging above canal
102
+ A foxglove plant in bloom [edit] desaturate background for subject pop
103
+ A pianist’s hands mid-chord [edit] blur hands for motion effect
104
+ A stack of handmade soaps [edit] emboss top soap with floral logo
105
+ A campfire with marshmallows [edit] increase flame height 30%
106
+ A glass of red wine swirling [edit] switch wine color to rosé
107
+ A street dog drinking from a puddle [edit] remove puddle, replace with food bowl
108
+ A marble statue under skylight [edit] add tiny crack at base
109
+ A subway entrance with stairs [edit] turn scene into rainy evening
110
+ A painter holding color wheel [edit] rotate wheel 90 degrees clockwise
111
+ A plate of tacos with salsa [edit] change salsa from red to green
112
+ A violin decked in flowers [edit] remove all flowers except one rose
113
+ A desert highway at dusk [edit] add old neon motel sign
114
+ A kayak on river rapids [edit] calm water to slow flow
115
+ A snow globe with cabin [edit] add swirling glitter instead of snow
116
+ A chef flipping wok vegetables [edit] increase flame size dramatically
117
+ A castle on cliff at sunset [edit] alter sky to twilight stars
118
+ A latte with leaf art [edit] add cinnamon dusting on froth
119
+ A bonsai tree in tray [edit] add miniature stone lantern near trunk
120
+ A neon-lit alley with puddles [edit] reduce neon signs by half
121
+ A violin bridge macro [edit] add faint dust for realism
122
+ A picnic table with sandwiches [edit] remove one sandwich for negative space
123
+ A city skyline sunrise [edit] shift color palette to dusk purples
124
+ A bowl of ramen [edit] replace noodles with udon
125
+ A journal on wooden desk [edit] add fountain pen diagonal across page
126
+ A barista steaming milk [edit] increase visible steam
127
+ A surfer catching wave [edit] add sun flare in corner
128
+ A vintage film camera [edit] open lens cap slightly
129
+ A plate of donuts [edit] coat top donut with sprinkles
130
+ A horse grazing meadow [edit] extend mane length slightly
131
+ A lighthouse silhouette [edit] brighten beacon light halo
132
+ A violin on chair [edit] place bow across strings
133
+ A chef torching crème brûlée [edit] darken caramelization spots
134
+ A koi pond reflection [edit] add falling cherry petals
135
+ A mountain trail hiker [edit] insert trail signpost
136
+ A glass orb on sand [edit] show inverted reflection sharper
137
+ An espresso shot [edit] double crema thickness
138
+ A windmill at dusk [edit] turn blades into slow blur motion
139
+ A chessboard aerial view [edit] highlight white king with glow
140
+ A bakery croissant stack [edit] dust powdered sugar on top two croissants
141
+ A sunset beach scene [edit] add silhouetted palm in foreground
142
+ A violin scroll close-up [edit] tint varnish warmer amber
143
+ A dripping paintbrush [edit] change paint color to teal
144
+ A bowl of ramen eggs [edit] cut egg halves for yolk view
145
+ A street cafe table [edit] remove one chair for asymmetry
146
+ A forest creek [edit] add stepping stones across water
147
+ A cityscape night sky [edit] add comet trail overhead
148
+ A cup of cocoa with whipped cream [edit] sprinkle cocoa powder on top
149
+ A classical guitar against wall [edit] desaturate background only
150
+ A crystal decanter [edit] fill decanter halfway with amber liquid
151
+ A parchment map [edit] burn map edges slightly
152
+ A snowflake macro [edit] invert to dark field illumination
153
+ A violin tailpiece macro [edit] add subtle wood grain texture
154
+ A racing bike [edit] swap wheels to deep-rim carbon
155
+ A farmer in wheat field [edit] change wheat to barley heads
156
+ A hummingbird feeder [edit] add second bird approaching
157
+ A street market fruit stand [edit] rearrange apples to pyramid
158
+ A calligraphy brush stroke [edit] thicken stroke width 15%
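Note: the entries above follow a "base caption [edit] instruction" convention, one prompt per line. A minimal parsing sketch (illustration only, not part of this commit; the helper names and example path are hypothetical) for splitting the base caption from its edit instruction:

# Minimal sketch, not part of the repository; parse_edit_prompt and the
# example file path below are hypothetical.
from pathlib import Path
from typing import List, Optional, Tuple


def parse_edit_prompt(line: str) -> Tuple[str, Optional[str]]:
    """Split '<base caption> [edit] <instruction>' into its two parts."""
    if "[edit]" in line:
        caption, instruction = line.split("[edit]", 1)
        return caption.strip(), instruction.strip()
    return line.strip(), None


def load_edit_prompts(path: str) -> List[Tuple[str, Optional[str]]]:
    """Read a caption file, one prompt per non-empty line."""
    lines = Path(path).read_text(encoding="utf-8").splitlines()
    return [parse_edit_prompt(line) for line in lines if line.strip()]


if __name__ == "__main__":
    # Hypothetical filename; substitute the caption file you want to inspect.
    for caption, edit in load_edit_prompts("configs/captions/example_edit.txt"):
        print(f"{caption!r} -> edit: {edit!r}")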
configs/captions/example_human.txt ADDED
@@ -0,0 +1,100 @@
1
+ Concept‑art illustration of a wind‑turbine technician perched atop a wind turbine at sunset, elongated frame guiding eye upward, glittering dust in sunbeams, ultrawide cinematic lens
2
+ Dynamic action shot of a percussionist on a cliffside lighthouse balcony, vertical layout with leading lines, volumetric mist, Baroque oil texture
3
+ Dynamic action shot of a sound foley artist by a bioluminescent tide pool, vertical layout with leading lines, golden‑hour backlight, sepia film grain
4
+ Documentary snapshot of a watchmaker by a bioluminescent tide pool, top‑to‑bottom visual flow, dramatic cloud backdrop, Baroque oil texture
5
+ Hyperreal CGI render of a comic colorist inside a bustling food market, towering vertical composition, glittering dust in sunbeams, isometric voxel aesthetic
6
+ Graphic novel panel showing a northern‑lights tour guide amid glowing jellyfish tanks, portrait orientation emphasizing height, tilt‑shift miniaturization effect, lo‑fi pixel art
7
+ Soft pastel painting of a watchmaker amid glowing jellyfish tanks, portrait orientation emphasizing height, glittering dust in sunbeams, infrared false color
8
+ Concept‑art illustration of a forest firefighter on a storm‑battered sea wall, towering vertical composition, falling snowflakes, isometric voxel aesthetic
9
+ Retro film photograph of a VR game designer at a festival of paper lanterns, towering vertical composition, rim lighting on subject, photoreal 8K detail
10
+ Candid photo of a train signal operator on a cliffside lighthouse balcony, elongated frame guiding eye upward, soft foreground bokeh, ultrawide cinematic lens
11
+ Concept‑art illustration of a data journalist perched atop a wind turbine at sunset, vertical layout with leading lines, golden‑hour backlight, lo‑fi pixel art
12
+ Hyperreal CGI render of a calligraphy artist amid autumn maple leaves, vertical layout with leading lines, golden‑hour backlight, infrared false color
13
+ Dynamic action shot of a roller‑coaster mechanic inside a library tower filled with skylight, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
14
+ High‑fashion editorial of a bike messenger on a cliffside lighthouse balcony, vertical layout with leading lines, dramatic cloud backdrop, pastel watercolor wash
15
+ Documentary snapshot of a train signal operator inside a shipyard dry dock, vertical layout with leading lines, wet pavement reflections, photoreal 8K detail
16
+ Documentary snapshot of an ice sculptor on a floating river dock at dawn, portrait orientation emphasizing height, rim lighting on subject, vivid gouache strokes
17
+ Graphic novel panel showing a paramedic inside a bustling food market, towering vertical composition, glittering dust in sunbeams, lo‑fi pixel art
18
+ High‑fashion editorial of a robotics engineer on a rain‑slick neon street, towering vertical composition, glittering dust in sunbeams, photoreal 8K detail
19
+ Cinematic still of a heritage conservator inside a glass‑roofed greenhouse, top‑to‑bottom visual flow, tilt‑shift miniaturization effect, vivid gouache strokes
20
+ Dynamic action shot of a subway conductor on an urban rooftop garden, top‑to‑bottom visual flow, dramatic cloud backdrop, ultrawide cinematic lens
21
+ Dynamic action shot of a tea ceremony master on a wooden pier in fog, elongated frame guiding eye upward, wet pavement reflections, pastel watercolor wash
22
+ Concept‑art illustration of a bonsai cultivator on a rain‑slick neon street, top‑to‑bottom visual flow, glittering dust in sunbeams, ultrawide cinematic lens
23
+ Dynamic action shot of a robotics engineer on a storm‑battered sea wall, vertical layout with leading lines, soft foreground bokeh, vivid gouache strokes
24
+ Retro film photograph of a bee swarm researcher in a misty bamboo grove, towering vertical composition, falling snowflakes, Baroque oil texture
25
+ Cinematic still of a robotics engineer beside a mirror‑still alpine lake, portrait orientation emphasizing height, volumetric mist, ultrawide cinematic lens
26
+ Concept‑art illustration of a calligraphy artist at a coastal cliff wind farm, towering vertical composition, tilt‑shift miniaturization effect, pastel watercolor wash
27
+ Documentary snapshot of a deep‑sea diver on a floating river dock at dawn, top‑to‑bottom visual flow, falling snowflakes, vivid gouache strokes
28
+ Documentary snapshot of an urban beekeeper inside a subterranean crystal cavern, elongated frame guiding eye upward, dramatic cloud backdrop, ultrawide cinematic lens
29
+ Cinematic still of a shipwright inside a shipyard dry dock, portrait orientation emphasizing height, wet pavement reflections, Baroque oil texture
30
+ Retro film photograph of a bonsai cultivator beneath aurora‑lit sky, vertical layout with leading lines, golden‑hour backlight, sepia film grain
31
+ Cinematic still of a watchmaker at a snow‑covered mountain pass, vertical layout with leading lines, dramatic cloud backdrop, Baroque oil texture
32
+ Dynamic action shot of an astronomer by a bioluminescent tide pool, top‑to‑bottom visual flow, falling snowflakes, sepia film grain
33
+ Dynamic action shot of an urban beekeeper inside a bustling food market, portrait orientation emphasizing height, falling snowflakes, isometric voxel aesthetic
34
+ Concept‑art illustration of a shipwright inside a vintage train carriage, portrait orientation emphasizing height, soft foreground bokeh, pastel watercolor wash
35
+ Dynamic action shot of a subway conductor inside a library tower filled with skylight, vertical layout with leading lines, volumetric mist, pastel watercolor wash
36
+ High‑fashion editorial of a mountain guide at a festival of paper lanterns, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
37
+ Dynamic action shot of a surfboard shaper inside a glass‑roofed greenhouse, vertical layout with leading lines, soft foreground bokeh, Baroque oil texture
38
+ Documentary snapshot of a percussionist beside a roaring waterfall, vertical layout with leading lines, rim lighting on subject, vivid gouache strokes
39
+ Candid photo of a train signal operator at a snow‑covered mountain pass, top‑to‑bottom visual flow, rim lighting on subject, sepia film grain
40
+ Concept‑art illustration of a VR game designer inside a vintage train carriage, vertical layout with leading lines, soft foreground bokeh, Baroque oil texture
41
+ Retro film photograph of an ice‑hotel architect inside a shipyard dry dock, elongated frame guiding eye upward, tilt‑shift miniaturization effect, lo‑fi pixel art
42
+ Concept‑art illustration of an urban beekeeper on a cliffside lighthouse balcony, top‑to‑bottom visual flow, falling snowflakes, infrared false color
43
+ Dynamic action shot of a bridge painter amid autumn maple leaves, elongated frame guiding eye upward, falling snowflakes, isometric voxel aesthetic
44
+ Concept‑art illustration of a calligraphy artist inside a glass‑roofed greenhouse, vertical layout with leading lines, soft foreground bokeh, photoreal 8K detail
45
+ Soft pastel painting of a street muralist inside a library tower filled with skylight, top‑to‑bottom visual flow, glittering dust in sunbeams, photoreal 8K detail
46
+ Hyperreal CGI render of a paramedic on a cliffside lighthouse balcony, vertical layout with leading lines, long‑exposure light trails, neon noir palette
47
+ Soft pastel painting of a wildlife rehabilitator in a misty bamboo grove, towering vertical composition, glittering dust in sunbeams, Baroque oil texture
48
+ Retro film photograph of a lighthouse keeper amid autumn maple leaves, portrait orientation emphasizing height, glittering dust in sunbeams, vivid gouache strokes
49
+ Dynamic action shot of a wildlife rehabilitator inside a watchmaker’s workshop, portrait orientation emphasizing height, rim lighting on subject, pastel watercolor wash
50
+ Cinematic still of a forensic analyst inside a vintage train carriage, towering vertical composition, long‑exposure light trails, lo‑fi pixel art
51
+ Hyperreal CGI render of a watchmaker on a rain‑drenched ferry deck, vertical layout with leading lines, glittering dust in sunbeams, lo‑fi pixel art
52
+ Concept‑art illustration of an urban farmer inside a bustling food market, vertical layout with leading lines, soft foreground bokeh, pastel watercolor wash
53
+ Soft pastel painting of a percussionist beside a roaring waterfall, towering vertical composition, falling snowflakes, isometric voxel aesthetic
54
+ Soft pastel painting of a robotics engineer by a bioluminescent tide pool, top‑to‑bottom visual flow, tilt‑shift miniaturization effect, photoreal 8K detail
55
+ Candid photo of a solar‑sail pilot on a rain‑slick neon street, elongated frame guiding eye upward, falling snowflakes, neon noir palette
56
+ Soft pastel painting of a watchmaker inside a watchmaker’s workshop, top‑to‑bottom visual flow, dramatic cloud backdrop, isometric voxel aesthetic
57
+ Retro film photograph of a data journalist by a bioluminescent tide pool, portrait orientation emphasizing height, soft foreground bokeh, sepia film grain
58
+ Candid photo of a forensic analyst perched atop a wind turbine at sunset, portrait orientation emphasizing height, volumetric mist, isometric voxel aesthetic
59
+ Documentary snapshot of an urban farmer inside a subterranean crystal cavern, towering vertical composition, soft foreground bokeh, pastel watercolor wash
60
+ High‑fashion editorial of a drone cinematographer inside a glass‑roofed greenhouse, towering vertical composition, soft foreground bokeh, pastel watercolor wash
61
+ High‑fashion editorial of a dune ecologist at a lunar research outpost, portrait orientation emphasizing height, falling snowflakes, photoreal 8K detail
62
+ Candid photo of a comic colorist by a bioluminescent tide pool, elongated frame guiding eye upward, falling snowflakes, photoreal 8K detail
63
+ Candid photo of a dune ecologist beneath aurora‑lit sky, top‑to‑bottom visual flow, volumetric mist, ultrawide cinematic lens
64
+ Dynamic action shot of a ballet dancer in a misty bamboo grove, portrait orientation emphasizing height, soft foreground bokeh, lo‑fi pixel art
65
+ Documentary snapshot of an urban farmer under blooming cherry trees, top‑to‑bottom visual flow, rim lighting on subject, vivid gouache strokes
66
+ Documentary snapshot of a storm chaser amid glowing jellyfish tanks, top‑to‑bottom visual flow, soft foreground bokeh, ultrawide cinematic lens
67
+ Documentary snapshot of a wind‑turbine technician at a lunar research outpost, towering vertical composition, long‑exposure light trails, sepia film grain
68
+ Hyperreal CGI render of a robotics engineer inside an abandoned observatory, towering vertical composition, wet pavement reflections, neon noir palette
69
+ Graphic novel panel showing a dune ecologist beneath aurora‑lit sky, towering vertical composition, rim lighting on subject, pastel watercolor wash
70
+ Soft pastel painting of a bridge painter inside a watchmaker’s workshop, elongated frame guiding eye upward, rim lighting on subject, photoreal 8K detail
71
+ Documentary snapshot of a drone cinematographer perched atop a wind turbine at sunset, towering vertical composition, soft foreground bokeh, Baroque oil texture
72
+ Dynamic action shot of a roller‑coaster mechanic inside a watchmaker’s workshop, elongated frame guiding eye upward, long‑exposure light trails, photoreal 8K detail
73
+ Dynamic action shot of a shipwright on a wooden pier in fog, top‑to‑bottom visual flow, long‑exposure light trails, pastel watercolor wash
74
+ Concept‑art illustration of a ballet dancer inside a watchmaker’s workshop, elongated frame guiding eye upward, dramatic cloud backdrop, Baroque oil texture
75
+ Dynamic action shot of a paramedic on a storm‑battered sea wall, top‑to‑bottom visual flow, rim lighting on subject, neon noir palette
76
+ Graphic novel panel showing a deep‑sea diver inside an abandoned observatory, elongated frame guiding eye upward, dramatic cloud backdrop, photoreal 8K detail
77
+ Concept‑art illustration of a forest firefighter perched atop a wind turbine at sunset, top‑to‑bottom visual flow, wet pavement reflections, Baroque oil texture
78
+ Graphic novel panel showing a VR game designer beside a roaring waterfall, portrait orientation emphasizing height, rim lighting on subject, ultrawide cinematic lens
79
+ Hyperreal CGI render of a bike messenger in a misty bamboo grove, elongated frame guiding eye upward, tilt‑shift miniaturization effect, neon noir palette
80
+ Dynamic action shot of a food‑truck chef on a rain‑slick neon street, elongated frame guiding eye upward, dramatic cloud backdrop, neon noir palette
81
+ Cinematic still of a deep‑sea diver beside a roaring waterfall, towering vertical composition, tilt‑shift miniaturization effect, ultrawide cinematic lens
82
+ Retro film photograph of a ice sculptor beneath aurora‑lit sky, top‑to‑bottom visual flow, soft foreground bokeh, photoreal 8K detail
83
+ Graphic novel panel showing a parkour athlete at a festival of paper lanterns, portrait orientation emphasizing height, volumetric mist, Baroque oil texture
84
+ Graphic novel panel showing a street muralist inside a glass‑roofed greenhouse, portrait orientation emphasizing height, soft foreground bokeh, neon noir palette
85
+ Hyperreal CGI render of a wildlife rehabilitator amid glowing jellyfish tanks, towering vertical composition, dramatic cloud backdrop, Baroque oil texture
86
+ Graphic novel panel showing a bike messenger at a lunar research outpost, top‑to‑bottom visual flow, dramatic cloud backdrop, sepia film grain
87
+ Candid photo of a kite maker within a desert dust storm, vertical layout with leading lines, long‑exposure light trails, vivid gouache strokes
88
+ Candid photo of an astronomer on a wooden pier in fog, towering vertical composition, falling snowflakes, ultrawide cinematic lens
89
+ Documentary snapshot of a comic colorist at a lunar research outpost, top‑to‑bottom visual flow, soft foreground bokeh, isometric voxel aesthetic
90
+ High‑fashion editorial of a portrait photographer in a misty bamboo grove, vertical layout with leading lines, golden‑hour backlight, lo‑fi pixel art
91
+ Dynamic action shot of an urban farmer in a misty bamboo grove, elongated frame guiding eye upward, glittering dust in sunbeams, infrared false color
92
+ Concept‑art illustration of a drone cinematographer beneath aurora‑lit sky, elongated frame guiding eye upward, golden‑hour backlight, ultrawide cinematic lens
93
+ Retro film photograph of a kite maker under blooming cherry trees, portrait orientation emphasizing height, tilt‑shift miniaturization effect, vivid gouache strokes
94
+ Soft pastel painting of a wildlife rehabilitator inside a watchmaker’s workshop, towering vertical composition, wet pavement reflections, sepia film grain
95
+ Hyperreal CGI render of an urban farmer inside an abandoned observatory, elongated frame guiding eye upward, long‑exposure light trails, lo‑fi pixel art
96
+ Dynamic action shot of a northern‑lights tour guide within a desert dust storm, elongated frame guiding eye upward, tilt‑shift miniaturization effect, Baroque oil texture
97
+ High‑fashion editorial of a northern‑lights tour guide inside an abandoned observatory, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
98
+ Retro film photograph of a subway conductor in a misty bamboo grove, towering vertical composition, volumetric mist, photoreal 8K detail
99
+ Retro film photograph of a storm chaser inside a watchmaker’s workshop, towering vertical composition, long‑exposure light trails, photoreal 8K detail
100
+ Hyperreal CGI render of an urban farmer on a wooden pier in fog, top‑to‑bottom visual flow, golden‑hour backlight, vivid gouache strokes
configs/captions/example_prompts2.txt ADDED
@@ -0,0 +1,100 @@
1
+ A celestial lantern village flickering between dimensions, anamorphic lens flare
2
+ A fractal sonar reef resonating with harmonic waves, sepia ink etching
3
+ An amber compass balanced atop an obsidian chalice beneath aurora-lit horizon, infrared false‑color
4
+ 'Vortex' displayed as a solar‑panel mosaic framed by erupting geysers, hyperdetailed microphotography
5
+ An amber compass encased within a meteorite slab atop cloud-piercing tower, minimalist negative space
6
+ A kinetic railway bridge aligned with lunar phases, chalk pastel texture
7
+ A cerulean origami crane hovering beside a quartz pedestal in moonlit desert ruin, dreamlike soft focus
8
+ A levitating clock tower flooded by molten stardust, brutalist geometry emphasis
9
+ 'Vortex' displayed as a floating floral typography framed by erupting geysers, lo‑fi film grain
10
+ A labyrinthine mirror lake composed of levitating stones, anamorphic lens flare
11
+ A retro‑futuristic tea pavilion guarded by luminous koi, isometric voxel render
12
+ 'Halcyon' displayed as an interactive LED canopy projected onto ancient ziggurat, Art‑Nouveau outlines
13
+ A titanium feather spiraling around a mahogany music box beneath aurora-lit horizon, neon noir palette
14
+ An infrared railway bridge composed of levitating stones, vector flat design
15
+ A volcanic nocturnal carnival etched with forgotten runes, ultrawide panoramic frame
16
+ 'Catalyst' displayed as a crystal fiber optic sign reflected on tranquil rice paddies, dreamlike soft focus
17
+ 'Radiance' displayed as a drone‑light hologram projected onto ancient ziggurat, tilt‑shift miniature look
18
+ A fractal sky monastery scattered across hovering islands, photoreal volumetric lighting
19
+ A vaporous coral metropolis embedded in frozen time, vivid watercolor bloom
20
+ A submerged sonar reef flooded by molten stardust, brutalist geometry emphasis
21
+ A surreal voltage garden singing in ultrasonic tones, hyperdetailed microphotography
22
+ A vaporous coral metropolis drifting through twilight mist, tilt‑shift miniature look
23
+ An emerald hourglass encased within an ancient parchment scroll in moonlit desert ruin, anamorphic lens flare
24
+ A frozen voltage garden guarded by luminous koi, anamorphic lens flare
25
+ 'Radiance' displayed as a geothermal steam stencil above polar ice floes at dusk, brutalist geometry emphasis
26
+ An infrared subterranean amphitheater framed by aurora curtains, chalk pastel texture
27
+ A labyrinthine railway bridge woven from crystalline filaments, chalk pastel texture
28
+ A porcelain mask spiraling around a meteorite slab beneath aurora-lit horizon, CGI ray‑traced caustics
29
+ 'Obsidian' displayed as a drone‑light hologram amid drifting balloon lanterns, tilt‑shift miniature look
30
+ A phosphorescent desert oasis anchored in swirling vortexes, hyperdetailed microphotography
31
+ An opalescent sky monastery resonating with harmonic waves, hyperdetailed microphotography
32
+ A volcanic bamboo labyrinth anchored in swirling vortexes, brutalist geometry emphasis
33
+ A volcanic lighthouse isle anchored in swirling vortexes, 8‑bit pixel aesthetic
34
+ An amber compass spiraling around a hollow crystal sphere beside luminescent tide pool, dreamlike soft focus
35
+ 'Cipher' displayed as a wave‑pattern sand relief hovering above midnight ocean, Baroque oil technique
36
+ An echoing coral metropolis flickering between dimensions, brutalist geometry emphasis
37
+ A mesmerizing mirror lake powered by clockwork tides, isometric voxel render
38
+ 'Euphoria' displayed as a solar‑panel mosaic hovering above midnight ocean, brutalist geometry emphasis
39
+ A translucent railway bridge carved into a meteor fragment, anamorphic lens flare
40
+ A tessellated nocturnal carnival guarded by luminous koi, hyperdetailed microphotography
41
+ A retro‑futuristic subterranean amphitheater carved into a meteor fragment, minimalist negative space
42
+ A verdant ice palace etched with forgotten runes, Baroque oil technique
43
+ A sonic glacier cave spiraling into infinite recursion, Baroque oil technique
44
+ A vaporous arboretum anchored in swirling vortexes, isometric voxel render
45
+ A polychromatic quantum workshop traversed by whispering drones, anamorphic lens flare
46
+ A submerged tidal library wrapped in fractal snowflakes, vivid watercolor bloom
47
+ A luminous rainbow waterfall grown from magnetic vines, Art‑Nouveau outlines
48
+ A holographic sonar reef flickering between dimensions, Art‑Nouveau outlines
49
+ A levitating tidal library traversed by whispering drones, sepia ink etching
50
+ A surreal lighthouse isle traversed by whispering drones, sepia ink etching
51
+ An ascending sandstone fortress embedded in frozen time, dreamlike soft focus
52
+ A levitating observatory flooded by molten stardust, sepia ink etching
53
+ A polychromatic storm laboratory guarded by luminous koi, tilt‑shift miniature look
54
+ A levitating arboretum etched with forgotten runes, infrared false‑color
55
+ 'Luminescence' displayed as an interactive LED canopy amid drifting balloon lanterns, sepia ink etching
56
+ An infrared lantern village orbiting a miniature sun, Art‑Nouveau outlines
57
+ An echoing nocturnal carnival grown from magnetic vines, Art‑Nouveau outlines
58
+ An origami tea pavilion framed by aurora curtains, chalk pastel texture
59
+ 'Vortex' displayed as a geothermal steam stencil beneath swirling cyclone clouds, chalk pastel texture
60
+ A mechanized observatory resonating with harmonic waves, dreamlike soft focus
61
+ A polychromatic nocturnal carnival framed by aurora curtains, ultrawide panoramic frame
62
+ A mesmerizing gravity well plaza framed by aurora curtains, chalk pastel texture
63
+ A cryptic glacier cave resonating with harmonic waves, hyperdetailed microphotography
64
+ 'Entropy' displayed as an interactive LED canopy suspended between coral spires, vivid watercolor bloom
65
+ A celestial astral caravan carved into a meteor fragment, anamorphic lens flare
66
+ An infrared nocturnal carnival submerged beneath glassy waves, sepia ink etching
67
+ A whimsical coral metropolis scattered across hovering islands, vivid watercolor bloom
68
+ A silver key spiraling around a quartz pedestal beside luminescent tide pool, vivid watercolor bloom
69
+ An emerald hourglass illuminating a woven silk tapestry in moonlit desert ruin, CGI ray‑traced caustics
70
+ A levitating ice palace carved into a meteor fragment, ultrawide panoramic frame
71
+ 'Cipher' displayed as a wave‑pattern sand relief hovering above midnight ocean, lo‑fi film grain
72
+ A tessellated asteroid mine suspended above rolling thunderheads, isometric voxel render
73
+ A zero‑gravity tidal library anchored in swirling vortexes, neon noir palette
74
+ A submerged data cathedral grown from magnetic vines, dreamlike soft focus
75
+ A retro‑futuristic coral metropolis illuminated by bioluminescent spores, vivid watercolor bloom
76
+ A mechanized tea pavilion composed of levitating stones, photoreal volumetric lighting
77
+ A levitating tea pavilion spiraling into infinite recursion, vector flat design
78
+ A polychromatic voltage garden grown from magnetic vines, infrared false‑color
79
+ A tessellated sandstone fortress composed of levitating stones, Baroque oil technique
80
+ 'Cascade' displayed as a molten metal casting suspended between coral spires, minimalist negative space
81
+ A vaporous nocturnal carnival embedded in frozen time, vector flat design
82
+ 'Luminescence' displayed as a molten metal casting above polar ice floes at dusk, sepia ink etching
83
+ An origami moss temple drifting through twilight mist, Baroque oil technique
84
+ 'Cascade' displayed as a wave‑pattern sand relief suspended between coral spires, 8‑bit pixel aesthetic
85
+ A phosphorescent gravity well plaza echoing with distant chimes, chalk pastel texture
86
+ A tessellated data cathedral guarded by luminous koi, vector flat design
87
+ A cryptic subterranean amphitheater aligned with lunar phases, vector flat design
88
+ A submerged mirror lake singing in ultrasonic tones, chalk pastel texture
89
+ A levitating arboretum embedded in frozen time, photoreal volumetric lighting
90
+ A polychromatic bamboo labyrinth drifting through twilight mist, tilt‑shift miniature look
91
+ 'Elysium' displayed as a crystal fiber optic sign beneath swirling cyclone clouds, ultrawide panoramic frame
92
+ 'Nebulous' displayed as a drone‑light hologram hovering above midnight ocean, brutalist geometry emphasis
93
+ A whimsical data cathedral framed by aurora curtains, photoreal volumetric lighting
94
+ A holographic bamboo labyrinth composed of levitating stones, neon noir palette
95
+ A cryptic tea pavilion singing in ultrasonic tones, minimalist negative space
96
+ A verdant glacier cave suspended above rolling thunderheads, CGI ray‑traced caustics
97
+ A levitating sonar reef submerged beneath glassy waves, brutalist geometry emphasis
98
+ 'Entropy' displayed as a wave‑pattern sand relief reflected on tranquil rice paddies, CGI ray‑traced caustics
99
+ 'Catalyst' displayed as a crystal fiber optic sign beneath swirling cyclone clouds, infrared false‑color
100
+ A silver key encased within a glacial mirror shard within silent subterranean hall, Art‑Nouveau outlines
configs/captions/example_prompts3.txt ADDED
@@ -0,0 +1,100 @@
1
+ A subway station covered in bioluminescent moss, watercolor illustration, with reflective wet pavement
2
+ A lone cherry blossom tree growing on a floating island, ultrawide cinematic frame, bathed in golden hour light
3
+ A paper sailboat navigating waves of handwritten letters, matte‑painting concept art, captured in long‑exposure motion blur
4
+ A desert caravan beneath a sky filled with iridescent balloons, watercolor illustration, in Art‑Nouveau line work
5
+ A lighthouse emitting rainbow beams into coastal fog, watercolor illustration, bathed in golden hour light
6
+ A futuristic monorail gliding above verdant rice terraces, neon synthwave palette, in Art‑Nouveau line work
7
+ A futuristic monorail gliding above verdant rice terraces, matte‑painting concept art, bathed in golden hour light
8
+ A spiraling staircase made of glowing origami cranes, photoreal 8K render, framed by towering archways
9
+ A cascade of cosmic paint pouring from an open window, watercolor illustration, under a crescent moon
10
+ A pair of vintage typewriters merging into a butterfly, photoreal 8K render, under a crescent moon
11
+ A lighthouse emitting rainbow beams into coastal fog, chalk pastel drawing, shot with tilt‑shift focus
12
+ A cascade of cosmic paint pouring from an open window, photoreal 8K render, captured in long‑exposure motion blur
13
+ A futuristic monorail gliding above verdant rice terraces, ultrawide cinematic frame, under a crescent moon
14
+ A pair of vintage typewriters merging into a butterfly, watercolor illustration, under a crescent moon
15
+ A spiraling staircase made of glowing origami cranes, photoreal 8K render, with volumetric fog
16
+ An owl composed entirely of clock gears and cogs, ultrawide cinematic frame, shot with tilt‑shift focus
17
+ A glass teapot swirling with blooming jasmine tea, hyperreal CGI composite, with reflective wet pavement
18
+ A transparent cube containing a miniature thunderstorm, chalk pastel drawing, captured in long‑exposure motion blur
19
+ A lone cherry blossom tree growing on a floating island, isometric pixel art, framed by towering archways
20
+ A violin carved from shimmering ice, ultrawide cinematic frame, with reflective wet pavement
21
+ A lighthouse emitting rainbow beams into coastal fog, watercolor illustration, captured in long‑exposure motion blur
22
+ A bonsai tree shaped like a twisting dragon, hyperreal CGI composite, bathed in golden hour light
23
+ A crystal castle perched on a cliff of amethyst, hyperreal CGI composite, under a crescent moon
24
+ A paper sailboat navigating waves of handwritten letters, neon synthwave palette, framed by towering archways
25
+ A cascade of cosmic paint pouring from an open window, hyperreal CGI composite, using dramatic rim lighting
26
+ A lone cherry blossom tree growing on a floating island, sepia ink sketch, shot with tilt‑shift focus
27
+ A starlit library built inside a hollowed redwood, isometric pixel art, with volumetric fog
28
+ A subway station covered in bioluminescent moss, low‑poly 3‑D art, using dramatic rim lighting
29
+ A subway station covered in bioluminescent moss, sepia ink sketch, captured in long‑exposure motion blur
30
+ An owl composed entirely of clock gears and cogs, photoreal 8K render, shot with tilt‑shift focus
31
+ A spiraling staircase made of glowing origami cranes, ultrawide cinematic frame, captured in long‑exposure motion blur
32
+ A subway station covered in bioluminescent moss, ultrawide cinematic frame, captured in long‑exposure motion blur
33
+ An owl composed entirely of clock gears and cogs, ultrawide cinematic frame, in Art‑Nouveau line work
34
+ A crystal castle perched on a cliff of amethyst, ultrawide cinematic frame, under a crescent moon
35
+ A starlit library built inside a hollowed redwood, hyperreal CGI composite, bathed in golden hour light
36
+ A cascade of cosmic paint pouring from an open window, hyperreal CGI composite, in Art‑Nouveau line work
37
+ An antique compass resting on weathered parchment, neon synthwave palette, under a crescent moon
38
+ A pair of vintage typewriters merging into a butterfly, neon synthwave palette, surrounded by soft bokeh
39
+ A violin carved from shimmering ice, hyperreal CGI composite, surrounded by soft bokeh
40
+ A transparent cube containing a miniature thunderstorm, ultrawide cinematic frame, bathed in golden hour light
41
+ A futuristic monorail gliding above verdant rice terraces, hyperreal CGI composite, in Art‑Nouveau line work
42
+ A pair of vintage typewriters merging into a butterfly, low‑poly 3‑D art, surrounded by soft bokeh
43
+ A desert caravan beneath a sky filled with iridescent balloons, isometric pixel art, using dramatic rim lighting
44
+ An owl composed entirely of clock gears and cogs, isometric pixel art, shot with tilt‑shift focus
45
+ An owl composed entirely of clock gears and cogs, matte‑painting concept art, under a crescent moon
46
+ A spiraling staircase made of glowing origami cranes, chalk pastel drawing, surrounded by soft bokeh
47
+ A glass teapot swirling with blooming jasmine tea, ultrawide cinematic frame, surrounded by soft bokeh
48
+ A desert caravan beneath a sky filled with iridescent balloons, matte‑painting concept art, shot with tilt‑shift focus
49
+ A pocket watch melting over the edge of a marble pedestal, ultrawide cinematic frame, under a crescent moon
50
+ A serene koi pond reflecting autumn maple leaves, isometric pixel art, framed by towering archways
51
+ A paper sailboat navigating waves of handwritten letters, isometric pixel art, framed by towering archways
52
+ A fox wearing a patchwork cloak of autumn leaves, isometric pixel art, using dramatic rim lighting
53
+ A glass teapot swirling with blooming jasmine tea, low‑poly 3‑D art, using dramatic rim lighting
54
+ A cascade of cosmic paint pouring from an open window, chalk pastel drawing, captured in long‑exposure motion blur
55
+ A bonsai tree shaped like a twisting dragon, hyperreal CGI composite, captured in long‑exposure motion blur
56
+ A bonsai tree shaped like a twisting dragon, isometric pixel art, shot with tilt‑shift focus
57
+ A futuristic monorail gliding above verdant rice terraces, neon synthwave palette, captured in long‑exposure motion blur
58
+ An owl composed entirely of clock gears and cogs, sepia ink sketch, with volumetric fog
59
+ A crystal castle perched on a cliff of amethyst, neon synthwave palette, surrounded by soft bokeh
60
+ A fox wearing a patchwork cloak of autumn leaves, low‑poly 3‑D art, framed by towering archways
61
+ A glass teapot swirling with blooming jasmine tea, watercolor illustration, surrounded by soft bokeh
62
+ A transparent cube containing a miniature thunderstorm, isometric pixel art, captured in long‑exposure motion blur
63
+ An antique compass resting on weathered parchment, photoreal 8K render, captured in long‑exposure motion blur
64
+ A futuristic monorail gliding above verdant rice terraces, hyperreal CGI composite, captured in long‑exposure motion blur
65
+ A crystal castle perched on a cliff of amethyst, sepia ink sketch, under a crescent moon
66
+ A transparent cube containing a miniature thunderstorm, hyperreal CGI composite, using dramatic rim lighting
67
+ A pair of vintage typewriters merging into a butterfly, photoreal 8K render, in Art‑Nouveau line work
68
+ A subway station covered in bioluminescent moss, hyperreal CGI composite, in Art‑Nouveau line work
69
+ A starlit library built inside a hollowed redwood, ultrawide cinematic frame, using dramatic rim lighting
70
+ A crystal castle perched on a cliff of amethyst, neon synthwave palette, using dramatic rim lighting
71
+ 'Solstice' rendered as a neon sign suspended between skyscrapers, golden glow
72
+ 'Eclipse' rendered as a drone light show emerging from city fog, golden glow
73
+ 'Solstice' rendered as a drone light show hovering above rolling clouds, golden glow
74
+ 'Harmony' rendered as a chalkboard typography projected onto canyon walls, golden glow
75
+ 'Quantum' rendered as a chalkboard typography over a misty harbor, golden glow
76
+ 'Harmony' rendered as an LED billboard against a star‑filled night sky, golden glow
77
+ 'Harmony' rendered as a drone light show projected onto canyon walls, golden glow
78
+ 'Solstice' rendered as a drone light show suspended between skyscrapers, golden glow
79
+ 'Harmony' rendered as an LED billboard against a star‑filled night sky, golden glow
80
+ 'Quantum' rendered as an ice sculpture headline framed by snow‑capped peaks, golden glow
81
+ 'Eclipse' rendered as a sand‑dune calligraphy framed by snow‑capped peaks, golden glow
82
+ 'Solstice' rendered as an LED billboard against a star‑filled night sky, golden glow
83
+ 'Quantum' rendered as an LED billboard reflected on a tranquil lake, golden glow
84
+ 'Harmony' rendered as a floral arrangement over a misty harbor, golden glow
85
+ 'Zenith' rendered as an ice sculpture headline reflected on a tranquil lake, golden glow
86
+ 'Serenity' rendered as a neon sign over a misty harbor, golden glow
87
+ 'Eclipse' rendered as an LED billboard reflected on a tranquil lake, golden glow
88
+ 'Momentum' rendered as a neon sign hovering above rolling clouds, golden glow
89
+ 'Quantum' rendered as a drone light show suspended between skyscrapers, golden glow
90
+ 'Zenith' rendered as an ice sculpture headline hovering above rolling clouds, golden glow
91
+ A red apple on a blue table next to a glass of water, low‑poly 3‑D art
92
+ Two identical robots playing chess under a lantern, low‑poly 3‑D art
93
+ A green hummingbird hovering above a purple tulip inside a snow globe, hyperreal CGI composite
94
+ A stack of three books with a lit candle on top beside an hourglass, neon synthwave palette
95
+ A silver teapot pouring tea into a floating porcelain cup, sepia ink sketch
96
+ A tiny astronaut standing on a giant sunflower facing the sunrise, photoreal 8K render
97
+ A cat sleeping under a transparent umbrella in gentle rain, chalk pastel drawing
98
+ A bicycle leaning against a graffiti‑covered wall under string lights, chalk pastel drawing
99
+ An origami crane flying over a steaming cup of coffee on a saucer, chalk pastel drawing
100
+ A vintage camera resting on a map with scattered film negatives, low‑poly 3‑D art
configs/captions/example_prompts4.txt ADDED
@@ -0,0 +1,800 @@
1
+ A bioluminescent rainforest at night, viewed from a canopy walkway, hyper-real, crisp moonlight filtering through mist
2
+ Cross-section of an imaginary geode revealing swirling nebula-like mineral layers, macro photography style
3
+ Futuristic library carved into a glacier, warm interior lighting contrasting icy blue walls, isometric view
4
+ Surreal desert with floating sandstone monoliths casting long shadows at golden hour, ultra-wide lens
5
+ Vintage watercolor map of an archipelago shaped like musical notes, illustrated cartography
6
+ Cyberpunk alley drenched in neon rain, reflective puddles, no characters, cinematic atmosphere
7
+ Close-up of a hummingbird made of fractal glass shards hovering near a sapphire flower, 8K detail
8
+ Orbiting observatory above a gas-giant planet, rings stretching across star-filled sky, photoreal
9
+ Abstract kinetic sculpture of twisting ribbons suspended in a white cube gallery, studio lighting
10
+ Fog-covered pine forest with a single crimson tree in the center, muted color palette
11
+ Time-lapse style composite of a tidal pool from dawn to dusk, stitched into one frame
12
+ Isometric diagram of an autonomous greenhouse on Mars, annotated schematics aesthetic
13
+ Paper-cut illustration of a city inside a whale, layered depth, soft muted tones
14
+ Steampunk airship port at sunrise, brass machinery glinting, painterly brushwork
15
+ Minimalist ink wash painting of a solitary mountain peak emerging from clouds
16
+ Ultraviolet microscope image of an invented pollen grain with crystalline spikes
17
+ Retro 8-bit pixel art scene of a cozy lakeside cabin under meteor shower
18
+ Low-poly 3-D render of a coral reef teeming with geometric fish shapes
19
+ Aerial view of a terraced rice field arranged in a perfect Fibonacci spiral
20
+ Schematic cutaway of a clockwork heart pumping luminous liquid, technical drawing style
21
+ Long-exposure night photograph of fireflies tracing mathematical Lissajous curves
22
+ Gothic cathedral interior built entirely from translucent ice, soft subsurface scattering
23
+ Top-down macro of latte foam forming a fractal coastline pattern
24
+ Astronomical illustration of a triple-sunset over an ocean on an exoplanet
25
+ Ink-on-parchment concept art of a floating pagoda tethered by chains to mountain peaks
26
+ Cubist still life of fruit and musical instruments, vivid complementary colors
27
+ Moody black-and-white film photograph of rain on a lonely train platform, 1950s era
28
+ Hyperreal chrome koi fish swimming through clouds, sky as water
29
+ Floral mandala assembled from autumn leaves, top-down symmetric composition
30
+ Concept art of an underground crystal cavern illuminated by bioluminescent fungi
31
+ Sci-fi control room with holographic interfaces projected into fog, teal-orange palette
32
+ Minimal claymation style landscape with rolling pastel hills and giant daisies
33
+ Polaroid aesthetic photo of a roadside diner at twilight, neon sign flickering
34
+ Vector infographic showing the life cycle of a fictional winged seed, flat design
35
+ Dream-like seascape where waves morph into galloping horses, double-exposure effect
36
+ Art-deco poster of an interstellar passenger train speeding past moons
37
+ Cross-section illustration of a layered cake that resembles planetary strata
38
+ Infrared photograph of a mangrove swamp, foliage appearing white, water inky black
39
+ Whimsical pencil sketch of a tea party with levitating porcelain, soft shading
40
+ Architectural render of a zero-gravity museum with exhibits floating mid-air
41
+ Oil painting of a stormy sky splitting into vortices shaped like musical clefs
42
+ Isometric cutaway of an underground dwarf forge with molten rivers, game concept art
43
+ Frosted glass terrarium containing a miniature thunderstorm, studio backdrop
44
+ Minimalist cyanotype print of fern leaves arranged in a golden ratio spiral
45
+ Fantasy moonlit waterfall cascading upward into the sky, long-exposure feel
46
+ Retro-futuristic poster of a solar-powered desert rover kicking up red dust
47
+ Double helix made of blooming flowers against a white background, high-key macro
48
+ Top-down shot of a labyrinth garden trimmed into Escher-like impossible geometry
49
+ Sci-fi vending machine selling bottled starlight, hologram price tags
50
+ Watercolor portrait of an abstract humanoid with translucent skin revealing galaxies
51
+ Silhouette of a lone tree on an island reflected perfectly in still water, dusk gradient
52
+ Close-up macro of snowflakes arranged to form a Mandelbrot set
53
+ Ink drawing of a koi pond where fish tails morph into swirling calligraphy strokes
54
+ Hyperreal food photography of a floating stack of pancakes with gravity-defying syrup
55
+ Electroluminescent circuit board cityscape at night, streets as glowing traces
56
+ Surreal scene of books sprouting wings and migrating across a sunset sky
57
+ Low-angle view of a colossal sandstone arch framing a star-filled Milky Way
58
+ Cross-section of a mechanical sunflower tracking a miniature artificial sun
59
+ Art-nouveau travel poster for an imaginary cloud kingdom, flowing line art
60
+ Graph-paper style blueprint of a perpetual-motion water wheel, annotated
61
+ Futuristic zen garden with levitating raked sand and floating bonsai stones
62
+ Photoreal underwater city with glass domes linked by glowing tunnels
63
+ Tilt-shift photo of a festival lantern parade through narrow cobblestone streets
64
+ Neon wireframe landscape reminiscent of 1980s synthwave, grid fading to horizon
65
+ Paper-quilling style illustration of a comet bursting into colorful spirals
66
+ Panorama of a crimson aurora over icy mountains, ultra-wide 16:9 aspect
67
+ Transparent holographic chess set floating in zero-gravity, pieces mid-game
68
+ Pointillist painting of a bustling open-air market under summer sun
69
+ Infrared thermal view of a volcanic eruption, palette mapped to rainbow hues
70
+ Detail shot of clock gears where each tooth is a tiny stairway with lanterns
71
+ Minimal line-art poster depicting the evolution of flight from feathers to starships
72
+ Glowing jellyfish drifting through a misty pine forest at dawn, photoreal composite
73
+ Art-studio workbench cluttered with vintage robotics schematics and metal parts
74
+ Monochrome charcoal drawing of a lighthouse beam piercing heavy fog
75
+ Isometric voxel art of a floating garden island with waterfalls spilling into void
76
+ Surreal split-scene: left half winter forest, right half summer meadow, seamless blend
77
+ Retro postage stamp design celebrating a fictional eclipse festival
78
+ Hyperdetailed ceramic mosaic of a phoenix rising, mediterranean style
79
+ Sci-fi medical lab growing crystalline plants in suspended nutrient orbs
80
+ High-speed photo of colored ink clouds colliding underwater, symmetrical composition
81
+ Anamorphic street art illusion of a chasm opening in a city square
82
+ Timber-frame hobbit-style cottage under giant sunflowers, golden afternoon
83
+ Futuristic monorail weaving through skyscrapers wrapped in vertical gardens
84
+ Scientific render of a transparent hypercube containing swirling plasma
85
+ Sepia photograph of an abandoned observatory overtaken by vines
86
+ Concept piece: biomechanical dragon skeleton displayed in a museum hall
87
+ Minimal gradient poster of a single droplet rippling concentric neon rings
88
+ Chalkboard schematic showing stages of a do-it-yourself constellation projector
89
+ Digital glitch art of a city skyline melting into cascading pixels
90
+ Aerial drone shot of rice paddies shaped like circuitry pathways
91
+ Macro of soap film displaying shifting rainbow interference patterns
92
+ Oil-on-canvas seascape where waves are brush strokes of pure geometry
93
+ Tilted perspective of a spiral staircase made entirely of stained glass
94
+ Hyperreal 3-D render of a desert mirage city shimmering above dunes
95
+ Vectorized infographic of wind turbine anatomy with exploded components
96
+ Snow-covered bamboo forest under lantern light, gentle falling flakes
97
+ Abstract generative art of golden particles forming a torus knot in black void
98
+ Stop-motion clay diorama of a miniature volcano erupting sprinkles
99
+ Ultrawide cinematic shot of two converging thunderstorms over open ocean
100
+ Graphite sketch of intertwined river deltas resembling tree roots, top-down view
101
+ Design an hourglass where sand forms miniature mountains in vector flat design style
102
+ Design an hourglass where sand forms miniature mountains in low-poly 3‑D model style
103
+ Depict a mountain range shaped like sleeping giants in vibrant watercolor style
104
+ Observe a futuristic city built on towering waterfalls in steampunk-inspired blueprint style
105
+ Create a desert of shattered stained glass dunes in neon-lit synthwave illustration style
106
+ Envision a forest whose trees emit soft neon pulses in minimalist ink sketch style
107
+ Depict a desert of shattered stained glass dunes in baroque-style oil painting style
108
+ Render a spiral staircase carved from moonlight in baroque-style oil painting style
109
+ Depict a surreal corridor of mirrors reflecting infinite galaxies in digital matte painting style
110
+ Render an hourglass where sand forms miniature mountains in digital matte painting style
111
+ Design a violin constructed of flowing water in steampunk-inspired blueprint style
112
+ Compose a futuristic city built on towering waterfalls in minimalist ink sketch style
113
+ Depict a desert of shattered stained glass dunes in digital matte painting style
114
+ Create a violin constructed of flowing water in minimalist ink sketch style
115
+ Compose a forest whose trees emit soft neon pulses in neon-lit synthwave illustration style
116
+ Observe a violin constructed of flowing water in steampunk-inspired blueprint style
117
+ Observe an antique compass floating above stormy seas in minimalist ink sketch style
118
+ Create a desert of shattered stained glass dunes in photorealistic concept art style
119
+ Design a forest whose trees emit soft neon pulses in neon-lit synthwave illustration style
120
+ Envision an antique compass floating above stormy seas in low-poly 3‑D model style
121
+ Picture a surreal corridor of mirrors reflecting infinite galaxies in low-poly 3‑D model style
122
+ Depict a mountain range shaped like sleeping giants in digital matte painting style
123
+ Visualize a violin constructed of flowing water in photorealistic concept art style
124
+ Visualize a violin constructed of flowing water in vibrant watercolor style
125
+ Create an hourglass where sand forms miniature mountains in vector flat design style
126
+ Render a mountain range shaped like sleeping giants in minimalist ink sketch style
127
+ Render a mountain range shaped like sleeping giants in vector flat design style
128
+ Envision a library whose shelves orbit a glowing star in photorealistic concept art style
129
+ Picture a futuristic city built on towering waterfalls in vibrant watercolor style
130
+ Create a library whose shelves orbit a glowing star in minimalist ink sketch style
131
+ Depict a forest whose trees emit soft neon pulses in minimalist ink sketch style
132
+ Envision a futuristic city built on towering waterfalls in digital matte painting style
133
+ Envision a violin constructed of flowing water in steampunk-inspired blueprint style
134
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in hyperreal CGI render style
135
+ Imagine a forest whose trees emit soft neon pulses in vector flat design style
136
+ Depict a spiral staircase carved from moonlight in neon-lit synthwave illustration style
137
+ Picture a library whose shelves orbit a glowing star in minimalist ink sketch style
138
+ Compose an hourglass where sand forms miniature mountains in minimalist ink sketch style
139
+ Visualize a forest whose trees emit soft neon pulses in vibrant watercolor style
140
+ Picture a violin constructed of flowing water in minimalist ink sketch style
141
+ Depict a spiral staircase carved from moonlight in hyperreal CGI render style
142
+ Visualize a spiral staircase carved from moonlight in low-poly 3‑D model style
143
+ Picture a futuristic city built on towering waterfalls in vector flat design style
144
+ Picture a spiral staircase carved from moonlight in low-poly 3‑D model style
145
+ Picture a spiral staircase carved from moonlight in hyperreal CGI render style
146
+ Visualize a spiral staircase carved from moonlight in steampunk-inspired blueprint style
147
+ Observe a surreal corridor of mirrors reflecting infinite galaxies in low-poly 3‑D model style
148
+ Envision a surreal corridor of mirrors reflecting infinite galaxies in hyperreal CGI render style
149
+ Depict a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
150
+ Imagine a desert of shattered stained glass dunes in low-poly 3‑D model style
151
+ Envision a library whose shelves orbit a glowing star in vibrant watercolor style
152
+ Compose an antique compass floating above stormy seas in low-poly 3‑D model style
153
+ Design an antique compass floating above stormy seas in neon-lit synthwave illustration style
154
+ Design a forest whose trees emit soft neon pulses in baroque-style oil painting style
155
+ Design a surreal corridor of mirrors reflecting infinite galaxies in photorealistic concept art style
156
+ Envision a futuristic city built on towering waterfalls in hyperreal CGI render style
157
+ Render a forest whose trees emit soft neon pulses in digital matte painting style
158
+ Design an antique compass floating above stormy seas in vector flat design style
159
+ Compose a desert of shattered stained glass dunes in vibrant watercolor style
160
+ Design an hourglass where sand forms miniature mountains in minimalist ink sketch style
161
+ Imagine a library whose shelves orbit a glowing star in baroque-style oil painting style
162
+ Compose a spiral staircase carved from moonlight in vibrant watercolor style
163
+ Compose a desert of shattered stained glass dunes in minimalist ink sketch style
164
+ Compose a library whose shelves orbit a glowing star in digital matte painting style
165
+ Render a library whose shelves orbit a glowing star in vibrant watercolor style
166
+ Envision a violin constructed of flowing water in steampunk-inspired blueprint style
167
+ Create an antique compass floating above stormy seas in digital matte painting style
168
+ Create a surreal corridor of mirrors reflecting infinite galaxies in vector flat design style
169
+ Observe a mountain range shaped like sleeping giants in vector flat design style
170
+ Depict a library whose shelves orbit a glowing star in hyperreal CGI render style
171
+ Compose a violin constructed of flowing water in photorealistic concept art style
172
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in neon-lit synthwave illustration style
173
+ Visualize a violin constructed of flowing water in baroque-style oil painting style
174
+ Picture a futuristic city built on towering waterfalls in hyperreal CGI render style
175
+ Design a desert of shattered stained glass dunes in hyperreal CGI render style
176
+ Imagine an hourglass where sand forms miniature mountains in vibrant watercolor style
177
+ Visualize a futuristic city built on towering waterfalls in digital matte painting style
178
+ Visualize a violin constructed of flowing water in photorealistic concept art style
179
+ Observe a futuristic city built on towering waterfalls in hyperreal CGI render style
180
+ Create a mountain range shaped like sleeping giants in vibrant watercolor style
181
+ Visualize a violin constructed of flowing water in digital matte painting style
182
+ Design a futuristic city built on towering waterfalls in digital matte painting style
183
+ Depict a forest whose trees emit soft neon pulses in digital matte painting style
184
+ Design an hourglass where sand forms miniature mountains in photorealistic concept art style
185
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in photorealistic concept art style
186
+ Picture a mountain range shaped like sleeping giants in vibrant watercolor style
187
+ Compose a futuristic city built on towering waterfalls in vibrant watercolor style
188
+ Depict a mountain range shaped like sleeping giants in hyperreal CGI render style
189
+ Envision a violin constructed of flowing water in photorealistic concept art style
190
+ Imagine a desert of shattered stained glass dunes in steampunk-inspired blueprint style
191
+ Compose a library whose shelves orbit a glowing star in neon-lit synthwave illustration style
192
+ Render a desert of shattered stained glass dunes in low-poly 3‑D model style
193
+ Imagine a futuristic city built on towering waterfalls in steampunk-inspired blueprint style
194
+ Picture a futuristic city built on towering waterfalls in minimalist ink sketch style
195
+ Imagine a violin constructed of flowing water in steampunk-inspired blueprint style
196
+ Render a mountain range shaped like sleeping giants in baroque-style oil painting style
197
+ Envision a futuristic city built on towering waterfalls in photorealistic concept art style
198
+ Observe a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
199
+ Depict a mountain range shaped like sleeping giants in vibrant watercolor style
200
+ Observe a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
201
+ Golden hour photograph of a red fox stepping cautiously through dewy meadow grass
202
+ Macro shot of morning frost crystallizing on fern fronds in temperate rainforest
203
+ Underwater wide‑angle photo of a sea turtle gliding above a coral reef teeming with anthias
204
+ High‑altitude drone capture of alpine lake mirrors jagged snow‑capped peaks and drifting clouds
205
+ Slow‑shutter waterfall scene where silky water cascades over mossy basalt boulders in Icelandic gorge
206
+ Cinematic backlit portrait of a barn owl perched on ancient oak branch amid floating dust motes
207
+ Time‑lapse composite of Milky Way arcing above blooming lavender fields in Provence
208
+ Split‑level photo of mangrove roots below waterline and sunset‑lit shoreline above
209
+ Close‑up of honeybee collecting pollen from vibrant sunflower, pollen grains visible on legs
210
+ Foggy morning panorama of rolling hills covered in tea plantations with workers in colorful attire
211
+ Crystal‑clear lakebed photograph revealing patterned stones beneath undisturbed surface reflections
212
+ High‑speed capture of kingfisher diving, water droplets frozen mid‑air around electric‑blue feathers
213
+ Infrared landscape where leafy canopy glows white against deep charcoal sky over calm river
214
+ Portrait of snow leopard resting on sun‑warmed granite ledge, whiskers sharply detailed
215
+ Aerial view of winding river carving emerald wetlands into fractal branching shapes
216
+ Cave photograph of bioluminescent glowworms illuminating stalactites like starry constellations
217
+ Golden backlight silhouette of wild horses galloping across dust‑filled prairie at dusk
218
+ Macro image of dragonfly wings showing iridescent lattice structure against soft bokeh background
219
+ Long‑exposure night photo of fireflies painting light trails over forest clearing
220
+ Humpback whale breaching beside sailboat under dramatic storm clouds, telephoto perspective
221
+ Minimalist snowy landscape with single crimson maple tree breaking endless white expanse
222
+ Low‑angle shot of raindrops impacting still pond, concentric ripples overlapping gracefully
223
+ Juvenile emperor penguins huddling together on Antarctic ice shelf, gentle snowfall
224
+ Monsoon lightning fork illuminating terraced rice paddies in tropical valley
225
+ Photograph of desert sand dune crest with sharp wind‑carved ridges and subtle color gradients
226
+ Crisp autumn scene with mirror‑still lake reflecting birch trees in full yellow foliage
227
+ Grizzly bear catching salmon mid‑leap at waterfall, droplets sparkling in sun
228
+ Ultra‑wide rainforest canopy shot looking straight up at towering kapok trees and lianas
229
+ Frosty spiderweb adorned with dew pearls against muted sunrise pastel sky
230
+ Split‑tone black‑and‑white portrait of African elephant with textured, weathered skin
231
+ Close‑focus shot of chameleon eyes moving independently while clinging to branch
232
+ Macro of raindrop on leaf acting as natural lens magnifying vein structure
233
+ Evening photo of bioluminescent plankton igniting gentle waves on secluded beach
234
+ Snowy owl in silent flight, wings fully extended, low sunlight catching feathers
235
+ Panoramic vista of Grand Canyon with storm rolling in, sunbreak spotlighting sandstone layers
236
+ Reflection of northern lights on frozen lake with cracked ice foreground patterns
237
+ Monarch butterflies clustering densely on eucalyptus branches during migration season
238
+ Fogbow arcing over coastal cliffs during sunrise, soft pastel halo effect
239
+ Underwater shot beneath breaking wave showing turbulent bubbles and sandy seabed
240
+ Starlit desert night with silhouetted Joshua trees and meteor streaks overhead
241
+ Close‑up of mossy forest floor with tiny mushrooms resembling fairy rooftops
242
+ Gorilla family interaction, young playing under watchful silverback in mountain forest
243
+ Infrared portrait of flamingo colony, feathers rendered in surreal icy tones
244
+ Sunlit macro of water droplet refracting inverted mountain landscape
245
+ Silk‑smooth long‑exposure image of tide swirling around jagged sea stacks
246
+ Beetle with metallic iridescence crawling over textured bark, focus stacked
247
+ Golden eagle soaring against dramatic cumulonimbus background, telephoto sharpness
248
+ Low‑key studio shot of concentric nautilus shell cross‑section revealing logarithmic spiral
249
+ High‑shutter capture of hummingbird beating wings beside red hibiscus bloom
250
+ Pastel sunrise over mist‑covered bamboo forest, layered depth fading into distance
251
+ Underwater cave photo with diver silhouette illuminated by cyan light shaft
252
+ Macro of snowflake on dark wool mitten showing intricate hexagonal symmetry
253
+ Valley of wildflowers beneath towering granite cliffs during alpine spring bloom
254
+ Reflection of autumn forest distorted in moving river captured abstractly
255
+ Overhead view of stingrays casting shadows on shallow sandy seafloor in turquoise water
256
+ Red fox curled into ball sleeping amidst fallen maple leaves, soft light
257
+ Thunderstorm shelf cloud sweeping across prairie wheat field, cinematic contrast
258
+ Underlit jellyfish drifting gracefully in inky black aquarium space, tentacles trailing
259
+ Cliffside puffin returning to burrow with beak full of fish, ocean backdrop
260
+ High‑frame burst of snow being shaken off evergreen branch, frozen crystals glimmering
261
+ Evening silhouette of baobab trees reflected in seasonal floodplain under violet sky
262
+ Ultra‑wide sunflower field facing rising sun, radial pattern leading toward horizon
263
+ Close‑up of wolf tracks imprinted in fresh snow, subtle shadows defining ridges
264
+ Macro texture study of elephant skin crack patterns, monochrome emphasis
265
+ Gentle cascade flowing over terraced travertine pools, mineral‑rich aqua water
266
+ Colony of bats exiting cave entrance at dusk, blurred motion streaks against sky
267
+ Raindrops clinging to spider lily petals, selective focus yields painterly background
268
+ Drone shot of turquoise river braided through black volcanic sands creating abstract art
269
+ Slow‑motion splash of crimson pomegranate seeds into clear water, crown burst captured
270
+ Glasswing butterfly perched on leaf, transparent wings revealing background blossoms
271
+ Camel caravan crossing vast erg dunes under scorching midday sun, heat shimmer visible
272
+ Sunset silhouette of giraffes browsing acacia trees on savanna ridge
273
+ Macro portrait of praying mantis head showing compound eye facets with rainbow sheen
274
+ Sunlit iceberg arch framing distant mountain range, polar wilderness
275
+ Shallow‑depth‑of‑field photo of blooming cherry blossoms with soft pastel bokeh
276
+ Sodium vapor night image of urban fox exploring quiet alley, eyes gleaming
277
+ Cresting ocean wave backlit to reveal turquoise translucence and spray diamonds
278
+ Water droplet crown captured on reflective surface using high‑speed flash
279
+ Panorama of volcanic eruption under starry sky, lava rivers glowing intensely
280
+ Quokka standing on hind legs engaging camera with curious expression, beach background
281
+ Macro of tomato frog skin showing porous texture and vivid coloration
282
+ Stormy sky double‑rainbow over field of lupine flowers in twilight
283
+ Leafcutter ants marching along branch carrying leaf fragments, shallow focus foreground blur
284
+ Silhouette of migrating cranes flying in V‑formation across fiery sunrise
285
+ Submerged forest trunks in crystal lake water creating surreal vertical reflections
286
+ Close‑up of gecko adhesion pads under microscope revealing microscopic setae
287
+ Golden waterfall of ginkgo leaves falling in city park during gentle breeze
288
+ Moody long‑exposure shot of lighthouse battered by crashing Atlantic waves
289
+ Moorland early‑morning heather fields shrouded in low‑lying mist, soft pink tones
290
+ Axolotl photographed head‑on in clear water tank, external gills fanned out
291
+ Panoramic ridge walk above sea of clouds with hikers silhouetted, late afternoon
292
+ Infrared aerial of agricultural patchwork revealing hidden irrigation patterns
293
+ Backlit translucent maple leaf showing branching vein network in vivid detail
294
+ Whale shark alongside snorkeler for scale, sun rays piercing surface
295
+ Fire salamander crawling across wet moss with saturated black‑yellow contrast
296
+ Ice cave interior glowing sapphire as sunlight filters through thick glacier ice
297
+ Desert bloom macro of cactus flower opening at dawn, dew droplets sparkling
298
+ Slow‑shutter capture of star trails spinning around Polaris above stone ruins
299
+ High‑speed frame of geyser eruption against cobalt sky in geothermal field
300
+ Portrait of peacock displaying fully fanned tail feathers, iridescent eyespots centered
301
+ ancient cave painting of a hedgehog floating in cosmic void
302
+ Art‑Deco travel poster for a zorilla projected onto rainy cityscape
303
+ Japanese ink wash of an urchin against black velvet backdrop
304
+ stained‑glass mosaic of a tiger amid swirling galaxies
305
+ Art‑Deco travel poster for a jellyfish under moonlit sky
306
+ neon synthwave poster featuring a deer amid swirling galaxies
307
+ cyberpunk hologram of a cheetah amid swirling galaxies
308
+ Celtic knotwork engraving of a capybara over vibrant gradient background
309
+ Celtic knotwork engraving of a red panda over vibrant gradient background
310
+ embroidered textile artwork of a panda inside glass terrarium
311
+ low‑poly 3‑D render of a dolphin projected onto rainy cityscape
312
+ baroque oil painting featuring a jellyfish under moonlit sky
313
+ embroidered textile artwork of a penguin inside glass terrarium
314
+ baroque oil painting featuring a quokka against black velvet backdrop
315
+ low‑poly 3‑D render of a koala against black velvet backdrop
316
+ Japanese ink wash of an octopus surrounded by geometric patterns
317
+ cyberpunk hologram of an egret against black velvet backdrop
318
+ steampunk clockwork version of a macaw against black velvet backdrop
319
+ cubist painting depicting an ibis over vibrant gradient background
320
+ cyberpunk hologram of a salamander amid swirling galaxies
321
+ ancient cave painting of a giraffe over vibrant gradient background
322
+ steampunk clockwork version of an ibis surrounded by geometric patterns
323
+ minimalist line‑art study of a hedgehog over vibrant gradient background
324
+ chalk pastel sidewalk mural of a red panda on vintage parchment
325
+ baroque oil painting featuring a salamander amid swirling galaxies
326
+ Art‑Deco travel poster for a hippopotamus floating in cosmic void
327
+ low‑poly 3‑D render of an alpaca on vintage parchment
328
+ chalk pastel sidewalk mural of a newt under moonlit sky
329
+ minimalist line‑art study of an armadillo surrounded by geometric patterns
330
+ futuristic chrome statue of a dolphin inside glass terrarium
331
+ Art‑Deco travel poster for a macaw under moonlit sky
332
+ cubist painting depicting an urchin in ornate golden frame
333
+ chalk pastel sidewalk mural of a quokka amid swirling galaxies
334
+ steampunk clockwork version of a peacock on vintage parchment
335
+ psychedelic tie‑dye depiction of a toucan surrounded by geometric patterns
336
+ cyberpunk hologram of a tiger projected onto rainy cityscape
337
+ ancient cave painting of a badger floating in cosmic void
338
+ stained‑glass mosaic of a caracal over vibrant gradient background
339
+ embroidered textile artwork of a dragonfly on vintage parchment
340
+ pixel‑art sprite sheet for an armadillo over vibrant gradient background
341
+ futuristic chrome statue of a salamander inside glass terrarium
342
+ origami paper sculpture of an elephant surrounded by geometric patterns
343
+ minimalist line‑art study of an ibis amid swirling galaxies
344
+ Art‑Deco travel poster for a giraffe under moonlit sky
345
+ cubist painting depicting a giraffe over vibrant gradient background
346
+ embroidered textile artwork of a dragonfly surrounded by geometric patterns
347
+ ceramic glazed statue of a newt over vibrant gradient background
348
+ Art‑Deco travel poster for a bison under moonlit sky
349
+ chalk pastel sidewalk mural of a caracal on vintage parchment
350
+ steampunk clockwork version of a jellyfish under moonlit sky
351
+ chalk pastel sidewalk mural of a penguin projected onto rainy cityscape
352
+ cyberpunk hologram of an alpaca projected onto rainy cityscape
353
+ stained‑glass mosaic of a cheetah amid swirling galaxies
354
+ cyberpunk hologram of a hedgehog projected onto rainy cityscape
355
+ watercolor splash illustration of a caracal amid swirling galaxies
356
+ neon synthwave poster featuring a goat on vintage parchment
357
+ ancient cave painting of a hedgehog projected onto rainy cityscape
358
+ psychedelic tie‑dye depiction of a whale against black velvet backdrop
359
+ Celtic knotwork engraving of a kiwi bird inside glass terrarium
360
+ ancient cave painting of a bison inside glass terrarium
361
+ origami paper sculpture of a whale inside glass terrarium
362
+ origami paper sculpture of an armadillo under moonlit sky
363
+ stained‑glass mosaic of a seahorse on vintage parchment
364
+ steampunk clockwork version of a kangaroo on vintage parchment
365
+ ceramic glazed statue of a koala against black velvet backdrop
366
+ chalk pastel sidewalk mural of a whale against black velvet backdrop
367
+ minimalist line‑art study of a lemur on vintage parchment
368
+ Art‑Deco travel poster for a tiger projected onto rainy cityscape
369
+ watercolor splash illustration of a ferret amid swirling galaxies
370
+ minimalist line‑art study of a newt on vintage parchment
371
+ psychedelic tie‑dye depiction of an ibis over vibrant gradient background
372
+ Art‑Deco travel poster for a bison over vibrant gradient background
373
+ futuristic chrome statue of a cheetah under moonlit sky
374
+ baroque oil painting featuring a vulture projected onto rainy cityscape
375
+ Japanese ink wash of a cheetah projected onto rainy cityscape
376
+ minimalist line‑art study of a tapir inside glass terrarium
377
+ stained‑glass mosaic of a raven on vintage parchment
378
+ Japanese ink wash of a macaw under moonlit sky
379
+ pixel‑art sprite sheet for a tapir projected onto rainy cityscape
380
+ stained‑glass mosaic of a tapir under moonlit sky
381
+ Art‑Deco travel poster for a kangaroo against black velvet backdrop
382
+ Japanese ink wash of a panda amid swirling galaxies
383
+ watercolor splash illustration of a giraffe inside glass terrarium
384
+ minimalist line‑art study of a bison against black velvet backdrop
385
+ chalk pastel sidewalk mural of an egret over vibrant gradient background
386
+ origami paper sculpture of a polar bear amid swirling galaxies
387
+ minimalist line‑art study of a walrus under moonlit sky
388
+ Celtic knotwork engraving of a salamander over vibrant gradient background
389
+ cyberpunk hologram of an ibis amid swirling galaxies
390
+ minimalist line‑art study of a toucan under moonlit sky
391
+ pixel‑art sprite sheet for a quokka floating in cosmic void
392
+ chalk pastel sidewalk mural of a bison in ornate golden frame
393
+ low‑poly 3‑D render of a capybara under moonlit sky
394
+ Celtic knotwork engraving of a yak inside glass terrarium
395
+ steampunk clockwork version of an armadillo amid swirling galaxies
396
+ baroque oil painting featuring a tiger against black velvet backdrop
397
+ steampunk clockwork version of a newt inside glass terrarium
398
+ ancient cave painting of an owl over vibrant gradient background
399
+ ancient cave painting of a raven in ornate golden frame
400
+ origami paper sculpture of an egret against black velvet backdrop
401
+ Photorealistic close‑up of a jackal basking in sun in the misty valley
402
+ Photorealistic close‑up of a caracal calling loudly in the sunlit heathland
403
+ Photorealistic close‑up of a walrus calling loudly in the snowy tundra
404
+ Photorealistic close‑up of a toucan gliding effortlessly in the sunlit heathland
405
+ Photorealistic close‑up of a newt leaping gracefully in the crystal clear alpine lake
406
+ Photorealistic close‑up of a macaw calling loudly in the rocky coastal cliffs
407
+ Photorealistic close‑up of a raven gliding effortlessly in the starlit ocean surface
408
+ Photorealistic close‑up of a kiwi bird bathing playfully in the tropical rainforest canopy
409
+ Photorealistic close‑up of a peacock nursing its young in the savanna at dawn
410
+ Photorealistic close‑up of a penguin resting peacefully in the tropical rainforest canopy
411
+ Photorealistic close‑up of an octopus gliding effortlessly in the crystal clear alpine lake
412
+ Photorealistic close‑up of a vulture gliding effortlessly in the rushing waterfall spray
413
+ Photorealistic close‑up of a butterfly resting peacefully in the tropical rainforest canopy
414
+ Photorealistic close‑up of a porcupine leaping gracefully in the coral reef
415
+ Photorealistic close‑up of an elephant leaping gracefully in the dense mangrove swamp
416
+ Photorealistic close‑up of a butterfly hunting silently in the rushing waterfall spray
417
+ Photorealistic close‑up of a whale prowling cautiously in the crystal clear alpine lake
418
+ Photorealistic close‑up of an egret hunting silently in the snowy tundra
419
+ Photorealistic close‑up of an urchin prowling cautiously in the mountain meadow
420
+ Photorealistic close‑up of an ibis gliding effortlessly in the golden desert dunes
421
+ Photorealistic close‑up of a cheetah nursing its young in the open grassland under stormy sky
422
+ Photorealistic close‑up of a koala leaping gracefully in the snowy tundra
423
+ Photorealistic close‑up of a vulture resting peacefully in the sunlit heathland
424
+ Photorealistic close‑up of a dragonfly gliding effortlessly in the icy Antarctic shelf
425
+ Photorealistic close‑up of a ferret basking in sun in the dense mangrove swamp
426
+ Photorealistic close‑up of a panda nursing its young in the starlit ocean surface
427
+ Photorealistic close‑up of a zorilla leaping gracefully in the submerged kelp forest
428
+ Photorealistic close‑up of a koala basking in sun in the misty valley
429
+ Photorealistic close‑up of a lemming foraging curiously in the savanna at dawn
430
+ Photorealistic close‑up of a macaw hunting silently in the tropical rainforest canopy
431
+ Photorealistic close‑up of a dragonfly basking in sun in the submerged kelp forest
432
+ Photorealistic close‑up of an urchin prowling cautiously in the tropical rainforest canopy
433
+ Photorealistic close‑up of a tiger nursing its young in the starlit ocean surface
434
+ Photorealistic close‑up of an egret gliding effortlessly in the foggy pine forest
435
+ Photorealistic close‑up of a goat resting peacefully in the tropical rainforest canopy
436
+ Photorealistic close‑up of a walrus nursing its young in the submerged kelp forest
437
+ Photorealistic close‑up of an elephant bathing playfully in the lush river delta
438
+ Photorealistic close‑up of an urchin calling loudly in the twilight prairie
439
+ Photorealistic close‑up of a giraffe gliding effortlessly in the dense mangrove swamp
440
+ Photorealistic close‑up of a dolphin nursing its young in the foggy pine forest
441
+ Photorealistic close‑up of an egret basking in sun in the mountain meadow
442
+ Photorealistic close‑up of a lion foraging curiously in the golden desert dunes
443
+ Photorealistic close‑up of a wolf leaping gracefully in the coral reef
444
+ Photorealistic close‑up of a seahorse bathing playfully in the mountain meadow
445
+ Photorealistic close‑up of an urchin basking in sun in the savanna at dawn
446
+ Photorealistic close‑up of a flamingo basking in sun in the icy Antarctic shelf
447
+ Photorealistic close‑up of a caterpillar basking in sun in the starlit ocean surface
448
+ Photorealistic close‑up of a polar bear bathing playfully in the coral reef
449
+ Photorealistic close‑up of a kiwi bird leaping gracefully in the starlit ocean surface
450
+ Photorealistic close‑up of a flamingo prowling cautiously in the steep bamboo grove
451
+ Photorealistic close‑up of a dragonfly hunting silently in the rushing waterfall spray
452
+ Photorealistic close‑up of a salamander bathing playfully in the mountain meadow
453
+ Photorealistic close‑up of a hippopotamus hunting silently in the mountain meadow
454
+ Photorealistic close‑up of a capybara hunting silently in the savanna at dawn
455
+ Photorealistic close‑up of an octopus nursing its young in the savanna at dawn
456
+ Photorealistic close‑up of a macaw leaping gracefully in the foggy pine forest
457
+ Photorealistic close‑up of an oriole resting peacefully in the coral reef
458
+ Photorealistic close‑up of a panda foraging curiously in the steep bamboo grove
459
+ Photorealistic close‑up of a tiger leaping gracefully in the snowy tundra
460
+ Photorealistic close‑up of a caracal gliding effortlessly in the icy Antarctic shelf
461
+ Photorealistic close‑up of a panda calling loudly in the snowy tundra
462
+ Photorealistic close‑up of a jackal nursing its young in the open grassland under stormy sky
463
+ Photorealistic close‑up of a kangaroo nursing its young in the misty valley
464
+ Photorealistic close‑up of a polar bear basking in sun in the steep bamboo grove
465
+ Photorealistic close‑up of a toucan resting peacefully in the golden desert dunes
466
+ Photorealistic close‑up of a kiwi bird bathing playfully in the submerged kelp forest
467
+ Photorealistic close‑up of a deer resting peacefully in the twilight prairie
468
+ Photorealistic close‑up of a fox prowling cautiously in the twilight prairie
469
+ Photorealistic close‑up of a bison bathing playfully in the crystal clear alpine lake
470
+ Photorealistic close‑up of a walrus prowling cautiously in the misty valley
471
+ Photorealistic close‑up of a chameleon foraging curiously in the dense mangrove swamp
472
+ Photorealistic close‑up of a raven hunting silently in the crystal clear alpine lake
473
+ Photorealistic close‑up of a peacock basking in sun in the savanna at dawn
474
+ Photorealistic close‑up of a seahorse foraging curiously in the crystal clear alpine lake
475
+ Photorealistic close‑up of a tapir gliding effortlessly in the crystal clear alpine lake
476
+ Photorealistic close‑up of a polar bear resting peacefully in the foggy pine forest
477
+ Photorealistic close‑up of an urchin resting peacefully in the rushing waterfall spray
478
+ Photorealistic close‑up of an armadillo leaping gracefully in the foggy pine forest
479
+ Photorealistic close‑up of a hippopotamus gliding effortlessly in the foggy pine forest
480
+ Photorealistic close‑up of a hippopotamus bathing playfully in the starlit ocean surface
481
+ Photorealistic close‑up of an alpaca nursing its young in the foggy pine forest
482
+ Photorealistic close‑up of an armadillo foraging curiously in the foggy pine forest
483
+ Photorealistic close‑up of a fox resting peacefully in the snowy tundra
484
+ Photorealistic close‑up of a kangaroo resting peacefully in the dense mangrove swamp
485
+ Photorealistic close‑up of an egret foraging curiously in the steep bamboo grove
486
+ Photorealistic close‑up of a toucan hunting silently in the sunlit heathland
487
+ Photorealistic close‑up of a chameleon nursing its young in the tropical rainforest canopy
488
+ Photorealistic close‑up of a raccoon leaping gracefully in the rushing waterfall spray
489
+ Photorealistic close‑up of a chameleon calling loudly in the coral reef
490
+ Photorealistic close‑up of a polar bear foraging curiously in the rushing waterfall spray
491
+ Photorealistic close‑up of a caracal prowling cautiously in the coral reef
492
+ Photorealistic close‑up of a tapir bathing playfully in the starlit ocean surface
493
+ Photorealistic close‑up of a caracal calling loudly in the rocky coastal cliffs
494
+ Photorealistic close‑up of a goat leaping gracefully in the misty valley
495
+ Photorealistic close‑up of a yak calling loudly in the misty valley
496
+ Photorealistic close‑up of a flamingo calling loudly in the golden desert dunes
497
+ Photorealistic close‑up of a cheetah prowling cautiously in the savanna at dawn
498
+ Photorealistic close‑up of a macaw resting peacefully in the snowy tundra
499
+ Photorealistic close‑up of a hedgehog basking in sun in the lush river delta
500
+ Photorealistic close‑up of a bison hunting silently in the snowy tundra
501
+ Portrait of a data visualizer demonstrating calligraphy strokes on rice paper, candid, natural lighting
502
+ Portrait of a bridge acoustics analyst practicing ballet leaps on an empty theater stage, candid, natural lighting
503
+ Portrait of a classical guitarist harvesting greens on a rooftop farm at sunrise, candid, natural lighting
504
+ Portrait of a harbor master crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
505
+ Portrait of an aquaculture farmer glazing pottery beside a crackling kiln, candid, natural lighting
506
+ Portrait of a blacksmith contemplating star charts under a dim observatory dome, candid, natural lighting
507
+ Portrait of a glass mosaic artist practicing ballet leaps on an empty theater stage, candid, natural lighting
508
+ Portrait of an avian ecologist demonstrating calligraphy strokes on rice paper, candid, natural lighting
509
+ Portrait of a landscape painter harvesting greens on a rooftop farm at sunrise, candid, natural lighting
510
+ Portrait of a river rafting guide analyzing rock samples in a windswept canyon, candid, natural lighting
511
+ Portrait of a habitat designer assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
512
+ Portrait of a bridge inspector demonstrating calligraphy strokes on rice paper, candid, natural lighting
513
+ Portrait of a geologist recording ambient sounds inside an old forest, candid, natural lighting
514
+ Portrait of a woodworker shaping surfboards in a powdery shaping bay, candid, natural lighting
515
+ Portrait of a data visualizer testing experimental robots in a sleek lab, candid, natural lighting
516
+ Portrait of a kite maker blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
517
+ Portrait of a bridge acoustics analyst contemplating star charts under a dim observatory dome, candid, natural lighting
518
+ Portrait of a wildland firefighter packing honeycombs at a rustic apiary, candid, natural lighting
519
+ Portrait of a mechanical watchmaker assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
520
+ Portrait of an astronomer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
521
+ Portrait of a labyrinth gardener harvesting greens on a rooftop farm at sunrise, candid, natural lighting
522
+ Portrait of a violinist performing on a cobblestone street at dusk, candid, natural lighting
523
+ Portrait of a cheese monger glazing pottery beside a crackling kiln, candid, natural lighting
524
+ Portrait of a heritage conservator shaping surfboards in a powdery shaping bay, candid, natural lighting
525
+ Portrait of a rowboat builder performing on a cobblestone street at dusk, candid, natural lighting
526
+ Portrait of a bonsai cultivator cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
527
+ Portrait of a wildlife rehabilitator practicing ballet leaps on an empty theater stage, candid, natural lighting
528
+ Portrait of a street photographer demonstrating calligraphy strokes on rice paper, candid, natural lighting
529
+ Portrait of a librarian performing on a cobblestone street at dusk, candid, natural lighting
530
+ Portrait of a surfboard shaper guiding hikers along a misty ridge, candid, natural lighting
531
+ Portrait of an archaeologist assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
532
+ Portrait of a mechanical watchmaker assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
533
+ Portrait of a hydrologist glazing pottery beside a crackling kiln, candid, natural lighting
534
+ Portrait of a classical guitarist practicing ballet leaps on an empty theater stage, candid, natural lighting
535
+ Portrait of a stained‑glass restorer testing experimental robots in a sleek lab, candid, natural lighting
536
+ Portrait of a ballet dancer working in a sunlit studio cluttered with tools, candid, natural lighting
537
+ Portrait of a mechanical watchmaker restoring stained glass under soft cathedral light, candid, natural lighting
538
+ Portrait of a data visualizer packing honeycombs at a rustic apiary, candid, natural lighting
539
+ Portrait of a habitat designer working in a sunlit studio cluttered with tools, candid, natural lighting
540
+ Portrait of a bonsai cultivator cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
541
+ Portrait of a sustainable architect analyzing rock samples in a windswept canyon, candid, natural lighting
542
+ Portrait of an antique restorer analyzing rock samples in a windswept canyon, candid, natural lighting
543
+ Portrait of a habitat designer contemplating star charts under a dim observatory dome, candid, natural lighting
544
+ Portrait of an urban sketcher crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
545
+ Portrait of a bike messenger recording ambient sounds inside an old forest, candid, natural lighting
546
+ Portrait of a calligrapher restoring stained glass under soft cathedral light, candid, natural lighting
547
+ Portrait of a beekeeper recording ambient sounds inside an old forest, candid, natural lighting
548
+ Portrait of a sound Foley artist glazing pottery beside a crackling kiln, candid, natural lighting
549
+ Portrait of a renewable energy lobbyist glazing pottery beside a crackling kiln, candid, natural lighting
550
+ Portrait of a tattoo artist blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
551
+ Portrait of a marionette puppeteer recording ambient sounds inside an old forest, candid, natural lighting
552
+ Portrait of a hand‑pan musician harvesting greens on a rooftop farm at sunrise, candid, natural lighting
553
+ Portrait of a mountain guide painting a landscape en plein air beside a river, candid, natural lighting
554
+ Portrait of a materials engineer cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
555
+ Portrait of a stained‑glass restorer painting a landscape en plein air beside a river, candid, natural lighting
556
+ Portrait of a materials engineer assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
557
+ Portrait of a labyrinth gardener demonstrating calligraphy strokes on rice paper, candid, natural lighting
558
+ Portrait of a potter blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
559
+ Portrait of a prosthetics designer harvesting greens on a rooftop farm at sunrise, candid, natural lighting
560
+ Portrait of a wind surfer crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
561
+ Portrait of a hydrologist guiding hikers along a misty ridge, candid, natural lighting
562
+ Portrait of a hand‑pan musician assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
563
+ Portrait of a mountain guide harvesting greens on a rooftop farm at sunrise, candid, natural lighting
564
+ Portrait of a ceramic artist practicing ballet leaps on an empty theater stage, candid, natural lighting
565
+ Portrait of a ship pilot harvesting greens on a rooftop farm at sunrise, candid, natural lighting
566
+ Portrait of a rowboat builder analyzing rock samples in a windswept canyon, candid, natural lighting
567
+ Portrait of a restoration carpenter painting a landscape en plein air beside a river, candid, natural lighting
568
+ Portrait of a bridge acoustics analyst crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
569
+ Portrait of a shoemaker glazing pottery beside a crackling kiln, candid, natural lighting
570
+ Portrait of a hydrologist testing experimental robots in a sleek lab, candid, natural lighting
571
+ Portrait of a heritage conservator working in a sunlit studio cluttered with tools, candid, natural lighting
572
+ Portrait of a harbor master guiding hikers along a misty ridge, candid, natural lighting
573
+ Portrait of a field linguist guiding hikers along a misty ridge, candid, natural lighting
574
+ Portrait of a ceramic artist painting a landscape en plein air beside a river, candid, natural lighting
575
+ Portrait of a silversmith performing on a cobblestone street at dusk, candid, natural lighting
576
+ Portrait of a glass mosaic artist recording ambient sounds inside an old forest, candid, natural lighting
577
+ Portrait of a potter assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
578
+ Portrait of an ice climber analyzing rock samples in a windswept canyon, candid, natural lighting
579
+ Portrait of a bookbinder crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
580
+ Portrait of a marionette puppeteer restoring stained glass under soft cathedral light, candid, natural lighting
581
+ Portrait of a chef practicing ballet leaps on an empty theater stage, candid, natural lighting
582
+ Portrait of a marionette puppeteer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
583
+ Portrait of a search‑and‑rescue dog handler shaping surfboards in a powdery shaping bay, candid, natural lighting
584
+ Portrait of a field linguist blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
585
+ Portrait of a kite maker contemplating star charts under a dim observatory dome, candid, natural lighting
586
+ Portrait of a toy designer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
587
+ Portrait of a calligrapher shaping surfboards in a powdery shaping bay, candid, natural lighting
588
+ Portrait of a botanist glazing pottery beside a crackling kiln, candid, natural lighting
589
+ Portrait of a botanist contemplating star charts under a dim observatory dome, candid, natural lighting
590
+ Portrait of a gardener assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
591
+ Portrait of a wildland firefighter harvesting greens on a rooftop farm at sunrise, candid, natural lighting
592
+ Portrait of a gardener cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
593
+ Portrait of a hydrologist assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
594
+ Portrait of a community baker restoring stained glass under soft cathedral light, candid, natural lighting
595
+ Portrait of a rowboat builder analyzing rock samples in a windswept canyon, candid, natural lighting
596
+ Portrait of a choral conductor glazing pottery beside a crackling kiln, candid, natural lighting
597
+ Portrait of a geologist glazing pottery beside a crackling kiln, candid, natural lighting
598
+ Portrait of a mechanical watchmaker crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
599
+ Portrait of a tattoo artist packing honeycombs at a rustic apiary, candid, natural lighting
600
+ Portrait of a botanist glazing pottery beside a crackling kiln, candid, natural lighting
601
+ High‑resolution vintage travel poster spelling 'Velocity' carved into towering glacier face, dramatic lighting
602
+ High‑resolution ice sculpture headline spelling 'Eclipse' carved into towering glacier face, dramatic lighting
603
+ High‑resolution glowing moss graffiti spelling 'Velocity' floating inside zero‑gravity space station, dramatic lighting
604
+ High‑resolution LED hologram billboard spelling 'Orbit' amid cherry‑blossom snowfall, dramatic lighting
605
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' amid cherry‑blossom snowfall, dramatic lighting
606
+ High‑resolution ice sculpture headline spelling 'Nebula' hovering above stormy ocean waves, dramatic lighting
607
+ High‑resolution LED hologram billboard spelling 'Odyssey' emerging from rolling morning fog, dramatic lighting
608
+ High‑resolution neon street sign spelling 'Voyage' over a bustling retro‑futuristic metropolis, dramatic lighting
609
+ High‑resolution sand dune calligraphy spelling 'Ethereal' carved into towering glacier face, dramatic lighting
610
+ High‑resolution neon street sign spelling 'Eclipse' suspended between skyscrapers at twilight, dramatic lighting
611
+ High‑resolution glowing moss graffiti spelling 'Quantum' projected onto ancient ruins at dusk, dramatic lighting
612
+ High‑resolution chalkboard typography sketch spelling 'Harmony' against star‑filled desert night sky, dramatic lighting
613
+ High‑resolution aerial crop‑art installation spelling 'Orbit' floating inside zero‑gravity space station, dramatic lighting
614
+ High‑resolution steampunk brass engraving spelling 'Nebula' emerging from rolling morning fog, dramatic lighting
615
+ High‑resolution glowing moss graffiti spelling 'Zenith' projected onto ancient ruins at dusk, dramatic lighting
616
+ High‑resolution glowing moss graffiti spelling 'Velocity' over a bustling retro‑futuristic metropolis, dramatic lighting
617
+ High‑resolution glowing moss graffiti spelling 'Odyssey' projected onto ancient ruins at dusk, dramatic lighting
618
+ High‑resolution neon street sign spelling 'Orbit' hovering above stormy ocean waves, dramatic lighting
619
+ High‑resolution neon street sign spelling 'Nebula' against star‑filled desert night sky, dramatic lighting
620
+ High‑resolution glowing moss graffiti spelling 'Synthesis' hovering above stormy ocean waves, dramatic lighting
621
+ High‑resolution LED hologram billboard spelling 'Nebula' carved into towering glacier face, dramatic lighting
622
+ High‑resolution chalkboard typography sketch spelling 'Quantum' amid cherry‑blossom snowfall, dramatic lighting
623
+ High‑resolution LED hologram billboard spelling 'Odyssey' suspended between skyscrapers at twilight, dramatic lighting
624
+ High‑resolution LED hologram billboard spelling 'Ethereal' over a bustling retro‑futuristic metropolis, dramatic lighting
625
+ High‑resolution ice sculpture headline spelling 'Eclipse' projected onto ancient ruins at dusk, dramatic lighting
626
+ High‑resolution vintage travel poster spelling 'Serendipity' floating inside zero‑gravity space station, dramatic lighting
627
+ High‑resolution steampunk brass engraving spelling 'Cascade' projected onto ancient ruins at dusk, dramatic lighting
628
+ High‑resolution skywritten message spelling 'Harmony' suspended between skyscrapers at twilight, dramatic lighting
629
+ High‑resolution ice sculpture headline spelling 'Synthesis' suspended between skyscrapers at twilight, dramatic lighting
630
+ High‑resolution neon street sign spelling 'Serendipity' against star‑filled desert night sky, dramatic lighting
631
+ High‑resolution steampunk brass engraving spelling 'Orbit' suspended between skyscrapers at twilight, dramatic lighting
632
+ High‑resolution chalkboard typography sketch spelling 'Serendipity' carved into towering glacier face, dramatic lighting
633
+ High‑resolution steampunk brass engraving spelling 'Equinox' reflected in rain‑soaked cobblestones, dramatic lighting
634
+ High‑resolution vintage travel poster spelling 'Harmony' projected onto ancient ruins at dusk, dramatic lighting
635
+ High‑resolution vintage travel poster spelling 'Voyage' floating inside zero‑gravity space station, dramatic lighting
636
+ High‑resolution aerial crop‑art installation spelling 'Quantum' against star‑filled desert night sky, dramatic lighting
637
+ High‑resolution glowing moss graffiti spelling 'Voyage' reflected in rain‑soaked cobblestones, dramatic lighting
638
+ High‑resolution steampunk brass engraving spelling 'Velocity' amid cherry‑blossom snowfall, dramatic lighting
639
+ High‑resolution steampunk brass engraving spelling 'Zenith' hovering above stormy ocean waves, dramatic lighting
640
+ High‑resolution ice sculpture headline spelling 'Orbit' over a bustling retro‑futuristic metropolis, dramatic lighting
641
+ High‑resolution chalkboard typography sketch spelling 'Harmony' reflected in rain‑soaked cobblestones, dramatic lighting
642
+ High‑resolution aerial crop‑art installation spelling 'Cascade' projected onto ancient ruins at dusk, dramatic lighting
643
+ High‑resolution neon street sign spelling 'Harmony' reflected in rain‑soaked cobblestones, dramatic lighting
644
+ High‑resolution vintage travel poster spelling 'Zenith' emerging from rolling morning fog, dramatic lighting
645
+ High‑resolution aerial crop‑art installation spelling 'Quantum' emerging from rolling morning fog, dramatic lighting
646
+ High‑resolution LED hologram billboard spelling 'Voyage' amid cherry‑blossom snowfall, dramatic lighting
647
+ High‑resolution vintage travel poster spelling 'Quantum' amid cherry‑blossom snowfall, dramatic lighting
648
+ High‑resolution chalkboard typography sketch spelling 'Momentum' projected onto ancient ruins at dusk, dramatic lighting
649
+ High‑resolution neon street sign spelling 'Zenith' projected onto ancient ruins at dusk, dramatic lighting
650
+ High‑resolution glowing moss graffiti spelling 'Voyage' reflected in rain‑soaked cobblestones, dramatic lighting
651
+ High‑resolution glowing moss graffiti spelling 'Odyssey' floating inside zero‑gravity space station, dramatic lighting
652
+ High‑resolution neon street sign spelling 'Serendipity' projected onto ancient ruins at dusk, dramatic lighting
653
+ High‑resolution sand dune calligraphy spelling 'Harmony' projected onto ancient ruins at dusk, dramatic lighting
654
+ High‑resolution neon street sign spelling 'Serendipity' carved into towering glacier face, dramatic lighting
655
+ High‑resolution chalkboard typography sketch spelling 'Momentum' floating inside zero‑gravity space station, dramatic lighting
656
+ High‑resolution sand dune calligraphy spelling 'Momentum' suspended between skyscrapers at twilight, dramatic lighting
657
+ High‑resolution ice sculpture headline spelling 'Eclipse' carved into towering glacier face, dramatic lighting
658
+ High‑resolution LED hologram billboard spelling 'Quantum' over a bustling retro‑futuristic metropolis, dramatic lighting
659
+ High‑resolution ice sculpture headline spelling 'Synthesis' over a bustling retro‑futuristic metropolis, dramatic lighting
660
+ High‑resolution glowing moss graffiti spelling 'Equinox' amid cherry‑blossom snowfall, dramatic lighting
661
+ High‑resolution ice sculpture headline spelling 'Voyage' against star‑filled desert night sky, dramatic lighting
662
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' carved into towering glacier face, dramatic lighting
663
+ High‑resolution neon street sign spelling 'Voyage' projected onto ancient ruins at dusk, dramatic lighting
664
+ High‑resolution LED hologram billboard spelling 'Equinox' reflected in rain‑soaked cobblestones, dramatic lighting
665
+ High‑resolution ice sculpture headline spelling 'Voyage' projected onto ancient ruins at dusk, dramatic lighting
666
+ High‑resolution steampunk brass engraving spelling 'Equinox' amid cherry‑blossom snowfall, dramatic lighting
667
+ High‑resolution ice sculpture headline spelling 'Momentum' hovering above stormy ocean waves, dramatic lighting
668
+ High‑resolution sand dune calligraphy spelling 'Velocity' projected onto ancient ruins at dusk, dramatic lighting
669
+ High‑resolution glowing moss graffiti spelling 'Momentum' over a bustling retro‑futuristic metropolis, dramatic lighting
670
+ High‑resolution LED hologram billboard spelling 'Nebula' amid cherry‑blossom snowfall, dramatic lighting
671
+ High‑resolution sand dune calligraphy spelling 'Momentum' carved into towering glacier face, dramatic lighting
672
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' emerging from rolling morning fog, dramatic lighting
673
+ High‑resolution skywritten message spelling 'Ethereal' amid cherry‑blossom snowfall, dramatic lighting
674
+ High‑resolution skywritten message spelling 'Equinox' hovering above stormy ocean waves, dramatic lighting
675
+ High‑resolution neon street sign spelling 'Odyssey' projected onto ancient ruins at dusk, dramatic lighting
676
+ High‑resolution sand dune calligraphy spelling 'Equinox' floating inside zero‑gravity space station, dramatic lighting
677
+ High‑resolution aerial crop‑art installation spelling 'Eclipse' projected onto ancient ruins at dusk, dramatic lighting
678
+ High‑resolution aerial crop‑art installation spelling 'Quantum' hovering above stormy ocean waves, dramatic lighting
679
+ High‑resolution vintage travel poster spelling 'Serendipity' amid cherry‑blossom snowfall, dramatic lighting
680
+ High‑resolution sand dune calligraphy spelling 'Ethereal' against star‑filled desert night sky, dramatic lighting
681
+ High‑resolution ice sculpture headline spelling 'Ethereal' carved into towering glacier face, dramatic lighting
682
+ High‑resolution ice sculpture headline spelling 'Quantum' emerging from rolling morning fog, dramatic lighting
683
+ High‑resolution aerial crop‑art installation spelling 'Momentum' hovering above stormy ocean waves, dramatic lighting
684
+ High‑resolution vintage travel poster spelling 'Odyssey' against star‑filled desert night sky, dramatic lighting
685
+ High‑resolution ice sculpture headline spelling 'Cascade' amid cherry‑blossom snowfall, dramatic lighting
686
+ High‑resolution glowing moss graffiti spelling 'Harmony' emerging from rolling morning fog, dramatic lighting
687
+ High‑resolution glowing moss graffiti spelling 'Serendipity' amid cherry‑blossom snowfall, dramatic lighting
688
+ High‑resolution neon street sign spelling 'Orbit' emerging from rolling morning fog, dramatic lighting
689
+ High‑resolution skywritten message spelling 'Harmony' suspended between skyscrapers at twilight, dramatic lighting
690
+ High‑resolution sand dune calligraphy spelling 'Equinox' floating inside zero‑gravity space station, dramatic lighting
691
+ High‑resolution skywritten message spelling 'Cascade' carved into towering glacier face, dramatic lighting
692
+ High‑resolution glowing moss graffiti spelling 'Zenith' reflected in rain‑soaked cobblestones, dramatic lighting
693
+ High‑resolution glowing moss graffiti spelling 'Zenith' emerging from rolling morning fog, dramatic lighting
694
+ High‑resolution steampunk brass engraving spelling 'Orbit' emerging from rolling morning fog, dramatic lighting
695
+ High‑resolution LED hologram billboard spelling 'Ethereal' amid cherry‑blossom snowfall, dramatic lighting
696
+ High‑resolution LED hologram billboard spelling 'Zenith' suspended between skyscrapers at twilight, dramatic lighting
697
+ High‑resolution aerial crop‑art installation spelling 'Ethereal' projected onto ancient ruins at dusk, dramatic lighting
698
+ High‑resolution neon street sign spelling 'Ethereal' projected onto ancient ruins at dusk, dramatic lighting
699
+ High‑resolution LED hologram billboard spelling 'Quantum' floating inside zero‑gravity space station, dramatic lighting
700
+ High‑resolution glowing moss graffiti spelling 'Voyage' against star‑filled desert night sky, dramatic lighting
701
+ cinematic A cyberpunk cityscape at night, rain‑soaked streets and neon signs
702
+ vibrant A bowl of soup that looks like a monster knitted out of wool
703
+ dreamlike A cyberpunk cityscape at night, rain‑soaked streets and neon signs
704
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
705
+ vibrant A steampunk airship sailing above Victorian London at sunrise
706
+ high‑contrast A futuristic sports car parked in an ancient Roman forum, 8K render
707
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
708
+ cinematic A photorealistic image of an astronaut riding a horse on Mars
709
+ vibrant A steampunk airship sailing above Victorian London at sunrise
710
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
711
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
712
+ vibrant A steampunk airship sailing above Victorian London at sunrise
713
+ hyperdetailed A photorealistic image of an astronaut riding a horse on Mars
714
+ vibrant A majestic lion wearing a royal crown, oil on canvas
715
+ high‑contrast A steampunk airship sailing above Victorian London at sunrise
716
+ high‑contrast A cat made of galaxies, digital art
717
+ vibrant A bowl of soup that looks like a monster knitted out of wool
718
+ hyperdetailed A bowl of soup that looks like a monster knitted out of wool
719
+ vibrant A steampunk airship sailing above Victorian London at sunrise
720
+ vibrant A photorealistic image of an astronaut riding a horse on Mars
721
+ dreamlike A futuristic sports car parked in an ancient Roman forum, 8K render
722
+ dreamlike A steampunk airship sailing above Victorian London at sunrise
723
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
724
+ high‑contrast A cyberpunk cityscape at night, rain‑soaked streets and neon signs
725
+ ultrarealistic A cat made of galaxies, digital art
726
+ high‑contrast A steampunk airship sailing above Victorian London at sunrise
727
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
728
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
729
+ cinematic A steampunk airship sailing above Victorian London at sunrise
730
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
731
+ hyperdetailed A photorealistic image of an astronaut riding a horse on Mars
732
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
733
+ vibrant A bowl of soup that looks like a monster knitted out of wool
734
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
735
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
736
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
737
+ vibrant A painting of a fox in the style of Van Gogh
738
+ dreamlike A photorealistic image of an astronaut riding a horse on Mars
739
+ ultrarealistic A cat made of galaxies, digital art
740
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
741
+ dreamlike A painting of a fox in the style of Van Gogh
742
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
743
+ cinematic A bowl of soup that looks like a monster knitted out of wool
744
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
745
+ cinematic A cyberpunk cityscape at night, rain‑soaked streets and neon signs
746
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
747
+ high‑contrast A painting of a fox in the style of Van Gogh
748
+ vibrant A bowl of soup that looks like a monster knitted out of wool
749
+ hyperdetailed A majestic lion wearing a royal crown, oil on canvas
750
+ high‑contrast A cyberpunk cityscape at night, rain‑soaked streets and neon signs
751
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
752
+ cinematic A painting of a fox in the style of Van Gogh
753
+ ultrarealistic A majestic lion wearing a royal crown, oil on canvas
754
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
755
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
756
+ vibrant A steampunk airship sailing above Victorian London at sunrise
757
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
758
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
759
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
760
+ dreamlike A majestic lion wearing a royal crown, oil on canvas
761
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
762
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
763
+ high‑contrast A futuristic sports car parked in an ancient Roman forum, 8K render
764
+ dreamlike A steampunk airship sailing above Victorian London at sunrise
765
+ ultrarealistic A futuristic sports car parked in an ancient Roman forum, 8K render
766
+ dreamlike A cat made of galaxies, digital art
767
+ ultrarealistic A futuristic sports car parked in an ancient Roman forum, 8K render
768
+ vibrant A steampunk airship sailing above Victorian London at sunrise
769
+ vibrant A painting of a fox in the style of Van Gogh
770
+ dreamlike A futuristic sports car parked in an ancient Roman forum, 8K render
771
+ dreamlike A microscopic close‑up of a snowflake shaped like a cathedral
772
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
773
+ dreamlike A cyberpunk cityscape at night, rain‑soaked streets and neon signs
774
+ dreamlike A cat made of galaxies, digital art
775
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
776
+ vibrant A cat made of galaxies, digital art
777
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
778
+ hyperdetailed An ice cream cone melting into a desert landscape, surrealism
779
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
780
+ hyperdetailed A painting of a fox in the style of Van Gogh
781
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
782
+ high‑contrast A microscopic close‑up of a snowflake shaped like a cathedral
783
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
784
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
785
+ vibrant A photorealistic image of an astronaut riding a horse on Mars
786
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
787
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
788
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
789
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
790
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
791
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
792
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
793
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
794
+ dreamlike A bowl of soup that looks like a monster knitted out of wool
795
+ vibrant A steampunk airship sailing above Victorian London at sunrise
796
+ cinematic A cat made of galaxies, digital art
797
+ dreamlike A microscopic close‑up of a snowflake shaped like a cathedral
798
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
799
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
800
+ cinematic A majestic lion wearing a royal crown, oil on canvas
configs/captions/example_prompts5.txt ADDED
@@ -0,0 +1,100 @@
1
+ bonsai dragon coiled cup, dramatic top‑down flow, rim lighting, pastel watercolor
2
+ robot barista alley, portrait orientation emphasizing height, long‑exposure light trails, lo‑fi pixel art
3
+ time portal ancient oak, negative space upper third, long‑exposure light trails, sepia ink
4
+ glass terrarium storm, stacked narrative layers, volumetric god‑rays, hyperdetailed CGI
5
+ rain‑soaked neon street, dramatic top‑down flow, rim lighting, photoreal 8K
6
+ lantern festival bamboo, elongated frame guiding eye upward, mist drifting mid‑scene, sepia ink
7
+ library tower floating ladders, portrait orientation emphasizing height, soft bokeh foreground, photoreal 8K
8
+ towering waterfall carving rainbow mist, portrait orientation emphasizing height, mist drifting mid‑scene, gouache storybook
9
+ towering waterfall carving rainbow mist, stacked narrative layers, mist drifting mid‑scene, sepia ink
10
+ rain‑soaked neon street, negative space upper third, volumetric god‑rays, gouache storybook
11
+ phoenix of origami flame, stacked narrative layers, long‑exposure light trails, gouache storybook
12
+ phoenix of origami flame, elongated frame guiding eye upward, rain‑kissed reflections, sepia ink
13
+ lantern festival bamboo, elongated frame guiding eye upward, volumetric god‑rays, hyperdetailed CGI
14
+ sakura tree train car, negative space upper third, volumetric god‑rays, neon vaporwave
15
+ mirror lake inverted castle, portrait orientation emphasizing height, mist drifting mid‑scene, Baroque oil
16
+ frozen wave unveiling city, stacked narrative layers, dynamic lens flare, sepia ink
17
+ mirror lake inverted castle, elongated frame guiding eye upward, dynamic lens flare, lo‑fi pixel art
18
+ frozen wave unveiling city, stacked narrative layers, dynamic lens flare, photoreal 8K
19
+ time portal ancient oak, towering vertical composition, mist drifting mid‑scene, hyperdetailed CGI
20
+ rain‑soaked neon street, negative space upper third, rain‑kissed reflections, Baroque oil
21
+ sakura tree train car, negative space upper third, rim lighting, Baroque oil
22
+ steampunk aviator on skybridge, stacked narrative layers, mist drifting mid‑scene, gouache storybook
23
+ glass terrarium storm, towering vertical composition, dynamic lens flare, hyperdetailed CGI
24
+ celestial whale in clouds, stacked narrative layers, mist drifting mid‑scene, photoreal 8K
25
+ time portal ancient oak, portrait orientation emphasizing height, soft bokeh foreground, ultrawide cinema
26
+ library tower floating ladders, dramatic top‑down flow, long‑exposure light trails, neon vaporwave
27
+ lighthouse in aurora night, dramatic top‑down flow, rain‑kissed reflections, photoreal 8K
28
+ spiral redwood staircase, dramatic top‑down flow, rim lighting, gouache storybook
29
+ phoenix of origami flame, stacked narrative layers, dynamic lens flare, Baroque oil
30
+ library tower floating ladders, elongated frame guiding eye upward, long‑exposure light trails, voxel isometric
31
+ sakura tree train car, dramatic top‑down flow, mist drifting mid‑scene, photoreal 8K
32
+ subway car drifting space, negative space upper third, dynamic lens flare, neon vaporwave
33
+ frozen wave unveiling city, dramatic top‑down flow, rain‑kissed reflections, hyperdetailed CGI
34
+ lone samurai beneath eclipse, stacked narrative layers, rim lighting, Baroque oil
35
+ library tower floating ladders, dramatic top‑down flow, rain‑kissed reflections, gouache storybook
36
+ rain‑soaked neon street, stacked narrative layers, rim lighting, Baroque oil
37
+ rain‑soaked neon street, elongated frame guiding eye upward, volumetric god‑rays, gouache storybook
38
+ sakura tree train car, dramatic top‑down flow, rim lighting, ultrawide cinema
39
+ clock tower glowing vines, stacked narrative layers, dynamic lens flare, photoreal 8K
40
+ phoenix of origami flame, negative space upper third, rain‑kissed reflections, gouache storybook
41
+ library tower floating ladders, towering vertical composition, rim lighting, pastel watercolor
42
+ sakura tree train car, elongated frame guiding eye upward, volumetric god‑rays, pastel watercolor
43
+ steampunk aviator on skybridge, portrait orientation emphasizing height, long‑exposure light trails, photoreal 8K
44
+ robot barista alley, towering vertical composition, soft bokeh foreground, pastel watercolor
45
+ paper sailboat cloud sea, towering vertical composition, soft bokeh foreground, voxel isometric
46
+ steampunk aviator on skybridge, towering vertical composition, rain‑kissed reflections, voxel isometric
47
+ frozen wave unveiling city, towering vertical composition, mist drifting mid‑scene, voxel isometric
48
+ mirror lake inverted castle, stacked narrative layers, mist drifting mid‑scene, Baroque oil
49
+ time portal ancient oak, stacked narrative layers, volumetric god‑rays, photoreal 8K
50
+ mirror lake inverted castle, stacked narrative layers, soft bokeh foreground, hyperdetailed CGI
51
+ spiral redwood staircase, elongated frame guiding eye upward, volumetric god‑rays, voxel isometric
52
+ frozen wave unveiling city, towering vertical composition, dynamic lens flare, hyperdetailed CGI
53
+ towering waterfall carving rainbow mist, towering vertical composition, volumetric god‑rays, hyperdetailed CGI
54
+ bonsai dragon coiled cup, portrait orientation emphasizing height, mist drifting mid‑scene, neon vaporwave
55
+ lantern festival bamboo, negative space upper third, dynamic lens flare, hyperdetailed CGI
56
+ sakura tree train car, towering vertical composition, mist drifting mid‑scene, gouache storybook
57
+ lantern festival bamboo, towering vertical composition, dynamic lens flare, photoreal 8K
58
+ subway car drifting space, negative space upper third, dynamic lens flare, Baroque oil
59
+ rain‑soaked neon street, portrait orientation emphasizing height, mist drifting mid‑scene, voxel isometric
60
+ bonsai dragon coiled cup, towering vertical composition, volumetric god‑rays, voxel isometric
61
+ steampunk aviator on skybridge, portrait orientation emphasizing height, soft bokeh foreground, voxel isometric
62
+ lighthouse in aurora night, stacked narrative layers, dynamic lens flare, voxel isometric
63
+ bonsai dragon coiled cup, portrait orientation emphasizing height, dynamic lens flare, photoreal 8K
64
+ library tower floating ladders, towering vertical composition, dynamic lens flare, lo‑fi pixel art
65
+ lighthouse in aurora night, negative space upper third, dynamic lens flare, gouache storybook
66
+ lone samurai beneath eclipse, stacked narrative layers, rain‑kissed reflections, Baroque oil
67
+ glass terrarium storm, towering vertical composition, dynamic lens flare, hyperdetailed CGI
68
+ library tower floating ladders, towering vertical composition, long‑exposure light trails, pastel watercolor
69
+ subway car drifting space, elongated frame guiding eye upward, dynamic lens flare, lo‑fi pixel art
70
+ towering waterfall carving rainbow mist, elongated frame guiding eye upward, dynamic lens flare, hyperdetailed CGI
71
+ mirror lake inverted castle, portrait orientation emphasizing height, rain‑kissed reflections, sepia ink
72
+ clock tower glowing vines, dramatic top‑down flow, rim lighting, neon vaporwave
73
+ clock tower glowing vines, stacked narrative layers, rain‑kissed reflections, voxel isometric
74
+ glass terrarium storm, stacked narrative layers, dynamic lens flare, Baroque oil
75
+ sakura tree train car, stacked narrative layers, rim lighting, gouache storybook
76
+ phoenix of origami flame, portrait orientation emphasizing height, mist drifting mid‑scene, photoreal 8K
77
+ steampunk aviator on skybridge, towering vertical composition, rain‑kissed reflections, neon vaporwave
78
+ sakura tree train car, towering vertical composition, soft bokeh foreground, Baroque oil
79
+ paper sailboat cloud sea, dramatic top‑down flow, rim lighting, voxel isometric
80
+ celestial whale in clouds, elongated frame guiding eye upward, soft bokeh foreground, pastel watercolor
81
+ mirror lake inverted castle, towering vertical composition, rim lighting, ultrawide cinema
82
+ time portal ancient oak, towering vertical composition, dynamic lens flare, gouache storybook
83
+ glass terrarium storm, elongated frame guiding eye upward, volumetric god‑rays, hyperdetailed CGI
84
+ steampunk aviator on skybridge, stacked narrative layers, rim lighting, neon vaporwave
85
+ bonsai dragon coiled cup, elongated frame guiding eye upward, volumetric god‑rays, ultrawide cinema
86
+ library tower floating ladders, stacked narrative layers, volumetric god‑rays, ultrawide cinema
87
+ sakura tree train car, towering vertical composition, dynamic lens flare, pastel watercolor
88
+ robot barista alley, elongated frame guiding eye upward, soft bokeh foreground, ultrawide cinema
89
+ sakura tree train car, portrait orientation emphasizing height, long‑exposure light trails, voxel isometric
90
+ glass terrarium storm, portrait orientation emphasizing height, rim lighting, voxel isometric
91
+ towering waterfall carving rainbow mist, dramatic top‑down flow, mist drifting mid‑scene, photoreal 8K
92
+ steampunk aviator on skybridge, stacked narrative layers, volumetric god‑rays, photoreal 8K
93
+ spiral redwood staircase, towering vertical composition, rim lighting, ultrawide cinema
94
+ rain‑soaked neon street, towering vertical composition, volumetric god‑rays, lo‑fi pixel art
95
+ lighthouse in aurora night, negative space upper third, rain‑kissed reflections, neon vaporwave
96
+ glass terrarium storm, stacked narrative layers, long‑exposure light trails, gouache storybook
97
+ mirror lake inverted castle, dramatic top‑down flow, soft bokeh foreground, lo‑fi pixel art
98
+ clock tower glowing vines, portrait orientation emphasizing height, rain‑kissed reflections, Baroque oil
99
+ lone samurai beneath eclipse, portrait orientation emphasizing height, volumetric god‑rays, voxel isometric
100
+ paper sailboat cloud sea, elongated frame guiding eye upward, long‑exposure light trails, hyperdetailed CGI
configs/captions/example_prompts6.txt ADDED
@@ -0,0 +1,250 @@
1
+ "digital art of a beautiful tiger pokemon under an apple tree, cartoon style,Matte Painting,Magic Realism,Bright colors,hyper quality,high detail,high resolution, --video --s 750 --v 6.0 --ar 1:2"
2
+ In the image, a corgi dog is wearing a straw hat and is laying on a fluffy rug. The dog's tongue is sticking out and it appears to be happy. There are two pumpkins and a basket of leaves nearby, indicating that the scene takes place during the fall season. The background features a Christmas tree, further suggesting the holiday atmosphere. The image has a warm and cozy feel to it, with the dog looking adorable in its hat and the pumpkins adding a festive touch.
3
+ An Arctic scene featuring a polar bear and her cubs walking across ice floes under the northern lights, the sky illuminated with vibrant colors of green and purple, stars twinkling above, reflections on the icy water, hyper-realistic, high resolution.
4
+ A daisy flower made entirely of origami paper, placed against a minimalist background, showcasing the folds and craftsmanship, high-resolution, studio lighting.
5
+ A rustic kitchen table set with freshly baked bread, an assortment of cheeses, a bowl of ripe fruit, and a bouquet of lavender, sunlight streaming through a nearby window casting soft shadows, detailed still life painting with warm tones and textures.
6
+ A panda eating bamboo in a lush green forest, with soft sunlight filtering through the leaves, realistic painting.
7
+ A close-up of a dewdrop-covered spider web glistening in the morning light, intricate patterns, macro photography.
8
+ An assortment of colorful gemstones scattered on a reflective surface, each facet catching the light, macro photography.
9
+ A red fox curled up asleep in a snowy woodland clearing, with delicate snowflakes falling gently around it, watercolor style.
10
+ A futuristic robotic dragon made of metallic scales and glowing blue eyes, perched on a rocky cliff, digital art, high resolution.
11
+ A traditional Japanese pagoda nestled among cherry blossom trees in full bloom, petals gently falling, a serene pond reflecting the structure, mountains in the background, watercolor style, high resolution.
12
+ A detailed macro photograph of a honeybee collecting pollen from a sunflower, with the texture of the petals and the bee's wings clearly visible, tiny particles of pollen floating in the air, background softly blurred to emphasize the subject, high-definition, natural lighting.
13
+ A close-up of a beautifully crafted violin resting on a sheet of classical music, with light reflecting off its polished wooden surface, musical notes seeming to float off the page, and a single red rose lying beside it, artistic illustration with warm tones.
14
+ A glass terrarium containing a miniature rainforest ecosystem, complete with tiny waterfalls, exotic plants, small animals like frogs and butterflies, the glass reflecting light from a nearby window, droplets of condensation visible on the inside, photorealistic rendering.
15
+ A pair of dolphins leaping out of the ocean at sunset, with splashes of water frozen in mid-air, vibrant colors, hyper-realistic.
16
+ A majestic eagle soaring above snow-capped mountains, with wings spread wide against a clear blue sky, realistic painting.
17
+ An elegant glass vase filled with blooming cherry blossoms, placed on a minimalist wooden table, soft natural lighting, photorealistic.
18
+ An ancient, majestic tree in the heart of an enchanted forest, its luminescent leaves glowing in shades of blue and purple under a starry night sky, surrounded by floating wisps of light, digital art, high resolution.
19
+ A tranquil alpine lake surrounded by snow-capped mountains, with the aurora borealis dancing across the night sky and its vibrant colors reflected in the still waters below, ultra-high-definition.
20
+ A giant tortoise slowly making its way across a misty meadow at dawn, with dew-covered grass and wildflowers in soft pastel colors, mountains in the background shrouded in fog, a few butterflies fluttering nearby, photorealistic, high resolution.
21
+ A fantastical airship sailing through the clouds above a steampunk city, with gears and propellers visible, dirigibles floating nearby, intricate architectural details on the buildings below, sunset sky with shades of pink and orange, digital art, high resolution.
22
+ An African savannah scene featuring a herd of elephants walking towards a watering hole under a vibrant sunset sky, acacia trees silhouetted against the horizon, and distant silhouettes of giraffes grazing, cinematic lighting, ultra-wide shot.
23
+ An antique compass lying on an old map, with a magnifying glass revealing detailed cartography, warm sepia tones, vintage style.
24
+ A close-up of a vintage pocket watch with intricate gears visible through a transparent face, steampunk style, highly detailed illustration.
25
+ A majestic Bengal tiger walking through a dense jungle with sunlight filtering through the canopy, its orange and black stripes contrasting vividly against the lush green foliage, birds perched on nearby branches, and exotic flowers blooming around, highly detailed digital painting, ultra-high-definition.
26
+ A serene Zen garden with carefully raked sand patterns, smooth stones arranged thoughtfully, a small bonsai tree at the center, surrounded by bamboo fencing, soft lantern light illuminating the scene at dusk, a gentle stream flowing nearby, minimalist style, photorealistic.
27
+ A fantasy-themed portrait of a female elf with golden hair and violet eyes, her attire shimmering with iridescent colors, set in an enchanted forest. 8K, best quality, fine details.
28
+ pumpkins, autumn sunset in the old village, cobblestone houses, streets, plants, flowers, entrance, realistic, stunningly beautiful
29
+ "Highly detailed mysterious egyptian (sphynx cat), skindentation:1.2, bright eyes, ancient egypt pyramid background, photorealistic, (hyper-realistic:1.2), cinematic, masterpiece:1.1, cinematic lighting"
30
+ "vw bus, canvas art, abstract art printing, in the style of brian mashburn, light red and light brown, theo prins, charming character illustrations, pierre pellegrini, vintage cut-and-paste, rusty debris --ar 73:92 --stylize 750 --v 6"
31
+ painterly style, seductive female League of legends Jinx character fighting at war, raging, crazy smile, crazy eyes, rocket lancher, guns, crazy face expression, character design, body is adorned with glowing golden runes, intense green aura around her, body dynamic epic action pose, intricate, highly detailed, epic and dynamic composition, dynamic angle, intricate details, multicolor explosion, blur effect, sharp focus, uhd, hdr, colorful shot, stormy weather, tons of flying debris around her, dark city background, modifier=CarnageStyle, color=blood_red, intensity=1.6
32
+ A charismatic chef in a bustling kitchen, his apron dusted with flour, smiling as he presents a beautifully prepared dish. 8K, hyper-realistic, cinematic, post-production.
33
+ A young adventurer with tousled hair and bright eyes, wearing a leather jacket and a backpack, ready to explore distant lands. 8K, hyper-realistic, cinematic, post-production.
34
+ "A watercolor painting of a vibrant flower field in spring, with a rainbow of blossoms under a bright blue sky. 8K, best quality, fine details.",
35
+ "digital art of a beautiful tiger pokemon under an apple tree, cartoon style,Matte Painting,Magic Realism,Bright colors,hyper quality,high detail,high resolution, --video --s 750 --v 6.0 --ar 1:2"
36
+ "painterly style, Goku fighting at war, raging, blue hair, character design, body is adorned with glowing golden runes, yellow aura around him, body dynamic epic action pose, intricate, highly detailed, epic and dynamic composition, dynamic angle, intricate details, multicolor explosion, blur effect, sharp focus, uhd, hdr, colorful shot, stormy weather, tons of flying debris around him, dark city background, modifier=CarnageStyle, color=blood_red, intensity=1.6"
37
+ A stunning steampunk city with towering skyscrapers and intricate clockwork mechanisms, gears and pistons move in a complex symphony, steam billows from chimneys, airships navigate the bustling skylanes, a vibrant metropolis
38
+ "Samurai looks at the enemy, stands after the battle, fear and horror on his face, tired and beaten, sand on his face mixed with sweat, an atmosphere of darkness and horror, hyper realistic photo, In post - production, enhance the details, sharpness, and contrast to achieve the hyper - realistic effect"
39
+ A portrait of an elemental entity with strong rim lighting and intricate details, painted digitally by Alvaro Castagnet, Peter Mohrbacher, and Dan Mumford
40
+ "A regal female portrait with an ornate headdress decorated with colorful gemstones and feathers, her robes rich with intricate designs and bright hues. 8K, best quality, fine details.",
41
+ "A detailed painting of Atlantis by multiple artists, featuring intricate detailing and vibrant colors.",
42
+ "A landscape featuring mountains, a valley, sunset light, wildlife and a gorilla, reminiscent of Bob Ross's artwork."
43
+ a space elevator, cinematic scifi art
44
+ a hole in the floor of my bathroom with small gremlins living in it
45
+ an origami pig on fire in the middle of a dark room with a pentagram on the floor
46
+ a small office made out of car parts
47
+ heat death of the universe, line art
48
+ A car made out of vegetables.
49
+ A cheeseburger surfing the vibe wave at night
50
+ An entire universe inside a bottle
51
+ A bioluminescent rainforest at night, viewed from a canopy walkway, hyper-real, crisp moonlight filtering through mist
52
+ Cross-section of an imaginary geode revealing swirling nebula-like mineral layers, macro photography style
53
+ Futuristic library carved into a glacier, warm interior lighting contrasting icy blue walls, isometric view
54
+ Surreal desert with floating sandstone monoliths casting long shadows at golden hour, ultra-wide lens
55
+ Vintage watercolor map of an archipelago shaped like musical notes, illustrated cartography
56
+ Cyberpunk alley drenched in neon rain, reflective puddles, no characters, cinematic atmosphere
57
+ Close-up of a hummingbird made of fractal glass shards hovering near a sapphire flower, 8K detail
58
+ Orbiting observatory above a gas-giant planet, rings stretching across star-filled sky, photoreal
59
+ Abstract kinetic sculpture of twisting ribbons suspended in a white cube gallery, studio lighting
60
+ Fog-covered pine forest with a single crimson tree in the center, muted color palette
61
+ Time-lapse style composite of a tidal pool from dawn to dusk, stitched into one frame
62
+ Isometric diagram of an autonomous greenhouse on Mars, annotated schematics aesthetic
63
+ Paper-cut illustration of a city inside a whale, layered depth, soft muted tones
64
+ Steampunk airship port at sunrise, brass machinery glinting, painterly brushwork
65
+ Minimalist ink wash painting of a solitary mountain peak emerging from clouds
66
+ Ultraviolet microscope image of an invented pollen grain with crystalline spikes
67
+ Retro 8-bit pixel art scene of a cozy lakeside cabin under meteor shower
68
+ Low-poly 3-D render of a coral reef teeming with geometric fish shapes
69
+ Aerial view of a terraced rice field arranged in a perfect Fibonacci spiral
70
+ Schematic cutaway of a clockwork heart pumping luminous liquid, technical drawing style
71
+ Long-exposure night photograph of fireflies tracing mathematical Lissajous curves
72
+ Gothic cathedral interior built entirely from translucent ice, soft subsurface scattering
73
+ Top-down macro of latte foam forming a fractal coastline pattern
74
+ Astronomical illustration of a triple-sunset over an ocean on an exoplanet
75
+ Ink-on-parchment concept art of a floating pagoda tethered by chains to mountain peaks
76
+ Cubist still life of fruit and musical instruments, vivid complementary colors
77
+ Moody black-and-white film photograph of rain on a lonely train platform, 1950s era
78
+ Hyperreal chrome koi fish swimming through clouds, sky as water
79
+ Floral mandala assembled from autumn leaves, top-down symmetric composition
80
+ Concept art of an underground crystal cavern illuminated by bioluminescent fungi
81
+ Sci-fi control room with holographic interfaces projected into fog, teal-orange palette
82
+ Minimal claymation style landscape with rolling pastel hills and giant daisies
83
+ Polaroid aesthetic photo of a roadside diner at twilight, neon sign flickering
84
+ Vector infographic showing the life cycle of a fictional winged seed, flat design
85
+ Dream-like seascape where waves morph into galloping horses, double-exposure effect
86
+ Art-deco poster of an interstellar passenger train speeding past moons
87
+ Cross-section illustration of a layered cake that resembles planetary strata
88
+ Infrared photograph of a mangrove swamp, foliage appearing white, water inky black
89
+ Whimsical pencil sketch of a tea party with levitating porcelain, soft shading
90
+ Architectural render of a zero-gravity museum with exhibits floating mid-air
91
+ Oil painting of a stormy sky splitting into vortices shaped like musical clefs
92
+ Isometric cutaway of an underground dwarf forge with molten rivers, game concept art
93
+ Frosted glass terrarium containing a miniature thunderstorm, studio backdrop
94
+ Minimalist cyanotype print of fern leaves arranged in a golden ratio spiral
95
+ Fantasy moonlit waterfall cascading upward into the sky, long-exposure feel
96
+ Retro-futuristic poster of a solar-powered desert rover kicking up red dust
97
+ Double helix made of blooming flowers against a white background, high-key macro
98
+ Top-down shot of a labyrinth garden trimmed into Escher-like impossible geometry
99
+ Sci-fi vending machine selling bottled starlight, hologram price tags
100
+ Watercolor portrait of an abstract humanoid with translucent skin revealing galaxies
101
+ Silhouette of a lone tree on an island reflected perfectly in still water, dusk gradient
102
+ Close-up macro of snowflakes arranged to form a Mandelbrot set
103
+ Ink drawing of a koi pond where fish tails morph into swirling calligraphy strokes
104
+ Hyperreal food photography of a floating stack of pancakes with gravity-defying syrup
105
+ Electroluminescent circuit board cityscape at night, streets as glowing traces
106
+ Surreal scene of books sprouting wings and migrating across a sunset sky
107
+ Low-angle view of a colossal sandstone arch framing a star-filled Milky Way
108
+ Cross-section of a mechanical sunflower tracking a miniature artificial sun
109
+ Art-nouveau travel poster for an imaginary cloud kingdom, flowing line art
110
+ Graph-paper style blueprint of a perpetual-motion water wheel, annotated
111
+ Futuristic zen garden with levitating raked sand and floating bonsai stones
112
+ Photoreal underwater city with glass domes linked by glowing tunnels
113
+ Tilt-shift photo of a festival lantern parade through narrow cobblestone streets
114
+ Neon wireframe landscape reminiscent of 1980s synthwave, grid fading to horizon
115
+ Paper-quilling style illustration of a comet bursting into colorful spirals
116
+ Panorama of a crimson aurora over icy mountains, ultra-wide 16:9 aspect
117
+ Transparent holographic chess set floating in zero-gravity, pieces mid-game
118
+ Pointillist painting of a bustling open-air market under summer sun
119
+ Infrared thermal view of a volcanic eruption, palette mapped to rainbow hues
120
+ Detail shot of clock gears where each tooth is a tiny stairway with lanterns
121
+ Minimal line-art poster depicting the evolution of flight from feathers to starships
122
+ Glowing jellyfish drifting through a misty pine forest at dawn, photoreal composite
123
+ Art-studio workbench cluttered with vintage robotics schematics and metal parts
124
+ Monochrome charcoal drawing of a lighthouse beam piercing heavy fog
125
+ Isometric voxel art of a floating garden island with waterfalls spilling into void
126
+ Surreal split-scene: left half winter forest, right half summer meadow, seamless blend
127
+ Retro postage stamp design celebrating a fictional eclipse festival
128
+ Hyperdetailed ceramic mosaic of a phoenix rising, mediterranean style
129
+ Sci-fi medical lab growing crystalline plants in suspended nutrient orbs
130
+ High-speed photo of colored ink clouds colliding underwater, symmetrical composition
131
+ Anamorphic street art illusion of a chasm opening in a city square
132
+ Timber-frame hobbit-style cottage under giant sunflowers, golden afternoon
133
+ Futuristic monorail weaving through skyscrapers wrapped in vertical gardens
134
+ Scientific render of a transparent hypercube containing swirling plasma
135
+ Sepia photograph of an abandoned observatory overtaken by vines
136
+ Concept piece: biomechanical dragon skeleton displayed in a museum hall
137
+ Minimal gradient poster of a single droplet rippling concentric neon rings
138
+ Chalkboard schematic showing stages of a do-it-yourself constellation projector
139
+ Digital glitch art of a city skyline melting into cascading pixels
140
+ Aerial drone shot of rice paddies shaped like circuitry pathways
141
+ Macro of soap film displaying shifting rainbow interference patterns
142
+ Oil-on-canvas seascape where waves are brush strokes of pure geometry
143
+ Tilted perspective of a spiral staircase made entirely of stained glass
144
+ Hyperreal 3-D render of a desert mirage city shimmering above dunes
145
+ Vectorized infographic of wind turbine anatomy with exploded components
146
+ Snow-covered bamboo forest under lantern light, gentle falling flakes
147
+ Abstract generative art of golden particles forming a torus knot in black void
148
+ Stop-motion clay diorama of a miniature volcano erupting sprinkles
149
+ Ultrawide cinematic shot of two converging thunderstorms over open ocean
150
+ Graphite sketch of intertwined river deltas resembling tree roots, top-down view
151
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
152
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, modeled via procedural geometry emphasising topological elegance and material translucency. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
153
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, with meticulously ray‑traced reflections and subsurface scattering. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
154
+ Inspect a surreal mise‑en‑scène portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
155
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
156
+ Witness a hypnotic tableau displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
157
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, with meticulously ray‑traced reflections and subsurface scattering. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
158
+ In this speculative panorama depicting desert caravans navigating rivers of liquid glass during twin sunsets, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
159
+ Encounter a novel conceptual artwork portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
160
+ In this speculative panorama displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
161
+ Witness a hypnotic tableau portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
162
+ In this speculative panorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
163
+ Inspect a surreal mise‑en‑scène showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
164
+ Explore a visionary illustration capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
165
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
166
+ Inspect a surreal mise‑en‑scène displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
167
+ Survey an expansive environment portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
168
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
169
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
170
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, painted with thousand‑stroke impasto textures evoking tactile motion. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
171
+ Observe an intricate composition showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, modeled via procedural geometry emphasising topological elegance and material translucency. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
172
+ In this speculative panorama capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, modeled via procedural geometry emphasising topological elegance and material translucency. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
173
+ Behold a cinematic vignette displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
174
+ Observe an intricate composition displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
175
+ Consider an evocative dreamscape portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, in photorealistic 32‑bit colour depth. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
176
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, captured by a drone‑level perspective employing long‑exposure star trails. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
177
+ Observe an intricate composition portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
178
+ Witness a hypnotic tableau showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
179
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
180
+ Behold a cinematic vignette featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Spatial recursion visually articulates mathematical infinity within finite representational space.
181
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Spatial recursion visually articulates mathematical infinity within finite representational space.
182
+ Observe an intricate composition portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Spatial recursion visually articulates mathematical infinity within finite representational space.
183
+ Contemplate a hyper‑detailed diorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, modeled via procedural geometry emphasising topological elegance and material translucency. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
184
+ Survey an expansive environment depicting desert caravans navigating rivers of liquid glass during twin sunsets, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
185
+ Behold a cinematic vignette rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
186
+ Consider an evocative dreamscape displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
187
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
188
+ Witness a hypnotic tableau where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
189
+ Encounter a novel conceptual artwork capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
190
+ Contemplate a hyper‑detailed diorama displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, captured by a drone‑level perspective employing long‑exposure star trails. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
191
+ Behold a cinematic vignette where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, captured by a drone‑level perspective employing long‑exposure star trails. Spatial recursion visually articulates mathematical infinity within finite representational space.
192
+ Behold a cinematic vignette where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
193
+ Consider an evocative dreamscape showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, in photorealistic 32‑bit colour depth. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
194
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
195
+ Behold a cinematic vignette detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, modeled via procedural geometry emphasising topological elegance and material translucency. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
196
+ In this speculative panorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
197
+ Explore a visionary illustration where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
198
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, in photorealistic 32‑bit colour depth. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
199
+ Explore a visionary illustration displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
200
+ In this speculative panorama illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
201
+ Inspect a surreal mise‑en‑scène portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, painted with thousand‑stroke impasto textures evoking tactile motion. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
202
+ Encounter a novel conceptual artwork illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
203
+ Contemplate a hyper‑detailed diorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
204
+ Observe an intricate composition capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
205
+ Contemplate a hyper‑detailed diorama portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
206
+ Contemplate a hyper‑detailed diorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
207
+ Witness a hypnotic tableau featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, with meticulously ray‑traced reflections and subsurface scattering. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
208
+ Explore a visionary illustration portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
209
+ Explore a visionary illustration rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, painted with thousand‑stroke impasto textures evoking tactile motion. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
210
+ Explore a visionary illustration featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
211
+ Survey an expansive environment rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
212
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
213
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
214
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
215
+ Encounter a novel conceptual artwork featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
216
+ Consider an evocative dreamscape depicting desert caravans navigating rivers of liquid glass during twin sunsets, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
217
+ Consider an evocative dreamscape rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, captured by a drone‑level perspective employing long‑exposure star trails. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
218
+ Explore a visionary illustration capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
219
+ Contemplate a hyper‑detailed diorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
220
+ Behold a cinematic vignette rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
221
+ Encounter a novel conceptual artwork displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Spatial recursion visually articulates mathematical infinity within finite representational space.
222
+ In this speculative panorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
223
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
224
+ Observe an intricate composition detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
225
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, in photorealistic 32‑bit colour depth. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
226
+ Explore a visionary illustration depicting desert caravans navigating rivers of liquid glass during twin sunsets, modeled via procedural geometry emphasising topological elegance and material translucency. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
227
+ Contemplate a hyper‑detailed diorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
228
+ Behold a cinematic vignette featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
229
+ Inspect a surreal mise‑en‑scène capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
230
+ Contemplate a hyper‑detailed diorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
231
+ Consider an evocative dreamscape rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Spatial recursion visually articulates mathematical infinity within finite representational space.
232
+ In this speculative panorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
233
+ Witness a hypnotic tableau depicting desert caravans navigating rivers of liquid glass during twin sunsets, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
234
+ Consider an evocative dreamscape illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
235
+ Consider an evocative dreamscape featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
236
+ Survey an expansive environment illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
237
+ Explore a visionary illustration featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, with meticulously ray‑traced reflections and subsurface scattering. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
238
+ In this speculative panorama featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
239
+ Observe an intricate composition featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
240
+ Inspect a surreal mise‑en‑scène displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
241
+ Survey an expansive environment illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
242
+ Behold a cinematic vignette displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
243
+ Observe an intricate composition detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, captured by a drone‑level perspective employing long‑exposure star trails. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
244
+ Witness a hypnotic tableau displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
245
+ Witness a hypnotic tableau capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
246
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
247
+ Witness a hypnotic tableau showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
248
+ Encounter a novel conceptual artwork illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
249
+ Encounter a novel conceptual artwork capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, with meticulously ray‑traced reflections and subsurface scattering. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
250
+ In this speculative panorama capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
configs/captions/starflow_v.txt ADDED
@@ -0,0 +1 @@
1
+ Smooth aerial tracking shot orbiting an erupting volcano; turquoise crater lake, glowing lava vents, thick ash column rising; slow 180° orbit then forward glide, high detail
configs/starflow-v_7B_t2v_caus_480p.yaml ADDED
@@ -0,0 +1,51 @@
1
+ arguments:
2
+ - no_flip: 1
3
+ - fsdp: 1
4
+ - fsdp_text_encoder: 1
5
+ - img_size: 640
6
+ - secondary_img_size: 512
7
+ - txt_size: 256
8
+ - vid_size: '81:16'
9
+ - fps_cond: 1
10
+ - channel_size: 48
11
+ - patch_size: 1
12
+ - channels: 3072
13
+ - top_block_channels: 4096
14
+ - blocks: 6
15
+ - layers_per_block: 2 2 2 2 2 24
16
+ - noise_std: 0.5
17
+ - batch_size: 192
18
+ - secondary_batch_size: 1536
19
+ - secondary_ratio: 0.2
20
+ - lr: 5e-5
21
+ - min_lr: 1e-6
22
+ - nvp: 1
23
+ - rope: 1
24
+ - adaln: 0
25
+ - sos: 1
26
+ - seq_order: L2R
27
+ - pt_seq_len: 32
28
+ - wds: 1
29
+ - mix_aspect: 1
30
+ - use_softplus: 1
31
+ - cond_top_only: 1
32
+ - use_final_norm: 1
33
+ - learnable_self_denoiser: 1
34
+ - conditional_denoiser: 1
35
+ - denoiser_window: 10
36
+ - cond_noise_level: 1
37
+ - temporal_causal: 2
38
+ - shallow_block_local: 1
39
+ - gradient_checkpoint: 1
40
+ - gradient_checkpoint_mlp: 1
41
+ - vae: Wan-AI/Wan2.2-TI2V-5B-Diffusers:0.6
42
+ - finetuned_vae: none
43
+ - text: google/flan-t5-xl
44
+ - cfg: 2.5
45
+ - drop_label: 0.1
46
+ - drop_image: 0.25
47
+ - loss_scaling: 1
48
+ - grad_clip: 1
49
+ - grad_skip: 1
50
+ - sample_freq: 100000
51
+ - soft_clip: 4
configs/starflow_3B_t2i_256x256.yaml ADDED
@@ -0,0 +1,32 @@
1
+ arguments:
2
+ - no_flip: 1
3
+ - fsdp: 1
4
+ - fsdp_text_encoder: 1
5
+ - img_size: 256
6
+ - txt_size: 128
7
+ - channel_size: 4
8
+ - patch_size: 1
9
+ - channels: 3072
10
+ - blocks: 6
11
+ - layers_per_block: 2 2 2 2 2 24
12
+ - noise_std: 0.3
13
+ - batch_size: 1024
14
+ - lr: 6.4e-05
15
+ - min_lr: 1e-6
16
+ - nvp: 1
17
+ - rope: 1
18
+ - adaln: 0
19
+ - sos: 1
20
+ - seq_order: L2R
21
+ - wds: 1
22
+ - use_softplus: 1
23
+ - cond_top_only: 1
24
+ - use_final_norm: 1
25
+ - gradient_checkpoint: 0
26
+ - vae: stabilityai/sd-vae-ft-ema
27
+ - text: google/flan-t5-xl
28
+ - cfg: 2.5
29
+ - drop_label: 0.1
30
+ - sample_freq: 10
31
+ - soft_clip: 4
32
+ - latent_norm_regularization: 1e-4
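The two configs above store hyper-parameters as a list of single-key mappings under `arguments:`. Below is a minimal sketch of flattening such a file into a plain dict; it assumes PyYAML is available, and the `load_arguments` helper is hypothetical (the actual training entry point may parse these differently).

import yaml

def load_arguments(path):
    # Hypothetical helper: merge the list of single-key mappings into one dict.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    flat = {}
    for item in cfg["arguments"]:
        flat.update(item)
    return flat

args = load_arguments("configs/starflow_3B_t2i_256x256.yaml")
print(args["img_size"], args["vae"])  # 256 stabilityai/sd-vae-ft-ema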
misc/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import os
6
+
7
+
8
+ def get_local_rank():
9
+ if os.environ.get('IRISCTL_ROLE'):
10
+ import irisctl.api as irisctl
11
+ return irisctl.local_rank()
12
+ elif os.environ.get('MASTER_PORT'):
13
+ return int(os.environ['LOCAL_RANK'])
14
+ else:
15
+ return 0
16
+
17
+
18
+ def print(*args, **kwargs):
19
+ if get_local_rank() == 0:
20
+ import builtins
21
+ builtins.print(*args, **kwargs)
22
+
23
+
24
+ def xprint(string):
25
+ import builtins
26
+ local_rank = get_local_rank()
27
+ builtins.print(f'[Local Rank {local_rank}] {string}')
28
+
29
+
30
+ def dividable(x):
31
+ for i in range(int(x ** 0.5), 0, -1):
32
+ if x % i == 0:
33
+ return x // i
34
+ return x
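A brief, illustrative usage of the helpers above (the rank-0 `print` override, `xprint`, and `dividable`); not part of this commit:

from misc import print, xprint, dividable

print("loading checkpoint ...")   # emitted only on local rank 0
xprint("per-rank debug message")  # always emitted, prefixed with the local rank
print(dividable(12))              # 4: co-factor of the largest divisor of 12 not exceeding sqrt(12)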
misc/ae_losses.py ADDED
@@ -0,0 +1,330 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ from typing import Mapping, Text, Tuple
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torchvision import models
11
+ from einops import rearrange
12
+ from torch.cuda.amp import autocast
13
+ from .lpips import LPIPS
14
+ from .discriminator import NLayerDiscriminator, NLayer3DDiscriminator
15
+
16
+
17
+ _IMAGENET_MEAN = [0.485, 0.456, 0.406]
18
+ _IMAGENET_STD = [0.229, 0.224, 0.225]
19
+
20
+
21
+ def hinge_d_loss(logits_real: torch.Tensor, logits_fake: torch.Tensor) -> torch.Tensor:
22
+ """Hinge loss for discrminator.
23
+
24
+ This function is borrowed from
25
+ https://github.com/CompVis/taming-transformers/blob/master/taming/modules/losses/vqperceptual.py#L20
26
+ """
27
+ loss_real = torch.mean(F.relu(1.0 - logits_real))
28
+ loss_fake = torch.mean(F.relu(1.0 + logits_fake))
29
+ d_loss = 0.5 * (loss_real + loss_fake)
30
+ return d_loss
31
+
32
+
33
+ def compute_lecam_loss(
34
+ logits_real_mean: torch.Tensor,
35
+ logits_fake_mean: torch.Tensor,
36
+ ema_logits_real_mean: torch.Tensor,
37
+ ema_logits_fake_mean: torch.Tensor
38
+ ) -> torch.Tensor:
39
+ """Computes the LeCam loss for the given average real and fake logits.
40
+
41
+ Args:
42
+ logits_real_mean -> torch.Tensor: The average real logits.
43
+ logits_fake_mean -> torch.Tensor: The average fake logits.
44
+ ema_logits_real_mean -> torch.Tensor: The EMA of the average real logits.
45
+ ema_logits_fake_mean -> torch.Tensor: The EMA of the average fake logits.
46
+
47
+ Returns:
48
+ lecam_loss -> torch.Tensor: The LeCam loss.
49
+ """
50
+ lecam_loss = torch.mean(torch.pow(F.relu(logits_real_mean - ema_logits_fake_mean), 2))
51
+ lecam_loss += torch.mean(torch.pow(F.relu(ema_logits_real_mean - logits_fake_mean), 2))
52
+ return lecam_loss
53
+
54
+
55
+ class PerceptualLoss(torch.nn.Module):
56
+ def __init__(self, dist, model_name: str = "convnext_s"):
57
+ """Initializes the PerceptualLoss class.
58
+
59
+ Args:
60
+ model_name: A string, the name of the perceptual loss model to use.
61
+
62
+ Raise:
63
+ ValueError: If the model_name does not contain "lpips" or "convnext_s".
64
+ """
65
+ super().__init__()
66
+ if ("lpips" not in model_name) and (
67
+ "convnext_s" not in model_name):
68
+ raise ValueError(f"Unsupported Perceptual Loss model name {model_name}")
69
+ self.dist = dist
70
+ self.lpips = None
71
+ self.convnext = None
72
+ self.loss_weight_lpips = None
73
+ self.loss_weight_convnext = None
74
+
75
+ # Parsing the model name. We support names formatted as
76
+ # "lpips-convnext_s-{float_number}-{float_number}", where each
77
+ # {float_number} is the loss weight for the corresponding component.
78
+ # E.g., lpips-convnext_s-1.0-2.0 computes the perceptual loss
79
+ # using both lpips and convnext_s, and averages the final loss as
80
+ # (1.0 * loss(lpips) + 2.0 * loss(convnext_s)) / (1.0 + 2.0).
81
+ if "lpips" in model_name:
82
+ self.lpips = LPIPS(dist).eval()
83
+
84
+ if "convnext_s" in model_name:
85
+ self.convnext = models.convnext_small(weights=models.ConvNeXt_Small_Weights.IMAGENET1K_V1).eval()
86
+
87
+ if "lpips" in model_name and "convnext_s" in model_name:
88
+ loss_config = model_name.split('-')[-2:]
89
+ self.loss_weight_lpips, self.loss_weight_convnext = float(loss_config[0]), float(loss_config[1])
90
+ print(f"self.loss_weight_lpips, self.loss_weight_convnext: {self.loss_weight_lpips}, {self.loss_weight_convnext}")
91
+
92
+ self.register_buffer("imagenet_mean", torch.Tensor(_IMAGENET_MEAN)[None, :, None, None])
93
+ self.register_buffer("imagenet_std", torch.Tensor(_IMAGENET_STD)[None, :, None, None])
94
+
95
+ for param in self.parameters():
96
+ param.requires_grad = False
97
+
98
+ def forward(self, input: torch.Tensor, target: torch.Tensor):
99
+ """Computes the perceptual loss.
100
+
101
+ Args:
102
+ input: A tensor of shape (B, C, H, W), the input image. Normalized to [0, 1].
103
+ target: A tensor of shape (B, C, H, W), the target image. Normalized to [0, 1].
104
+
105
+ Returns:
106
+ A scalar tensor, the perceptual loss.
107
+ """
108
+ if input.dim() == 5:
109
+ # If the input is 5D, we assume it is a batch of videos.
110
+ # We will average the loss over the temporal dimension.
111
+ input = rearrange(input, "b t c h w -> (b t) c h w")
112
+ target = rearrange(target, "b t c h w -> (b t) c h w")
113
+
114
+ # Always in eval mode.
115
+ self.eval()
116
+ loss = 0.
117
+ num_losses = 0.
118
+ lpips_loss = 0.
119
+ convnext_loss = 0.
120
+ # Computes LPIPS loss, if available.
121
+ if self.lpips is not None:
122
+ lpips_loss = self.lpips(input, target)
123
+ if self.loss_weight_lpips is None:
124
+ loss += lpips_loss
125
+ num_losses += 1
126
+ else:
127
+ num_losses += self.loss_weight_lpips
128
+ loss += self.loss_weight_lpips * lpips_loss
129
+
130
+ if self.convnext is not None:
131
+ # Computes ConvNeXt-s loss, if available.
132
+ input = torch.nn.functional.interpolate(input, size=224, mode="bilinear", align_corners=False, antialias=True)
133
+ target = torch.nn.functional.interpolate(target, size=224, mode="bilinear", align_corners=False, antialias=True)
134
+ pred_input = self.convnext((input - self.imagenet_mean) / self.imagenet_std)
135
+ pred_target = self.convnext((target - self.imagenet_mean) / self.imagenet_std)
136
+ convnext_loss = torch.nn.functional.mse_loss(
137
+ pred_input,
138
+ pred_target,
139
+ reduction="mean")
140
+
141
+ if self.loss_weight_convnext is None:
142
+ num_losses += 1
143
+ loss += convnext_loss
144
+ else:
145
+ num_losses += self.loss_weight_convnext
146
+ loss += self.loss_weight_convnext * convnext_loss
147
+
148
+ # weighted avg.
149
+ loss = loss / num_losses
150
+ return loss
151
+
152
+
153
+ class WaveletLoss3D(torch.nn.Module):
154
+ def __init__(self):
155
+ super().__init__()
156
+
157
+ def forward(self, inputs, targets):
158
+ from torch_dwt.functional import dwt3
159
+ inputs, targets = inputs.float(), targets.float()
160
+ l1_loss = torch.abs(
161
+ dwt3(inputs.contiguous(), "haar") - dwt3(targets.contiguous(), "haar")
162
+ )
163
+
164
+ # Average over the number of wavelet filters, reducing the dimensions
165
+ l1_loss = torch.mean(l1_loss, dim=1)
166
+
167
+ # Average over all of the filter banks, keeping dimensions
168
+ l1_loss = torch.mean(l1_loss, dim=-1, keepdim=True)
169
+ l1_loss = torch.mean(l1_loss, dim=-2, keepdim=True)
170
+ l1_loss = torch.mean(l1_loss, dim=-3, keepdim=True)
171
+ return l1_loss
172
+
173
+
174
+ class ReconstructionLoss_Single_Stage(torch.nn.Module):
175
+ def __init__(self, dist, args):
176
+ """Initializes the losses module.
177
+
178
+ Args:
179
+ config: A dictionary, the configuration for the model and everything else.
180
+ """
181
+ super().__init__()
182
+ self.dist = dist
183
+ self.with_condition = False
184
+ self.quantize_mode = 'vae'
185
+ self.discriminator = NLayerDiscriminator(with_condition=False).eval() if not args.use_3d_disc else NLayer3DDiscriminator(with_condition=False).eval()
186
+ self.reconstruction_loss = "l2"
187
+ self.reconstruction_weight = 1.0
188
+ self.quantizer_weight = 1.0
189
+ self.perceptual_loss = PerceptualLoss(dist, "lpips-convnext_s-1.0-0.1").eval()
190
+ self.perceptual_weight = 1.1
191
+ self.discriminator_iter_start = 0
192
+ self.discriminator_factor = 1.0
193
+ self.discriminator_weight = 0.1
194
+ self.lecam_regularization_weight = 0.001
195
+ self.lecam_ema_decay = 0.999
196
+ self.kl_weight = 1e-6
197
+ self.wavelet_loss_weight = 0.5
198
+ self.wavelet_loss = WaveletLoss3D()
199
+ self.logvar = nn.Parameter(torch.ones(size=()) * 0.0, requires_grad=False)
200
+ if self.lecam_regularization_weight > 0.0:
201
+ self.register_buffer("ema_real_logits_mean", torch.zeros((1)))
202
+ self.register_buffer("ema_fake_logits_mean", torch.zeros((1)))
203
+
204
+ @torch.amp.autocast("cuda", enabled=False)
205
+ def forward(self,
206
+ inputs: torch.Tensor,
207
+ reconstructions: torch.Tensor,
208
+ extra_result_dict: Mapping[Text, torch.Tensor],
209
+ global_step: int,
210
+ mode: str = "generator",
211
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
212
+ # Both inputs and reconstructions are in range [0, 1].
213
+ inputs = inputs.float()
214
+ reconstructions = reconstructions.float()
215
+
216
+ if mode == "generator":
217
+ return self._forward_generator(inputs, reconstructions, extra_result_dict, global_step)
218
+ elif mode == "discriminator":
219
+ return self._forward_discriminator(inputs, reconstructions, extra_result_dict, global_step)
220
+ else:
221
+ raise ValueError(f"Unsupported mode {mode}")
222
+
223
+ def should_discriminator_be_trained(self, global_step : int):
224
+ return global_step >= self.discriminator_iter_start
225
+
226
+ def _forward_discriminator(self,
227
+ inputs: torch.Tensor,
228
+ reconstructions: torch.Tensor,
229
+ extra_result_dict: Mapping[Text, torch.Tensor],
230
+ global_step: int,
231
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
232
+ """Discrminator training step."""
233
+ discriminator_factor = self.discriminator_factor if self.should_discriminator_be_trained(global_step) else 0
234
+ loss_dict = {}
235
+ # Turn the gradients on.
236
+ for param in self.discriminator.parameters():
237
+ param.requires_grad = True
238
+
239
+ condition = extra_result_dict.get("condition", None) if self.with_condition else None
240
+ real_images = inputs.detach().requires_grad_(True)
241
+ logits_real = self.discriminator(real_images, condition)
242
+ logits_fake = self.discriminator(reconstructions.detach(), condition)
243
+
244
+ discriminator_loss = discriminator_factor * hinge_d_loss(logits_real=logits_real, logits_fake=logits_fake)
245
+
246
+ # optional lecam regularization
247
+ lecam_loss = torch.zeros((), device=inputs.device)
248
+ if self.lecam_regularization_weight > 0.0:
249
+ lecam_loss = compute_lecam_loss(
250
+ torch.mean(logits_real),
251
+ torch.mean(logits_fake),
252
+ self.ema_real_logits_mean,
253
+ self.ema_fake_logits_mean
254
+ ) * self.lecam_regularization_weight
255
+
256
+ self.ema_real_logits_mean = self.ema_real_logits_mean * self.lecam_ema_decay + torch.mean(logits_real).detach() * (1 - self.lecam_ema_decay)
257
+ self.ema_fake_logits_mean = self.ema_fake_logits_mean * self.lecam_ema_decay + torch.mean(logits_fake).detach() * (1 - self.lecam_ema_decay)
258
+
259
+ discriminator_loss += lecam_loss
260
+
261
+ loss_dict = dict(
262
+ discriminator_loss=discriminator_loss.detach(),
263
+ logits_real=logits_real.detach().mean(),
264
+ logits_fake=logits_fake.detach().mean(),
265
+ lecam_loss=lecam_loss.detach(),
266
+ )
267
+ return discriminator_loss, loss_dict
268
+
269
+ def _forward_generator(self,
270
+ inputs: torch.Tensor,
271
+ reconstructions: torch.Tensor,
272
+ extra_result_dict: Mapping[Text, torch.Tensor],
273
+ global_step: int
274
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
275
+ """Generator training step."""
276
+ inputs = inputs.contiguous()
277
+ reconstructions = reconstructions.contiguous()
278
+ if self.reconstruction_loss == "l1":
279
+ reconstruction_loss = F.l1_loss(inputs, reconstructions, reduction="mean")
280
+ elif self.reconstruction_loss == "l2":
281
+ reconstruction_loss = F.mse_loss(inputs, reconstructions, reduction="mean")
282
+ else:
283
+ raise ValueError(f"Unsuppored reconstruction_loss {self.reconstruction_loss}")
284
+ reconstruction_loss *= self.reconstruction_weight
285
+
286
+ # Compute wavelet loss.
287
+ if inputs.dim() == 5:
288
+ wavelet_loss = self.wavelet_loss(
289
+ inputs.permute(0,2,1,3,4), reconstructions.permute(0,2,1,3,4)).mean()
290
+ else:
291
+ wavelet_loss = 0
292
+
293
+ # Compute perceptual loss.
294
+ perceptual_loss = self.perceptual_loss(inputs, reconstructions).mean()
295
+
296
+ # Compute discriminator loss.
297
+ generator_loss = torch.zeros((), device=inputs.device)
298
+ discriminator_factor = self.discriminator_factor if self.should_discriminator_be_trained(global_step) else 0
299
+ d_weight = 1.0
300
+ if discriminator_factor > 0.0 and self.discriminator_weight > 0.0:
301
+ # Disable discriminator gradients.
302
+ for param in self.discriminator.parameters():
303
+ param.requires_grad = False
304
+ logits_fake = self.discriminator(reconstructions)
305
+ generator_loss = -torch.mean(logits_fake)
306
+
307
+ d_weight *= self.discriminator_weight
308
+
309
+ assert self.quantize_mode == "vae", "Only vae mode is supported for now"
310
+
311
+ # Scale the reconstruction loss by the (fixed) log-variance; no explicit KL term is added here.
312
+ reconstruction_loss = reconstruction_loss / torch.exp(self.logvar)
313
+ total_loss = (
314
+ reconstruction_loss
315
+ + self.perceptual_weight * perceptual_loss
316
+ + d_weight * discriminator_factor * generator_loss
317
+ + self.wavelet_loss_weight * wavelet_loss
318
+ )
319
+ loss_dict = dict(
320
+ total_loss=total_loss.clone().detach(),
321
+ reconstruction_loss=reconstruction_loss.detach(),
322
+ perceptual_loss=(self.perceptual_weight * perceptual_loss).detach(),
323
+ weighted_gan_loss=(d_weight * discriminator_factor * generator_loss).detach(),
324
+ discriminator_factor=torch.tensor(discriminator_factor),
325
+ d_weight=d_weight,
326
+ gan_loss=generator_loss.detach(),
327
+ wavelet_loss=(self.wavelet_loss_weight * wavelet_loss).detach(),
328
+ )
329
+ return total_loss, loss_dict
330
+
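A schematic sketch of how `ReconstructionLoss_Single_Stage` is typically alternated between its two modes during autoencoder training. The optimizers (`opt_ae`, `opt_disc`), the `dist`/`args` objects, and the data tensors are placeholders, not code from this commit:

# Hypothetical training step; both calls return (loss, stats_dict).
loss_fn = ReconstructionLoss_Single_Stage(dist, args).cuda()

# Generator / autoencoder update: reconstruction + perceptual + wavelet + GAN terms.
g_loss, g_stats = loss_fn(inputs, reconstructions, extra_result_dict={},
                          global_step=step, mode="generator")
g_loss.backward()
opt_ae.step(); opt_ae.zero_grad()

# Discriminator update: hinge loss plus optional LeCam regularization.
d_loss, d_stats = loss_fn(inputs, reconstructions.detach(), extra_result_dict={},
                          global_step=step, mode="discriminator")
d_loss.backward()
opt_disc.step(); opt_disc.zero_grad()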
misc/condition_utils.py ADDED
@@ -0,0 +1,218 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ ## camera
6
+
7
+ from pathlib import Path
8
+ import json
9
+ import re
10
+ import tarfile
11
+ from einops import rearrange
12
+ import torch
13
+ import numpy as np
14
+ from PIL import Image
15
+ import torchvision.transforms.functional as F
16
+ from torchvision import transforms
17
+ import math
18
+
19
+ def find_factors(n):
20
+ factors = set()
21
+ for i in range(1, int(math.sqrt(n)) + 1):
22
+ if n % i == 0:
23
+ factors.add(i)
24
+ factors.add(n // i)
25
+ return sorted(factors, reverse=True)
26
+
27
+ def find_max_scale_factor(A, B):
28
+ gcd = math.gcd(A, B)
29
+
30
+ factors = find_factors(gcd)
31
+
32
+ for factor in factors:
33
+ if A // factor >= 32 and B // factor >= 32 and abs(A-B)//factor % 2 ==0:
34
+ return factor
35
+
36
+ return 1
37
+
38
+ def _get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t=False, mask_idx=[0], project=False):
39
+ return np.concatenate([
40
+ get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t, idx, project)
41
+ for idx in mask_idx], -1)
42
+
43
+
44
+ def get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t=False, mask_idx=0, project=True):
45
+ """
46
+ intrinsic_parameters.shape = [f, 4] (normalized fx, fy, cx, cy per frame)
47
+ w2c_matrices.shape = [f, 4, 4] (world-to-camera matrices)
48
+ """
49
+
50
+ num_frames = intrinsic_parameters.shape[0]
51
+ c2w_matrices = np.linalg.inv(w2c_matrices)
52
+
53
+ if project:
54
+ w2c_cond_matrices = w2c_matrices[mask_idx: mask_idx+1]
55
+ c2w_matrices = w2c_cond_matrices @ c2w_matrices # relative pose to the first frame
56
+
57
+
58
+ if norm_t:
59
+ offset = c2w_matrices[:, :3, -1:] # f, 3, 1
60
+ offset = offset / (np.abs(offset).max(axis=(1, 2), keepdims=True) + 1e-7)
61
+ c2w_matrices[:, :3, -1:] = offset
62
+
63
+ ys, xs = np.meshgrid(
64
+ np.linspace(0, height - 1, height, dtype=c2w_matrices.dtype),
65
+ np.linspace(0, width - 1, width, dtype=c2w_matrices.dtype), indexing='ij')
66
+ ys = np.tile(ys.reshape([1, height * width]), [num_frames, 1]) +0.5
67
+ xs = np.tile(xs.reshape([1, height * width]), [num_frames, 1]) +0.5
68
+
69
+ fx, fy, cx, cy = np.split(intrinsic_parameters, 4, -1)
70
+ fx, fy, cx, cy = fx * width, fy * height, cx * width, cy * height
71
+
72
+ zs_cam = np.ones_like(xs)
73
+ xs_cam = (xs - cx) / fx * zs_cam
74
+ ys_cam = (ys - cy) / fy * zs_cam
75
+ directions = np.stack((xs_cam, ys_cam, zs_cam), -1)
76
+ directions = directions / np.linalg.norm(directions, axis=-1, keepdims=True)
77
+
78
+ ray_directions_w = (c2w_matrices[..., :3, :3] @ directions.transpose(0, 2, 1)).transpose(0, 2, 1)
79
+ ray_origin_w = np.expand_dims(c2w_matrices[..., :3, 3], axis=-2)
80
+ ray_origin_w = np.broadcast_to(ray_origin_w, ray_directions_w.shape)
81
+ ray_dxo = np.cross(ray_origin_w, ray_directions_w)
82
+ plucker_embedding = np.concatenate([ray_dxo, ray_directions_w], -1).reshape(num_frames, height, width, 6)
83
+
84
+ return plucker_embedding
85
+
86
+
87
+ def label_to_camera(label):
88
+ num_frames = label.shape[0]
89
+ bottom = np.zeros([num_frames, 1, 4])
90
+ bottom[:, :, -1] = 1
91
+
92
+ # [w, h, flx, fly] + camera_model[0] + camera_model[1] + camera_model[2] + camera_model[3]
93
+ w, h, fx, fy = label[:, 0:1], label[:, 1:2], label[:, 2:3], label[:, 3:4]
94
+ fx, fy = fx / w, fy / h
95
+ c2w = label[:, 4:].reshape(num_frames, 4, 4)
96
+ c2w[:, 2, :] *= -1
97
+ c2w = c2w[:, np.array([1, 0, 2, 3]), :]
98
+ c2w[:, 0:3, 1:3] *= -1
99
+ w2c = np.linalg.inv(c2w)
100
+ intrinsic = np.concatenate([fx, fy, np.ones_like(fx) * .5, np.ones_like(fx) * .5], 1)
101
+
102
+ return intrinsic, w2c
103
+
104
+
105
+ def get_camera_condition(tar, camera_file, width=960, height=544, factor=16, frame_inds=None):
106
+
107
+ try:
108
+ with tar.extractfile(camera_file) as cam_data:
109
+ camera_data = json.load(cam_data)
110
+
111
+ prefix = [camera_data['w'], camera_data['h'], camera_data['fl_x'], camera_data['fl_y']]
112
+
113
+ labels = []
114
+ if frame_inds is None:
115
+ frame_inds = list(range(len(camera_data['frames'])))
116
+ for ind in frame_inds:
117
+ frame_info = camera_data['frames'][ind]
118
+ label = prefix + sum(frame_info['transform_matrix'], [])
119
+ labels.append(label)
120
+
121
+ label = np.array(labels)
122
+ intrinsic, w2c = label_to_camera(label)
123
+ # factor = find_max_scale_factor(height, width)
124
+ H, W = height // factor, width // factor
125
+ ray_map = _get_plucker_embedding(intrinsic, w2c, H, W, norm_t=False, mask_idx=[0], project=True)
126
+ ray_map = torch.from_numpy(ray_map) #.permute(0, 3, 1, 2) # [f, h, w, c]
127
+ # ray_map = F.resize(transforms.CenterCrop(min(H, W))(ray_map), 32).permute(0, 2, 3, 1)
128
+ except Exception as e:
129
+ print(f'Reading data error {e} {camera_file}')
130
+ ray_map = torch.zeros((len(frame_inds) if frame_inds is not None else 0, height // factor, width // factor, 6))
131
+
132
+ return ray_map
133
+
134
+
135
+ ## force
136
+ def get_wind_condition(force, angle, min_force, max_force, num_frames=45, num_channels=3, height=480, width=720):
137
+
138
+ condition = torch.zeros((num_frames, num_channels, height, width))
139
+
140
+ # first channel gets wind_speed
141
+ condition[:, 0] = -1 + 2*(force-min_force)/(max_force-min_force)
142
+
143
+ # second channel gets cos(wind_angle)
144
+ condition[:, 1] = math.cos(angle * torch.pi / 180.0)
145
+
146
+ # third channel gets sin(wind_angle)
147
+ condition[:, 2] = math.sin(angle * torch.pi / 180.0)
148
+
149
+ return rearrange(condition, 'f c h w -> f h w c')
150
+
151
+
152
+ def get_gaussian_blob(x, y, radius=10, amplitude=1.0, shape=(3, 480, 720), device=None):
153
+ """
154
+ Create a tensor containing a Gaussian blob at the specified location.
155
+
156
+ Args:
157
+ x (int): x-coordinate of the blob center
158
+ y (int): y-coordinate of the blob center
159
+ radius (int, optional): Radius of the Gaussian blob. Defaults to 10.
160
+ amplitude (float, optional): Maximum intensity of the blob. Defaults to 1.0.
161
+ shape (tuple, optional): Shape of the output tensor (channels, height, width). Defaults to (3, 480, 720).
162
+ device (torch.device, optional): Device to create the tensor on. Defaults to None.
163
+
164
+ Returns:
165
+ torch.Tensor: Tensor of shape (channels, height, width) containing the Gaussian blob
166
+ """
167
+ num_channels, height, width = shape
168
+
169
+ # Create a new tensor filled with zeros
170
+ blob_tensor = torch.zeros(shape, device=device)
171
+
172
+ # Create coordinate grids
173
+ y_grid, x_grid = torch.meshgrid(
174
+ torch.arange(height, device=device),
175
+ torch.arange(width, device=device),
176
+ indexing='ij'
177
+ )
178
+
179
+ # Calculate squared distance from (x, y)
180
+ squared_dist = (x_grid - x) ** 2 + (y_grid - y) ** 2
181
+
182
+ # Create Gaussian blob using the squared distance
183
+ gaussian = amplitude * torch.exp(-squared_dist / (2.0 * radius ** 2))
184
+
185
+ # Add the Gaussian blob to all channels
186
+ for c in range(num_channels):
187
+ blob_tensor[c] = gaussian
188
+
189
+ return blob_tensor
190
+
191
+ def get_point_condition(force, angle, x_pos, y_pos, min_force, max_force, num_frames=45, num_channels=3, height=480, width=720):
192
+
193
+ condition = torch.zeros((num_frames, num_channels, height, width)) # (45, 3, 480, 720)
194
+
195
+ x_pos_start = x_pos*width
196
+ y_pos_start = (1-y_pos)*height
197
+
198
+ DISPLACEMENT_FOR_MAX_FORCE = width / 2
199
+ DISPLACEMENT_FOR_MIN_FORCE = width / 8
200
+
201
+ force_percent = (force - min_force) / (max_force - min_force)
202
+ total_displacement = DISPLACEMENT_FOR_MIN_FORCE + (DISPLACEMENT_FOR_MAX_FORCE - DISPLACEMENT_FOR_MIN_FORCE) * force_percent
203
+
204
+ x_pos_end = x_pos_start + total_displacement * math.cos(angle * torch.pi / 180.0)
205
+ y_pos_end = y_pos_start - total_displacement * math.sin(angle * torch.pi / 180.0)
206
+
207
+ for frame in range(num_frames):
208
+
209
+ t = frame / (num_frames-1)
210
+ x_pos_ = x_pos_start * (1-t) + x_pos_end * t # t = 0 --> start; t = 1 --> end
211
+ y_pos_ = y_pos_start * (1-t) + y_pos_end * t # t = 0 --> start; t = 1 --> end
212
+
213
+ blob_tensor = get_gaussian_blob(x=x_pos_, y=y_pos_, radius=20, amplitude=1.0, shape=(num_channels, height, width))
214
+
215
+ condition[frame] += blob_tensor
216
+
217
+ return rearrange(condition, 'f c h w -> f h w c')
218
+
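Illustrative calls of the force-conditioning helpers above (values are arbitrary; not part of this commit). Both return dense per-frame condition maps in (f, h, w, c) layout:

from misc.condition_utils import get_wind_condition, get_point_condition

wind = get_wind_condition(force=5.0, angle=45.0, min_force=0.0, max_force=10.0,
                          num_frames=45, num_channels=3, height=480, width=720)
print(wind.shape)   # torch.Size([45, 480, 720, 3])

point = get_point_condition(force=5.0, angle=90.0, x_pos=0.5, y_pos=0.5,
                            min_force=0.0, max_force=10.0)
print(point.shape)  # torch.Size([45, 480, 720, 3])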
misc/discriminator.py ADDED
@@ -0,0 +1,388 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import functools
6
+ import math
7
+ from typing import Tuple
8
+
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from einops import rearrange
15
+
16
+ # Conv2D with same padding
17
+ class Conv2dSame(nn.Conv2d):
18
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
19
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
20
+
21
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
22
+ ih, iw = x.size()[-2:]
23
+
24
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
25
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
26
+
27
+ if pad_h > 0 or pad_w > 0:
28
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
29
+ return super().forward(x)
30
+
31
+
32
+ class BlurBlock(torch.nn.Module):
33
+ def __init__(self,
34
+ kernel: Tuple[int] = (1, 3, 3, 1)
35
+ ):
36
+ super().__init__()
37
+
38
+ kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False)
39
+ kernel = kernel[None, :] * kernel[:, None]
40
+ kernel /= kernel.sum()
41
+ kernel = kernel.unsqueeze(0).unsqueeze(0)
42
+ self.register_buffer("kernel", kernel)
43
+
44
+ def calc_same_pad(self, i: int, k: int, s: int) -> int:
45
+ return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0)
46
+
47
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
48
+ ic, ih, iw = x.size()[-3:]
49
+ pad_h = self.calc_same_pad(i=ih, k=4, s=2)
50
+ pad_w = self.calc_same_pad(i=iw, k=4, s=2)
51
+ if pad_h > 0 or pad_w > 0:
52
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
53
+
54
+ weight = self.kernel.expand(ic, -1, -1, -1)
55
+
56
+ out = F.conv2d(input=x, weight=weight, stride=2, groups=x.shape[1])
57
+ return out
58
+
59
+
60
+ class SinusoidalTimeEmbedding(torch.nn.Module):
61
+ def __init__(self, embedding_dim: int):
62
+ super().__init__()
63
+ self.embedding_dim = embedding_dim
64
+ assert embedding_dim % 2 == 0, "embedding_dim must be even"
65
+
66
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
67
+ half_dim = self.embedding_dim // 2
68
+ embeddings = math.log(10000) / (half_dim - 1)
69
+ embeddings = torch.exp(torch.arange(half_dim, device=timesteps.device) * -embeddings)
70
+ embeddings = timesteps[:, None] * embeddings[None, :]
71
+ embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
72
+ return embeddings
73
+
74
+
75
+ class ModulatedConv2dSame(Conv2dSame):
76
+ def __init__(self, in_channels, out_channels, kernel_size, cond_channels=None):
77
+ super().__init__(in_channels, out_channels, kernel_size)
78
+ # FiLM modulation projections
79
+ if cond_channels is not None:
80
+ self.film_proj = torch.nn.Linear(cond_channels, 2 * out_channels)
81
+
82
+ # Initialize scale to 0 and bias to 0
83
+ torch.nn.init.zeros_(self.film_proj.weight)
84
+ torch.nn.init.zeros_(self.film_proj.bias)
85
+
86
+ def forward(self, x, temb=None):
87
+ x = super().forward(x)
88
+ if temb is not None:
89
+ scale, bias = self.film_proj(temb)[:, :, None, None].chunk(2, dim=1)
90
+ x = x * (scale + 1) + bias
91
+ return x
92
+
93
+
94
+ class NLayerDiscriminator(torch.nn.Module):
95
+ def __init__(
96
+ self,
97
+ num_channels: int = 3,
98
+ hidden_channels: int = 128,
99
+ num_stages: int = 3,
100
+ blur_resample: bool = True,
101
+ blur_kernel_size: int = 4,
102
+ with_condition: bool = False,
103
+ ):
104
+ """ Initializes the NLayerDiscriminator.
105
+
106
+ Args:
107
+ num_channels -> int: The number of input channels.
108
+ hidden_channels -> int: The number of hidden channels.
109
+ num_stages -> int: The number of stages.
110
+ blur_resample -> bool: Whether to use blur resampling.
111
+ blur_kernel_size -> int: The blur kernel size.
112
+ """
113
+ super().__init__()
114
+ assert num_stages > 0, "Discriminator cannot have 0 stages"
115
+ assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3, 5] when blur resampling"
116
+
117
+ in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages)))
118
+ init_kernel_size = 5
119
+ activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1)
120
+
121
+ self.with_condition = with_condition
122
+ if with_condition:
123
+ cond_channels = 768
124
+ self.time_emb = SinusoidalTimeEmbedding(128)
125
+ self.time_proj = torch.nn.Sequential(
126
+ torch.nn.Linear(128, cond_channels),
127
+ torch.nn.SiLU(),
128
+ torch.nn.Linear(cond_channels, cond_channels),
129
+ )
130
+ else:
131
+ cond_channels = None
132
+
133
+ self.block_in = torch.nn.Sequential(
134
+ Conv2dSame(
135
+ num_channels,
136
+ hidden_channels,
137
+ kernel_size=init_kernel_size
138
+ ),
139
+ activation(),
140
+ )
141
+
142
+ BLUR_KERNEL_MAP = {
143
+ 3: (1,2,1),
144
+ 4: (1,3,3,1),
145
+ 5: (1,4,6,4,1),
146
+ }
147
+
148
+ discriminator_blocks = []
149
+ for i_level in range(num_stages):
150
+ in_channels = hidden_channels * in_channel_mult[i_level]
151
+ out_channels = hidden_channels * in_channel_mult[i_level + 1]
152
+ conv_block = ModulatedConv2dSame(
153
+ in_channels,
154
+ out_channels,
155
+ kernel_size=3,
156
+ cond_channels=cond_channels
157
+ )
158
+ discriminator_blocks.append(conv_block)
159
+ down_block = torch.nn.Sequential(
160
+ torch.nn.AvgPool2d(kernel_size=2, stride=2) if not blur_resample else BlurBlock(BLUR_KERNEL_MAP[blur_kernel_size]),
161
+ torch.nn.GroupNorm(32, out_channels),
162
+ activation(),
163
+ )
164
+ discriminator_blocks.append(down_block)
165
+
166
+ self.blocks = torch.nn.ModuleList(discriminator_blocks)
167
+ self.pool = torch.nn.AdaptiveMaxPool2d((16, 16))
168
+ self.to_logits = torch.nn.Sequential(
169
+ Conv2dSame(out_channels, out_channels, 1),
170
+ activation(),
171
+ Conv2dSame(out_channels, 1, kernel_size=5)
172
+ )
173
+
174
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
175
+ """ Forward pass.
176
+
177
+ Args:
178
+ x -> torch.Tensor: The input tensor.
179
+
180
+ Returns:
181
+ output -> torch.Tensor: The output tensor.
182
+ """
183
+ if x.dim() == 5:
184
+ x = rearrange(x, 'b t c h w -> (b t) c h w')
185
+
186
+ hidden_states = self.block_in(x)
187
+ if condition is not None and self.with_condition:
188
+ temb = self.time_proj(self.time_emb(condition * 1000.0))
189
+ else:
190
+ temb = None
191
+
192
+ for i, block in enumerate(self.blocks):
193
+ if i % 2 == 0:
194
+ hidden_states = block(hidden_states, temb) # conv_block
195
+ else:
196
+ hidden_states = block(hidden_states) # down_block
197
+
198
+ hidden_states = self.pool(hidden_states)
199
+ return self.to_logits(hidden_states)
200
+
201
+ # 3D discriminator
202
+
203
+ class Conv3dSame(nn.Conv3d):
204
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
205
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
206
+
207
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
208
+ it, ih, iw = x.size()[-3:] # frame, height, width
209
+
210
+ pad_t = self.calc_same_pad(i=it, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
211
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
212
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[2], s=self.stride[2], d=self.dilation[2])
213
+
214
+ if pad_t > 0 or pad_h > 0 or pad_w > 0:
215
+ x = F.pad(
216
+ x,
217
+ [pad_w // 2, pad_w - pad_w // 2,
218
+ pad_h // 2, pad_h - pad_h // 2,
219
+ pad_t // 2, pad_t - pad_t // 2],
220
+ )
221
+ return super().forward(x)
222
+
223
+ class ModulatedConv3dSame(Conv3dSame):
224
+ def __init__(self, in_channels, out_channels, kernel_size, cond_channels=None):
225
+ super().__init__(in_channels, out_channels, kernel_size)
226
+
227
+ # FiLM modulation
228
+ if cond_channels is not None:
229
+ self.film_proj = torch.nn.Linear(cond_channels, 2 * out_channels)
230
+
231
+ # Initialize FiLM params (scale to 0, bias to 0)
232
+ torch.nn.init.zeros_(self.film_proj.weight)
233
+ torch.nn.init.zeros_(self.film_proj.bias)
234
+
235
+ def forward(self, x, temb=None):
236
+ x = super().forward(x) # (B, C, T, H, W)
237
+ if temb is not None:
238
+ scale, bias = self.film_proj(temb)[:, :, None, None, None].chunk(2, dim=1)
239
+ x = x * (scale + 1) + bias
240
+ return x
241
+
242
+ class BlurBlock3D(nn.Module):
243
+ def __init__(self, kernel=(1, 3, 3, 1), stride=(1, 2, 2)):
244
+ """
245
+ 3D BlurPool block.
246
+ Applies blur to spatial dimensions only by default.
247
+ """
248
+ super().__init__()
249
+ self.stride = stride
250
+
251
+ kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False)
252
+ kernel = kernel[None, :] * kernel[:, None]
253
+ kernel /= kernel.sum()
254
+
255
+ kernel = kernel.unsqueeze(0).unsqueeze(0).unsqueeze(0) # shape: (1, 1, 1, H, W)
256
+ self.register_buffer("kernel", kernel)
257
+
258
+ def calc_same_pad(self, i: int, k: int, s: int) -> int:
259
+ return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0)
260
+
261
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
262
+ _, c, t, h, w = x.shape
263
+ kd, kh, kw = self.kernel.shape[-3:]
264
+ sd, sh, sw = self.stride
265
+
266
+ # Only apply padding to H and W
267
+ pad_h = self.calc_same_pad(h, kh, sh)
268
+ pad_w = self.calc_same_pad(w, kw, sw)
269
+ pad_d = 0 if sd == 1 else self.calc_same_pad(t, kd, sd)
270
+
271
+ if pad_h > 0 or pad_w > 0 or pad_d > 0:
272
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2,
273
+ pad_h // 2, pad_h - pad_h // 2,
274
+ pad_d // 2, pad_d - pad_d // 2])
275
+
276
+ weight = self.kernel.expand(c, 1, -1, -1, -1)
277
+
278
+ return F.conv3d(x, weight, stride=self.stride, groups=c)
279
+
280
+ class NLayer3DDiscriminator(torch.nn.Module):
281
+ def __init__(
282
+ self,
283
+ num_channels: int = 3,
284
+ hidden_channels: int = 128,
285
+ num_stages: int = 3,
286
+ blur_resample: bool = True,
287
+ blur_kernel_size: int = 4,
288
+ with_condition: bool = False,
289
+ ):
290
+ """ Initializes the NLayer3DDiscriminator.
291
+
292
+ Args:
293
+ num_channels -> int: The number of input channels.
294
+ hidden_channels -> int: The number of hidden channels.
295
+ num_stages -> int: The number of stages.
296
+ blur_resample -> bool: Whether to use blur resampling.
297
+ blur_kernel_size -> int: The blur kernel size.
298
+ """
299
+ super().__init__()
300
+ assert num_stages > 0, "Discriminator cannot have 0 stages"
301
+ assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3, 5] when blur resampling"
302
+
303
+ in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages)))
304
+ init_kernel_size = 5
305
+ activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1)
306
+
307
+ self.with_condition = with_condition
308
+ if with_condition:
309
+ cond_channels = 768
310
+ self.time_emb = SinusoidalTimeEmbedding(128)
311
+ self.time_proj = torch.nn.Sequential(
312
+ torch.nn.Linear(128, cond_channels),
313
+ torch.nn.SiLU(),
314
+ torch.nn.Linear(cond_channels, cond_channels),
315
+ )
316
+ else:
317
+ cond_channels = None
318
+
319
+ self.block_in = torch.nn.Sequential(
320
+ Conv3dSame(
321
+ num_channels,
322
+ hidden_channels,
323
+ kernel_size=init_kernel_size
324
+ ),
325
+ activation(),
326
+ )
327
+
328
+ BLUR_KERNEL_MAP = {
329
+ 3: (1,2,1),
330
+ 4: (1,3,3,1),
331
+ 5: (1,4,6,4,1),
332
+ }
333
+ num_downsample_temp_stage = int(num_stages * 1/3)
334
+ downsample_temp = [False] * num_downsample_temp_stage + [True] * (num_stages - num_downsample_temp_stage)
335
+
336
+ discriminator_blocks = []
337
+ for i_level in range(num_stages):
338
+ in_channels = hidden_channels * in_channel_mult[i_level]
339
+ out_channels = hidden_channels * in_channel_mult[i_level + 1]
340
+ conv_block = ModulatedConv3dSame(
341
+ in_channels,
342
+ out_channels,
343
+ kernel_size=3,
344
+ cond_channels=cond_channels
345
+ )
346
+ discriminator_blocks.append(conv_block)
347
+ down_block = torch.nn.Sequential(
348
+ torch.nn.AvgPool3d(kernel_size=2, stride=(2, 2, 2) if downsample_temp[i_level] else (1, 2, 2)) if not blur_resample else BlurBlock3D(BLUR_KERNEL_MAP[blur_kernel_size], stride=(2, 2, 2) if downsample_temp[i_level] else (1, 2, 2)),
349
+ torch.nn.GroupNorm(32, out_channels),
350
+ activation(),
351
+ )
352
+ discriminator_blocks.append(down_block)
353
+
354
+ self.blocks = torch.nn.ModuleList(discriminator_blocks)
355
+ self.pool = torch.nn.AdaptiveMaxPool3d((2, 16, 16))
356
+ self.to_logits = torch.nn.Sequential(
357
+ Conv3dSame(out_channels, out_channels, 1),
358
+ activation(),
359
+ Conv3dSame(out_channels, 1, kernel_size=5)
360
+ )
361
+
362
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
363
+ """ Forward pass.
364
+
365
+ Args:
366
+ x -> torch.Tensor: The input tensor of shape [b t c h w].
367
+
368
+ Returns:
369
+ output -> torch.Tensor: The output tensor.
370
+ """
371
+
372
+ x = rearrange(x, 'b t c h w -> b c t h w')
373
+
374
+ hidden_states = self.block_in(x)
375
+ if condition is not None and self.with_condition:
376
+ temb = self.time_proj(self.time_emb(condition * 1000.0))
377
+ else:
378
+ temb = None
379
+
380
+ for i, block in enumerate(self.blocks):
381
+ if i % 2 == 0:
382
+ hidden_states = block(hidden_states, temb) # conv_block
383
+ else:
384
+ hidden_states = block(hidden_states) # down_block
385
+
386
+ hidden_states = self.pool(hidden_states)
387
+ return self.to_logits(hidden_states)
388
+
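A quick shape check for the two discriminators above (illustrative only). Both produce patch-level logits rather than a single scalar per sample:

import torch
from misc.discriminator import NLayerDiscriminator, NLayer3DDiscriminator

disc = NLayerDiscriminator()
images = torch.randn(2, 3, 256, 256)
print(disc(images).shape)                # torch.Size([2, 1, 16, 16])

disc3d = NLayer3DDiscriminator()
video = torch.randn(1, 8, 3, 128, 128)   # (batch, frames, channels, height, width)
print(disc3d(video).shape)               # torch.Size([1, 1, 2, 16, 16])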
misc/lpips.py ADDED
@@ -0,0 +1,142 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """This file contains code for LPIPS.
6
+ Reference:
7
+ https://github.com/richzhang/PerceptualSimilarity/
8
+ https://github.com/CompVis/taming-transformers/blob/master/taming/modules/losses/lpips.py
9
+ https://github.com/CompVis/taming-transformers/blob/master/taming/util.py
10
+ """
11
+
12
+ import os
13
+ import hashlib
14
+ import requests
15
+ from collections import namedtuple
16
+ from tqdm import tqdm
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from torchvision import models
22
+
23
+ _LPIPS_MEAN = [-0.030, -0.088, -0.188]
24
+ _LPIPS_STD = [0.458, 0.448, 0.450]
25
+
26
+
27
+ class LPIPS(nn.Module):
28
+ # Learned perceptual metric.
29
+ def __init__(self, dist, use_dropout=True):
30
+ super().__init__()
31
+ self.dist = dist
32
+ self.scaling_layer = ScalingLayer()
33
+ self.chns = [64, 128, 256, 512, 512] # vgg16 features
34
+ self.net = vgg16(pretrained=True, requires_grad=False)
35
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
36
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
37
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
38
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
39
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
40
+ self.load_pretrained()
41
+ for param in self.parameters():
42
+ param.requires_grad = False
43
+
44
+ def load_pretrained(self):
45
+ VGG_PATH = os.path.join("/root/.cache", "vgg.pth")
46
+ self.load_state_dict(torch.load(VGG_PATH, map_location=torch.device("cpu")), strict=False)
47
+
48
+ def forward(self, input, target):
49
+ # Notably, the LPIPS w/ pre-trained weights expect the input in the range of [-1, 1].
50
+ # However, our codebase assumes all inputs are in range of [0, 1], and thus a scaling is needed.
51
+ input = input * 2. - 1.
52
+ target = target * 2. - 1.
53
+ in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
54
+ outs0, outs1 = self.net(in0_input), self.net(in1_input)
55
+ feats0, feats1, diffs = {}, {}, {}
56
+ lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
57
+ for kk in range(len(self.chns)):
58
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
59
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
60
+
61
+ res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
62
+ val = res[0]
63
+ for l in range(1, len(self.chns)):
64
+ val += res[l]
65
+ return val
66
+
67
+
68
+ class ScalingLayer(nn.Module):
69
+ def __init__(self):
70
+ super(ScalingLayer, self).__init__()
71
+ self.register_buffer("shift", torch.Tensor(_LPIPS_MEAN)[None, :, None, None])
72
+ self.register_buffer("scale", torch.Tensor(_LPIPS_STD)[None, :, None, None])
73
+
74
+ def forward(self, inp):
75
+ return (inp - self.shift) / self.scale
76
+
77
+
78
+ class NetLinLayer(nn.Module):
79
+ """A single linear layer which does a 1x1 conv."""
80
+
81
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
82
+ super(NetLinLayer, self).__init__()
83
+ layers = (
84
+ [
85
+ nn.Dropout(),
86
+ ]
87
+ if (use_dropout)
88
+ else []
89
+ )
90
+ layers += [
91
+ nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False),
92
+ ]
93
+ self.model = nn.Sequential(*layers)
94
+
95
+
96
+ class vgg16(torch.nn.Module):
97
+ def __init__(self, requires_grad=False, pretrained=True):
98
+ super(vgg16, self).__init__()
99
+ vgg_pretrained_features = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features
100
+ self.slice1 = torch.nn.Sequential()
101
+ self.slice2 = torch.nn.Sequential()
102
+ self.slice3 = torch.nn.Sequential()
103
+ self.slice4 = torch.nn.Sequential()
104
+ self.slice5 = torch.nn.Sequential()
105
+ self.N_slices = 5
106
+ for x in range(4):
107
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
108
+ for x in range(4, 9):
109
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
110
+ for x in range(9, 16):
111
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
112
+ for x in range(16, 23):
113
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
114
+ for x in range(23, 30):
115
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
116
+ if not requires_grad:
117
+ for param in self.parameters():
118
+ param.requires_grad = False
119
+
120
+ def forward(self, X):
121
+ h = self.slice1(X)
122
+ h_relu1_2 = h
123
+ h = self.slice2(h)
124
+ h_relu2_2 = h
125
+ h = self.slice3(h)
126
+ h_relu3_3 = h
127
+ h = self.slice4(h)
128
+ h_relu4_3 = h
129
+ h = self.slice5(h)
130
+ h_relu5_3 = h
131
+ vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
132
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
133
+ return out
134
+
135
+
136
+ def normalize_tensor(x, eps=1e-10):
137
+ norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True))
138
+ return x / (norm_factor + eps)
139
+
140
+
141
+ def spatial_average(x, keepdim=True):
142
+ return x.mean([2, 3], keepdim=keepdim)
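Illustrative usage of the LPIPS module above. Note that it expects inputs in [0, 1] and rescales them internally to the [-1, 1] range of the pretrained weights, and that `load_pretrained` assumes the linear-layer weights are cached at /root/.cache/vgg.pth:

import torch
from misc.lpips import LPIPS

lpips = LPIPS(dist=None).eval()   # dist is only stored, not used in forward
a = torch.rand(4, 3, 224, 224)    # inputs already in [0, 1]
b = torch.rand(4, 3, 224, 224)
print(lpips(a, b).shape)          # torch.Size([4, 1, 1, 1]) per-image distances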
misc/pe.py ADDED
@@ -0,0 +1,151 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ from math import pi, sqrt
6
+ import torch
7
+ from torch import nn
8
+
9
+ from einops import rearrange, repeat
10
+
11
+
12
+ def broadcat(tensors, dim = -1):
13
+ num_tensors = len(tensors)
14
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
15
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
16
+ shape_len = list(shape_lens)[0]
17
+ dim = (dim + shape_len) if dim < 0 else dim
18
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
19
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
20
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
21
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
22
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
23
+ expanded_dims.insert(dim, (dim, dims[dim]))
24
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
25
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
26
+ return torch.cat(tensors, dim = dim)
27
+
28
+ def rotate_half(x):
29
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
30
+ x1, x2 = x.unbind(dim = -1)
31
+ x = torch.stack((-x2, x1), dim = -1)
32
+ return rearrange(x, '... d r -> ... (d r)')
33
+
34
+
35
+ def apply_rope(t, freqs):
36
+ return t * freqs.cos() + rotate_half(t) * freqs.sin()
37
+
38
+
39
+ def get_positions(h=0, w=0, txt_size=0, pt_seq_len=None, duplicate=0, mode='3d'):
40
+ assert mode in ['1d', '2d', '3d'], "mode must be one of ['1d', '2d', '3d']"
41
+ assert h * w + txt_size > 0, "at least one of img_size or txt_size must be greater than 0"
42
+ mean_len = sqrt(h * w)
43
+ pt_seq_len = pt_seq_len or mean_len
44
+ if mode == '1d':
45
+ pos_txt = torch.arange(txt_size)
46
+ pos_img = torch.arange(h * w) # / (h * w) * (pt_seq_len ** 2)
47
+ pos = torch.cat([pos_txt, pos_img + txt_size], dim=0).unsqueeze(-1)
48
+ else:
49
+ assert h * w > 0, "2D/3D RoPE requires img_size > 0"
50
+
51
+ px = torch.arange(h) / mean_len * pt_seq_len
52
+ py = torch.arange(w) / mean_len * pt_seq_len
53
+ px, py = [pi.reshape(-1) for pi in torch.meshgrid(px, py, indexing='ij')]
54
+ if mode == '2d':
55
+ assert txt_size == 0, "2D RoPE does not support text conditioning"
56
+ pos = [px, py]
57
+
58
+ else: # mode == '3d'
59
+ if duplicate == 0:
60
+ pos = [px, py, torch.zeros_like(px)]
61
+ else: # it has sequence length, this is for VideoData
62
+ pos = [torch.cat([px for _ in range(duplicate)]),
63
+ torch.cat([py for _ in range(duplicate)]),
64
+ torch.arange(duplicate).repeat_interleave(h * w)]
65
+
66
+ if txt_size > 0: # text is used as conditioned
67
+ pt = torch.arange(txt_size) / txt_size * pt_seq_len
68
+ pos = [ torch.cat([torch.zeros_like(pt), pos[0]]),
69
+ torch.cat([torch.zeros_like(pt), pos[1]]),
70
+ torch.cat([pt, pos[2]])]
71
+ pos = torch.stack(pos, dim=-1)
72
+ return pos
73
+
74
+
75
+ class VisionRotaryEmbeddingFast(nn.Module):
76
+ def __init__(
77
+ self,
78
+ dim, # half-dim
79
+ pt_seq_len=16,
80
+ ft_seq_len=None,
81
+ latent_len=0,
82
+ custom_freqs = None,
83
+ freqs_for = 'lang',
84
+ theta = 10000,
85
+ max_freq = 10,
86
+ num_freqs = 1,
87
+ dim_split=None,
88
+ no_buffer=False,
89
+ is_1d=False,
90
+ ):
91
+ super().__init__()
92
+
93
+ # length is normalized to pt_seq_len
94
+ if is_1d: # standard 1D-RoPE
95
+ assert freqs_for == 'lang', "RoPE for language settings"
96
+ dim_split, dim = [dim], 2 * dim
97
+ self.freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
98
+
99
+ else:
100
+ if ft_seq_len is None:
101
+ ft_seq_len = pt_seq_len
102
+ if latent_len > 0:
103
+ if dim_split is None: dim_split = [dim - 8, 8]
104
+ dim, latent_dim = dim_split
105
+ else:
106
+ dim_split = [dim]
107
+ if custom_freqs:
108
+ self.freqs = custom_freqs
109
+ elif freqs_for == 'lang':
110
+ self.freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
111
+ elif freqs_for == 'pixel':
112
+ self.freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
113
+ elif freqs_for == 'constant':
114
+ self.freqs = torch.ones(num_freqs).float()
115
+ else:
116
+ raise ValueError(f'unknown modality {freqs_for}')
117
+ if latent_len > 0:
118
+ self.freqs2 = 1. / (theta ** (torch.arange(0, latent_dim).float() / latent_dim))
119
+
120
+ self.is_1d = is_1d
121
+ self.pt_seq_len = pt_seq_len
122
+ self.ft_seq_len = ft_seq_len
123
+ self.latent_len = latent_len
124
+
125
+ # NOTE: deprecated (do not touch, will affect old checkpoints) #
126
+ if not no_buffer and pt_seq_len > 0:
127
+ _deprecated = torch.zeros(pt_seq_len ** 2, sum(dim_split) * 2)
128
+ if latent_len > 0:
129
+ _deprecated = torch.cat([torch.zeros(latent_len, sum(dim_split) * 2), _deprecated], dim=0)
130
+ self.register_buffer("freqs_cos", _deprecated)
131
+ self.register_buffer("freqs_sin", _deprecated)
132
+ # ------------------------------------------------------------ #
133
+
134
+ def forward(self, pos):
135
+ if not isinstance(pos, torch.Tensor):
136
+ pos = torch.tensor(pos).to(self.freqs_cos.device)
137
+
138
+ if not self.is_1d: # this is 2D or 3D rope
139
+ assert pos.shape[-1] > 1, "2D/3D RoPE requires multi-dimensional positions"
140
+ freqs_all = [
141
+ torch.einsum('..., f -> ... f', pos[..., 0], self.freqs.to(pos.device)),
142
+ torch.einsum('..., f -> ... f', pos[..., 1], self.freqs.to(pos.device)),
143
+ ]
144
+ if pos.shape[-1] == 3: # additional latent dimension (maybe text)
145
+ freqs_all.append(torch.einsum('..., f -> ... f', pos[..., 2], self.freqs2.to(pos.device)))
146
+ freqs_all = torch.cat(freqs_all, -1)
147
+ else:
148
+ freqs_all = torch.einsum('..., f -> ... f', pos[..., 0], self.freqs.to(pos.device))
149
+ freqs_all = repeat(freqs_all, '... n -> ... (n r)', r = 2)
150
+ return freqs_all
151
+
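A minimal sketch of the assumed calling convention for the rotary embedding above, using the simpler 1D mode (the model code that actually consumes these embeddings is not part of this excerpt):

import torch
from misc.pe import VisionRotaryEmbeddingFast, get_positions, apply_rope

head_dim = 64
rope = VisionRotaryEmbeddingFast(dim=head_dim // 2, is_1d=True, pt_seq_len=16)

# 32 text tokens followed by an 8x8 grid of image tokens, flattened to 1D positions.
pos = get_positions(h=8, w=8, txt_size=32, mode='1d').float()
freqs = rope(pos)                     # (96, 64) rotation angles

q = torch.randn(2, 12, 96, head_dim)  # (batch, heads, tokens, head_dim)
print(apply_rope(q, freqs).shape)     # torch.Size([2, 12, 96, 64])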
misc/wan_vae2.py ADDED
@@ -0,0 +1,1000 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+
6
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
7
+
8
+ import logging
9
+ import torch
10
+ import torch.cuda.amp as amp
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+
15
+ __all__ = [
16
+ "Wan2_2_VAE",
17
+ ]
18
+
19
+ CACHE_T = 2
20
+
21
+
22
+ class CausalConv3d(nn.Conv3d):
23
+ """
24
+ Causal 3D convolution.
25
+ """
26
+
27
+ def __init__(self, *args, **kwargs):
28
+ super().__init__(*args, **kwargs)
29
+ self._padding = (
30
+ self.padding[2],
31
+ self.padding[2],
32
+ self.padding[1],
33
+ self.padding[1],
34
+ 2 * self.padding[0],
35
+ 0,
36
+ )
37
+ self.padding = (0, 0, 0)
38
+
39
+ def forward(self, x, cache_x=None):
40
+ padding = list(self._padding)
41
+ if cache_x is not None and self._padding[4] > 0:
42
+ cache_x = cache_x.to(x.device)
43
+ x = torch.cat([cache_x, x], dim=2)
44
+ padding[4] -= cache_x.shape[2]
45
+ x = F.pad(x, padding)
46
+
47
+ return super().forward(x)
48
+
49
+
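The reordered padding tuple above front-loads all temporal padding, which is what makes this convolution causal in time. A small self-contained sketch (kernel and tensor sizes are illustrative assumptions): perturbing a future frame never changes earlier outputs.

import torch
import torch.nn.functional as F

conv = torch.nn.Conv3d(3, 3, kernel_size=3)            # no built-in padding
x = torch.randn(1, 3, 8, 16, 16)                       # (B, C, T, H, W)
# F.pad order is (W_left, W_right, H_left, H_right, T_left, T_right)
pad = (1, 1, 1, 1, 2, 0)                               # 2 * padding[0] frames on the left, none on the right
y = conv(F.pad(x, pad))

x2 = x.clone()
x2[:, :, 5] += 1.0                                     # change a "future" frame
y2 = conv(F.pad(x2, pad))
assert torch.allclose(y[:, :, :5], y2[:, :, :5], atol=1e-6)   # frames 0..4 are unaffected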
50
+ class RMS_norm(nn.Module):
51
+
52
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
53
+ super().__init__()
54
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
55
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
56
+
57
+ self.channel_first = channel_first
58
+ self.scale = dim**0.5
59
+ self.gamma = nn.Parameter(torch.ones(shape))
60
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
61
+
62
+ def forward(self, x):
63
+ return (F.normalize(x, dim=(1 if self.channel_first else -1)) *
64
+ self.scale * self.gamma + self.bias)
65
+
66
+
67
+ class Upsample(nn.Upsample):
68
+
69
+ def forward(self, x):
70
+ """
71
+ Fix bfloat16 support for nearest neighbor interpolation.
72
+ """
73
+ return super().forward(x.float()).type_as(x)
74
+
75
+
76
+ class Resample(nn.Module):
77
+
78
+ def __init__(self, dim, mode):
79
+ assert mode in (
80
+ "none",
81
+ "upsample2d",
82
+ "upsample3d",
83
+ "downsample2d",
84
+ "downsample3d",
85
+ )
86
+ super().__init__()
87
+ self.dim = dim
88
+ self.mode = mode
89
+
90
+ # layers
91
+ if mode == "upsample2d":
92
+ self.resample = nn.Sequential(
93
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
94
+ nn.Conv2d(dim, dim, 3, padding=1),
95
+ )
96
+ elif mode == "upsample3d":
97
+ self.resample = nn.Sequential(
98
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
99
+ nn.Conv2d(dim, dim, 3, padding=1),
100
+ # nn.Conv2d(dim, dim//2, 3, padding=1)
101
+ )
102
+ self.time_conv = CausalConv3d(
103
+ dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
104
+ elif mode == "downsample2d":
105
+ self.resample = nn.Sequential(
106
+ nn.ZeroPad2d((0, 1, 0, 1)),
107
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
108
+ elif mode == "downsample3d":
109
+ self.resample = nn.Sequential(
110
+ nn.ZeroPad2d((0, 1, 0, 1)),
111
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
112
+ self.time_conv = CausalConv3d(
113
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
114
+ else:
115
+ self.resample = nn.Identity()
116
+
117
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
118
+ b, c, t, h, w = x.size()
119
+ if self.mode == "upsample3d":
120
+ if feat_cache is not None:
121
+ idx = feat_idx[0]
122
+ if feat_cache[idx] is None:
123
+ feat_cache[idx] = "Rep"
124
+ feat_idx[0] += 1
125
+ else:
126
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
127
+ if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
128
+ feat_cache[idx] != "Rep"):
129
+ # if the chunk is short, prepend the last frame cached from the previous chunk
130
+ cache_x = torch.cat(
131
+ [
132
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
133
+ cache_x.device),
134
+ cache_x,
135
+ ],
136
+ dim=2,
137
+ )
138
+ if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
139
+ feat_cache[idx] == "Rep"):
140
+ cache_x = torch.cat(
141
+ [
142
+ torch.zeros_like(cache_x).to(cache_x.device),
143
+ cache_x
144
+ ],
145
+ dim=2,
146
+ )
147
+ if feat_cache[idx] == "Rep":
148
+ x = self.time_conv(x)
149
+ else:
150
+ x = self.time_conv(x, feat_cache[idx])
151
+ feat_cache[idx] = cache_x
152
+ feat_idx[0] += 1
153
+ x = x.reshape(b, 2, c, t, h, w)
154
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
155
+ 3)
156
+ x = x.reshape(b, c, t * 2, h, w)
157
+ t = x.shape[2]
158
+ x = rearrange(x, "b c t h w -> (b t) c h w")
159
+ x = self.resample(x)
160
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
161
+
162
+ if self.mode == "downsample3d":
163
+ if feat_cache is not None:
164
+ idx = feat_idx[0]
165
+ if feat_cache[idx] is None:
166
+ feat_cache[idx] = x.clone()
167
+ feat_idx[0] += 1
168
+ else:
169
+ cache_x = x[:, :, -1:, :, :].clone()
170
+ x = self.time_conv(
171
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
172
+ feat_cache[idx] = cache_x
173
+ feat_idx[0] += 1
174
+ return x
175
+
176
+ def init_weight(self, conv):
177
+ conv_weight = conv.weight.detach().clone()
178
+ nn.init.zeros_(conv_weight)
179
+ c1, c2, t, h, w = conv_weight.size()
180
+ one_matrix = torch.eye(c1, c2)
181
+ init_matrix = one_matrix
182
+ nn.init.zeros_(conv_weight)
183
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
184
+ conv.weight = nn.Parameter(conv_weight)
185
+ nn.init.zeros_(conv.bias.data)
186
+
187
+ def init_weight2(self, conv):
188
+ conv_weight = conv.weight.data.detach().clone()
189
+ nn.init.zeros_(conv_weight)
190
+ c1, c2, t, h, w = conv_weight.size()
191
+ init_matrix = torch.eye(c1 // 2, c2)
192
+ conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
193
+ conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
194
+ conv.weight = nn.Parameter(conv_weight)
195
+ nn.init.zeros_(conv.bias.data)
196
+
197
+
198
+ class ResidualBlock(nn.Module):
199
+
200
+ def __init__(self, in_dim, out_dim, dropout=0.0):
201
+ super().__init__()
202
+ self.in_dim = in_dim
203
+ self.out_dim = out_dim
204
+
205
+ # layers
206
+ self.residual = nn.Sequential(
207
+ RMS_norm(in_dim, images=False),
208
+ nn.SiLU(),
209
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
210
+ RMS_norm(out_dim, images=False),
211
+ nn.SiLU(),
212
+ nn.Dropout(dropout),
213
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
214
+ )
215
+ self.shortcut = (
216
+ CausalConv3d(in_dim, out_dim, 1)
217
+ if in_dim != out_dim else nn.Identity())
218
+
219
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
220
+ h = self.shortcut(x)
221
+ for layer in self.residual:
222
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
223
+ idx = feat_idx[0]
224
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
225
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
226
+ # if the chunk is short, prepend the last frame cached from the previous chunk
227
+ cache_x = torch.cat(
228
+ [
229
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
230
+ cache_x.device),
231
+ cache_x,
232
+ ],
233
+ dim=2,
234
+ )
235
+ x = layer(x, feat_cache[idx])
236
+ feat_cache[idx] = cache_x
237
+ feat_idx[0] += 1
238
+ else:
239
+ x = layer(x)
240
+ return x + h
241
+
242
+
243
+ class AttentionBlock(nn.Module):
244
+ """
245
+ Single-head self-attention over the spatial positions of each frame.
246
+ """
247
+
248
+ def __init__(self, dim):
249
+ super().__init__()
250
+ self.dim = dim
251
+
252
+ # layers
253
+ self.norm = RMS_norm(dim)
254
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
255
+ self.proj = nn.Conv2d(dim, dim, 1)
256
+
257
+ # zero out the last layer params
258
+ nn.init.zeros_(self.proj.weight)
259
+
260
+ def forward(self, x):
261
+ identity = x
262
+ b, c, t, h, w = x.size()
263
+ x = rearrange(x, "b c t h w -> (b t) c h w")
264
+ x = self.norm(x)
265
+ # compute query, key, value
266
+ q, k, v = (
267
+ self.to_qkv(x).reshape(b * t, 1, c * 3,
268
+ -1).permute(0, 1, 3,
269
+ 2).contiguous().chunk(3, dim=-1))
270
+
271
+ # apply attention
272
+ x = F.scaled_dot_product_attention(
273
+ q,
274
+ k,
275
+ v,
276
+ )
277
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
278
+
279
+ # output
280
+ x = self.proj(x)
281
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
282
+ return x + identity
283
+
284
+
285
+ def patchify(x, patch_size):
286
+ if patch_size == 1:
287
+ return x
288
+ if x.dim() == 4:
289
+ x = rearrange(
290
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
291
+ elif x.dim() == 5:
292
+ x = rearrange(
293
+ x,
294
+ "b c f (h q) (w r) -> b (c r q) f h w",
295
+ q=patch_size,
296
+ r=patch_size,
297
+ )
298
+ else:
299
+ raise ValueError(f"Invalid input shape: {x.shape}")
300
+
301
+ return x
302
+
303
+
304
+ def unpatchify(x, patch_size):
305
+ if patch_size == 1:
306
+ return x
307
+
308
+ if x.dim() == 4:
309
+ x = rearrange(
310
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
311
+ elif x.dim() == 5:
312
+ x = rearrange(
313
+ x,
314
+ "b (c r q) f h w -> b c f (h q) (w r)",
315
+ q=patch_size,
316
+ r=patch_size,
317
+ )
318
+ return x
319
+
320
+
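The two helpers above are inverse space-to-channel reshapes. A hedged round-trip sketch for the 5D video case, reproducing the same einops patterns inline:

import torch
from einops import rearrange

p = 2
x = torch.randn(1, 3, 4, 8, 8)                                    # (b, c, f, h, w)
packed = rearrange(x, "b c f (h q) (w r) -> b (c r q) f h w", q=p, r=p)
assert packed.shape == (1, 3 * p * p, 4, 4, 4)
restored = rearrange(packed, "b (c r q) f h w -> b c f (h q) (w r)", q=p, r=p)
assert torch.equal(restored, x)                                   # exact inverse, no information lost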
321
+ class AvgDown3D(nn.Module):
322
+
323
+ def __init__(
324
+ self,
325
+ in_channels,
326
+ out_channels,
327
+ factor_t,
328
+ factor_s=1,
329
+ ):
330
+ super().__init__()
331
+ self.in_channels = in_channels
332
+ self.out_channels = out_channels
333
+ self.factor_t = factor_t
334
+ self.factor_s = factor_s
335
+ self.factor = self.factor_t * self.factor_s * self.factor_s
336
+
337
+ assert in_channels * self.factor % out_channels == 0
338
+ self.group_size = in_channels * self.factor // out_channels
339
+
340
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
341
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
342
+ pad = (0, 0, 0, 0, pad_t, 0)
343
+ x = F.pad(x, pad)
344
+ B, C, T, H, W = x.shape
345
+ x = x.view(
346
+ B,
347
+ C,
348
+ T // self.factor_t,
349
+ self.factor_t,
350
+ H // self.factor_s,
351
+ self.factor_s,
352
+ W // self.factor_s,
353
+ self.factor_s,
354
+ )
355
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
356
+ x = x.view(
357
+ B,
358
+ C * self.factor,
359
+ T // self.factor_t,
360
+ H // self.factor_s,
361
+ W // self.factor_s,
362
+ )
363
+ x = x.view(
364
+ B,
365
+ self.out_channels,
366
+ self.group_size,
367
+ T // self.factor_t,
368
+ H // self.factor_s,
369
+ W // self.factor_s,
370
+ )
371
+ x = x.mean(dim=2)
372
+ return x
373
+
374
+
375
+ class DupUp3D(nn.Module):
376
+
377
+ def __init__(
378
+ self,
379
+ in_channels: int,
380
+ out_channels: int,
381
+ factor_t,
382
+ factor_s=1,
383
+ ):
384
+ super().__init__()
385
+ self.in_channels = in_channels
386
+ self.out_channels = out_channels
387
+
388
+ self.factor_t = factor_t
389
+ self.factor_s = factor_s
390
+ self.factor = self.factor_t * self.factor_s * self.factor_s
391
+
392
+ assert out_channels * self.factor % in_channels == 0
393
+ self.repeats = out_channels * self.factor // in_channels
394
+
395
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
396
+ x = x.repeat_interleave(self.repeats, dim=1)
397
+ x = x.view(
398
+ x.size(0),
399
+ self.out_channels,
400
+ self.factor_t,
401
+ self.factor_s,
402
+ self.factor_s,
403
+ x.size(2),
404
+ x.size(3),
405
+ x.size(4),
406
+ )
407
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
408
+ x = x.view(
409
+ x.size(0),
410
+ self.out_channels,
411
+ x.size(2) * self.factor_t,
412
+ x.size(4) * self.factor_s,
413
+ x.size(6) * self.factor_s,
414
+ )
415
+ if first_chunk:
416
+ x = x[:, :, self.factor_t - 1:, :, :]
417
+ return x
418
+
419
+
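A hedged sketch of the core idea in AvgDown3D for the purely spatial case (factor_t=1, factor_s=2, in_channels == out_channels): the reshape-and-mean reduces to ordinary average pooling. The channel regrouping that handles in_channels != out_channels is omitted here.

import torch
import torch.nn.functional as F

B, C, T, H, W = 1, 4, 3, 8, 8
x = torch.randn(B, C, T, H, W)

# average over each 2x2 spatial block via reshape, as the module does internally
pooled = x.view(B, C, T, H // 2, 2, W // 2, 2).mean(dim=(4, 6))
assert torch.allclose(pooled, F.avg_pool3d(x, kernel_size=(1, 2, 2)), atol=1e-6)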
420
+ class Down_ResidualBlock(nn.Module):
421
+
422
+ def __init__(self,
423
+ in_dim,
424
+ out_dim,
425
+ dropout,
426
+ mult,
427
+ temperal_downsample=False,
428
+ down_flag=False):
429
+ super().__init__()
430
+
431
+ # Shortcut path with downsample
432
+ self.avg_shortcut = AvgDown3D(
433
+ in_dim,
434
+ out_dim,
435
+ factor_t=2 if temperal_downsample else 1,
436
+ factor_s=2 if down_flag else 1,
437
+ )
438
+
439
+ # Main path with residual blocks and downsample
440
+ downsamples = []
441
+ for _ in range(mult):
442
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
443
+ in_dim = out_dim
444
+
445
+ # Add the final downsample block
446
+ if down_flag:
447
+ mode = "downsample3d" if temperal_downsample else "downsample2d"
448
+ downsamples.append(Resample(out_dim, mode=mode))
449
+
450
+ self.downsamples = nn.Sequential(*downsamples)
451
+
452
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
453
+ x_copy = x.clone()
454
+ for module in self.downsamples:
455
+ x = module(x, feat_cache, feat_idx)
456
+
457
+ return x + self.avg_shortcut(x_copy)
458
+
459
+
460
+ class Up_ResidualBlock(nn.Module):
461
+
462
+ def __init__(self,
463
+ in_dim,
464
+ out_dim,
465
+ dropout,
466
+ mult,
467
+ temperal_upsample=False,
468
+ up_flag=False):
469
+ super().__init__()
470
+ # Shortcut path with upsample
471
+ if up_flag:
472
+ self.avg_shortcut = DupUp3D(
473
+ in_dim,
474
+ out_dim,
475
+ factor_t=2 if temperal_upsample else 1,
476
+ factor_s=2 if up_flag else 1,
477
+ )
478
+ else:
479
+ self.avg_shortcut = None
480
+
481
+ # Main path with residual blocks and upsample
482
+ upsamples = []
483
+ for _ in range(mult):
484
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
485
+ in_dim = out_dim
486
+
487
+ # Add the final upsample block
488
+ if up_flag:
489
+ mode = "upsample3d" if temperal_upsample else "upsample2d"
490
+ upsamples.append(Resample(out_dim, mode=mode))
491
+
492
+ self.upsamples = nn.Sequential(*upsamples)
493
+
494
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
495
+ x_main = x.clone()
496
+ for module in self.upsamples:
497
+ x_main = module(x_main, feat_cache, feat_idx)
498
+ if self.avg_shortcut is not None:
499
+ x_shortcut = self.avg_shortcut(x, first_chunk)
500
+ return x_main + x_shortcut
501
+ else:
502
+ return x_main
503
+
504
+
505
+ class Encoder3d(nn.Module):
506
+
507
+ def __init__(
508
+ self,
509
+ dim=128,
510
+ z_dim=4,
511
+ dim_mult=[1, 2, 4, 4],
512
+ num_res_blocks=2,
513
+ attn_scales=[],
514
+ temperal_downsample=[True, True, False],
515
+ dropout=0.0,
516
+ ):
517
+ super().__init__()
518
+ self.dim = dim
519
+ self.z_dim = z_dim
520
+ self.dim_mult = dim_mult
521
+ self.num_res_blocks = num_res_blocks
522
+ self.attn_scales = attn_scales
523
+ self.temperal_downsample = temperal_downsample
524
+
525
+ # dimensions
526
+ dims = [dim * u for u in [1] + dim_mult]
527
+ scale = 1.0
528
+
529
+ # init block
530
+ self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
531
+
532
+ # downsample blocks
533
+ downsamples = []
534
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
535
+ t_down_flag = (
536
+ temperal_downsample[i]
537
+ if i < len(temperal_downsample) else False)
538
+ downsamples.append(
539
+ Down_ResidualBlock(
540
+ in_dim=in_dim,
541
+ out_dim=out_dim,
542
+ dropout=dropout,
543
+ mult=num_res_blocks,
544
+ temperal_downsample=t_down_flag,
545
+ down_flag=i != len(dim_mult) - 1,
546
+ ))
547
+ scale /= 2.0
548
+ self.downsamples = nn.Sequential(*downsamples)
549
+
550
+ # middle blocks
551
+ self.middle = nn.Sequential(
552
+ ResidualBlock(out_dim, out_dim, dropout),
553
+ AttentionBlock(out_dim),
554
+ ResidualBlock(out_dim, out_dim, dropout),
555
+ )
556
+
557
+ # # output blocks
558
+ self.head = nn.Sequential(
559
+ RMS_norm(out_dim, images=False),
560
+ nn.SiLU(),
561
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
562
+ )
563
+
564
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
565
+
566
+ if feat_cache is not None:
567
+ idx = feat_idx[0]
568
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
569
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
570
+ cache_x = torch.cat(
571
+ [
572
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
573
+ cache_x.device),
574
+ cache_x,
575
+ ],
576
+ dim=2,
577
+ )
578
+ x = self.conv1(x, feat_cache[idx])
579
+ feat_cache[idx] = cache_x
580
+ feat_idx[0] += 1
581
+ else:
582
+ x = self.conv1(x)
583
+
584
+ ## downsamples
585
+ for layer in self.downsamples:
586
+ if feat_cache is not None:
587
+ x = layer(x, feat_cache, feat_idx)
588
+ else:
589
+ x = layer(x)
590
+
591
+ ## middle
592
+ for layer in self.middle:
593
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
594
+ x = layer(x, feat_cache, feat_idx)
595
+ else:
596
+ x = layer(x)
597
+
598
+ ## head
599
+ for layer in self.head:
600
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
601
+ idx = feat_idx[0]
602
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
603
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
604
+ cache_x = torch.cat(
605
+ [
606
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
607
+ cache_x.device),
608
+ cache_x,
609
+ ],
610
+ dim=2,
611
+ )
612
+ x = layer(x, feat_cache[idx])
613
+ feat_cache[idx] = cache_x
614
+ feat_idx[0] += 1
615
+ else:
616
+ x = layer(x)
617
+
618
+ return x
619
+
620
+
621
+ class Decoder3d(nn.Module):
622
+
623
+ def __init__(
624
+ self,
625
+ dim=128,
626
+ z_dim=4,
627
+ dim_mult=[1, 2, 4, 4],
628
+ num_res_blocks=2,
629
+ attn_scales=[],
630
+ temperal_upsample=[False, True, True],
631
+ dropout=0.0,
632
+ ):
633
+ super().__init__()
634
+ self.dim = dim
635
+ self.z_dim = z_dim
636
+ self.dim_mult = dim_mult
637
+ self.num_res_blocks = num_res_blocks
638
+ self.attn_scales = attn_scales
639
+ self.temperal_upsample = temperal_upsample
640
+
641
+ # dimensions
642
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
643
+ scale = 1.0 / 2**(len(dim_mult) - 2)
644
+ # init block
645
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
646
+
647
+ # middle blocks
648
+ self.middle = nn.Sequential(
649
+ ResidualBlock(dims[0], dims[0], dropout),
650
+ AttentionBlock(dims[0]),
651
+ ResidualBlock(dims[0], dims[0], dropout),
652
+ )
653
+
654
+ # upsample blocks
655
+ upsamples = []
656
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
657
+ t_up_flag = temperal_upsample[i] if i < len(
658
+ temperal_upsample) else False
659
+ upsamples.append(
660
+ Up_ResidualBlock(
661
+ in_dim=in_dim,
662
+ out_dim=out_dim,
663
+ dropout=dropout,
664
+ mult=num_res_blocks + 1,
665
+ temperal_upsample=t_up_flag,
666
+ up_flag=i != len(dim_mult) - 1,
667
+ ))
668
+ self.upsamples = nn.Sequential(*upsamples)
669
+
670
+ # output blocks
671
+ self.head = nn.Sequential(
672
+ RMS_norm(out_dim, images=False),
673
+ nn.SiLU(),
674
+ CausalConv3d(out_dim, 12, 3, padding=1),
675
+ )
676
+
677
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
678
+ if feat_cache is not None:
679
+ idx = feat_idx[0]
680
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
681
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
682
+ cache_x = torch.cat(
683
+ [
684
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
685
+ cache_x.device),
686
+ cache_x,
687
+ ],
688
+ dim=2,
689
+ )
690
+ x = self.conv1(x, feat_cache[idx])
691
+ feat_cache[idx] = cache_x
692
+ feat_idx[0] += 1
693
+ else:
694
+ x = self.conv1(x)
695
+
696
+ for layer in self.middle:
697
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
698
+ x = layer(x, feat_cache, feat_idx)
699
+ else:
700
+ x = layer(x)
701
+
702
+ ## upsamples
703
+ for layer in self.upsamples:
704
+ if feat_cache is not None:
705
+ x = layer(x, feat_cache, feat_idx, first_chunk)
706
+ else:
707
+ x = layer(x)
708
+
709
+ ## head
710
+ for layer in self.head:
711
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
712
+ idx = feat_idx[0]
713
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
714
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
715
+ cache_x = torch.cat(
716
+ [
717
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
718
+ cache_x.device),
719
+ cache_x,
720
+ ],
721
+ dim=2,
722
+ )
723
+ x = layer(x, feat_cache[idx])
724
+ feat_cache[idx] = cache_x
725
+ feat_idx[0] += 1
726
+ else:
727
+ x = layer(x)
728
+ return x
729
+
730
+
731
+ def count_conv3d(model):
732
+ count = 0
733
+ for m in model.modules():
734
+ if isinstance(m, CausalConv3d):
735
+ count += 1
736
+ return count
737
+
738
+
739
+ class WanVAE_(nn.Module):
740
+
741
+ def __init__(
742
+ self,
743
+ dim=160,
744
+ dec_dim=256,
745
+ z_dim=16,
746
+ dim_mult=[1, 2, 4, 4],
747
+ num_res_blocks=2,
748
+ attn_scales=[],
749
+ temperal_downsample=[True, True, False],
750
+ dropout=0.0,
751
+ device='cuda'
752
+ ):
753
+ super().__init__()
754
+ self.dim = dim
755
+ self.z_dim = z_dim
756
+ self.dim_mult = dim_mult
757
+ self.num_res_blocks = num_res_blocks
758
+ self.attn_scales = attn_scales
759
+ self.temperal_downsample = temperal_downsample
760
+ self.temperal_upsample = temperal_downsample[::-1]
761
+
762
+ # modules
763
+ self.encoder = Encoder3d(
764
+ dim,
765
+ z_dim * 2,
766
+ dim_mult,
767
+ num_res_blocks,
768
+ attn_scales,
769
+ self.temperal_downsample,
770
+ dropout,
771
+ )
772
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
773
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
774
+ self.decoder = Decoder3d(
775
+ dec_dim,
776
+ z_dim,
777
+ dim_mult,
778
+ num_res_blocks,
779
+ attn_scales,
780
+ self.temperal_upsample,
781
+ dropout,
782
+ )
783
+
784
+ self.mean = torch.tensor(
785
+ [
786
+ -0.2289,
787
+ -0.0052,
788
+ -0.1323,
789
+ -0.2339,
790
+ -0.2799,
791
+ 0.0174,
792
+ 0.1838,
793
+ 0.1557,
794
+ -0.1382,
795
+ 0.0542,
796
+ 0.2813,
797
+ 0.0891,
798
+ 0.1570,
799
+ -0.0098,
800
+ 0.0375,
801
+ -0.1825,
802
+ -0.2246,
803
+ -0.1207,
804
+ -0.0698,
805
+ 0.5109,
806
+ 0.2665,
807
+ -0.2108,
808
+ -0.2158,
809
+ 0.2502,
810
+ -0.2055,
811
+ -0.0322,
812
+ 0.1109,
813
+ 0.1567,
814
+ -0.0729,
815
+ 0.0899,
816
+ -0.2799,
817
+ -0.1230,
818
+ -0.0313,
819
+ -0.1649,
820
+ 0.0117,
821
+ 0.0723,
822
+ -0.2839,
823
+ -0.2083,
824
+ -0.0520,
825
+ 0.3748,
826
+ 0.0152,
827
+ 0.1957,
828
+ 0.1433,
829
+ -0.2944,
830
+ 0.3573,
831
+ -0.0548,
832
+ -0.1681,
833
+ -0.0667,
834
+ ],
835
+ device=device,
836
+ )
837
+ self.std = torch.tensor(
838
+ [
839
+ 0.4765,
840
+ 1.0364,
841
+ 0.4514,
842
+ 1.1677,
843
+ 0.5313,
844
+ 0.4990,
845
+ 0.4818,
846
+ 0.5013,
847
+ 0.8158,
848
+ 1.0344,
849
+ 0.5894,
850
+ 1.0901,
851
+ 0.6885,
852
+ 0.6165,
853
+ 0.8454,
854
+ 0.4978,
855
+ 0.5759,
856
+ 0.3523,
857
+ 0.7135,
858
+ 0.6804,
859
+ 0.5833,
860
+ 1.4146,
861
+ 0.8986,
862
+ 0.5659,
863
+ 0.7069,
864
+ 0.5338,
865
+ 0.4889,
866
+ 0.4917,
867
+ 0.4069,
868
+ 0.4999,
869
+ 0.6866,
870
+ 0.4093,
871
+ 0.5709,
872
+ 0.6065,
873
+ 0.6415,
874
+ 0.4944,
875
+ 0.5726,
876
+ 1.2042,
877
+ 0.5458,
878
+ 1.6887,
879
+ 0.3971,
880
+ 1.0600,
881
+ 0.3943,
882
+ 0.5537,
883
+ 0.5444,
884
+ 0.4089,
885
+ 0.7468,
886
+ 0.7744,
887
+ ],
888
+ device=device,
889
+ )
890
+ self.scale = [self.mean, 1.0 / self.std]
891
+
892
+ def forward(self, x, scale=[0, 1]):
893
+ mu, log_var = self.encode(x, scale)  # encode returns (mu, log_var)
894
+ x_recon = self.decode(mu, scale)
895
+ return x_recon, mu
896
+
897
+ def encode(self, x, scale=None):
898
+ self.clear_cache()
899
+ x = patchify(x, patch_size=2)
900
+ t = x.shape[2]
901
+ iter_ = 1 + (t - 1) // 4
902
+ for i in range(iter_):
903
+ self._enc_conv_idx = [0]
904
+ if i == 0:
905
+ out = self.encoder(
906
+ x[:, :, :1, :, :],
907
+ feat_cache=self._enc_feat_map,
908
+ feat_idx=self._enc_conv_idx,
909
+ )
910
+ else:
911
+ out_ = self.encoder(
912
+ x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
913
+ feat_cache=self._enc_feat_map,
914
+ feat_idx=self._enc_conv_idx,
915
+ )
916
+ out = torch.cat([out, out_], 2)
917
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
918
+ if scale is not None:
919
+ if isinstance(scale[0], torch.Tensor):
920
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
921
+ 1, self.z_dim, 1, 1, 1)
922
+ else:
923
+ mu = (mu - scale[0]) * scale[1]
924
+ self.clear_cache()
925
+ return mu, log_var
926
+
927
+ def decode(self, z, scale=None):
928
+ self.clear_cache()
929
+ if scale is not None:
930
+ if isinstance(scale[0], torch.Tensor):
931
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
932
+ 1, self.z_dim, 1, 1, 1)
933
+ else:
934
+ z = z / scale[1] + scale[0]
935
+ iter_ = z.shape[2]
936
+ x = self.conv2(z)
937
+ for i in range(iter_):
938
+ self._conv_idx = [0]
939
+ if i == 0:
940
+ out = self.decoder(
941
+ x[:, :, i:i + 1, :, :],
942
+ feat_cache=self._feat_map,
943
+ feat_idx=self._conv_idx,
944
+ first_chunk=True,
945
+ )
946
+ else:
947
+ out_ = self.decoder(
948
+ x[:, :, i:i + 1, :, :],
949
+ feat_cache=self._feat_map,
950
+ feat_idx=self._conv_idx,
951
+ )
952
+ out = torch.cat([out, out_], 2)
953
+ out = unpatchify(out, patch_size=2)
954
+ self.clear_cache()
955
+ return out
956
+
957
+ def reparameterize(self, mu, log_var):
958
+ std = torch.exp(0.5 * log_var)
959
+ eps = torch.randn_like(std)
960
+ return eps * std + mu
961
+
962
+ def sample(self, imgs, scale, deterministic=False):
963
+ mu, log_var = self.encode(imgs, scale=scale)
964
+ if deterministic:
965
+ return mu
966
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
967
+ return mu + std * torch.randn_like(std)
968
+
969
+ def clear_cache(self):
970
+ self._conv_num = count_conv3d(self.decoder)
971
+ self._conv_idx = [0]
972
+ self._feat_map = [None] * self._conv_num
973
+ # cache encode
974
+ self._enc_conv_num = count_conv3d(self.encoder)
975
+ self._enc_conv_idx = [0]
976
+ self._enc_feat_map = [None] * self._enc_conv_num
977
+
978
+
979
+ def video_vae2(pretrained_path=None, z_dim=48, dim=160, device="cuda", **kwargs):
980
+ # params
981
+ cfg = dict(
982
+ dim=dim,
983
+ z_dim=z_dim,
984
+ dim_mult=[1, 2, 4, 4],
985
+ num_res_blocks=2,
986
+ attn_scales=[],
987
+ temperal_downsample=[False, True, True],
988
+ dropout=0.0,
989
+ device=device
990
+ )
991
+ cfg.update(**kwargs)
992
+
993
+ # init model
994
+ model = WanVAE_(**cfg)
995
+
996
+ # load checkpoint
997
+ logging.info(f"loading {pretrained_path}")
998
+ model.load_state_dict(torch.load(pretrained_path))
999
+
1000
+ return model
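A hedged end-to-end usage sketch for the factory above; the checkpoint path, clip size, and device are illustrative assumptions, not values from this commit.

import torch
from misc.wan_vae2 import video_vae2

# Hypothetical checkpoint location; substitute the real Wan 2.2 VAE weights.
vae = video_vae2(pretrained_path="checkpoints/wan2.2_vae.pth", z_dim=48, dim=160).eval().to("cuda")

clip = torch.randn(1, 3, 9, 256, 256, device="cuda")    # (B, C, T, H, W); T must be 1 + 4k
with torch.no_grad():
    mu, log_var = vae.encode(clip, scale=vae.scale)      # latents at reduced spatial/temporal resolution
    recon = vae.decode(mu, scale=vae.scale)              # decoded back to roughly the input shape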
requirements_hf.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Requirements for Hugging Face Spaces
2
+ # PyTorch with CUDA support - HF Spaces automatically provides CUDA-enabled PyTorch
3
+ # When you select GPU hardware, PyTorch will have full CUDA support
4
+ torch>=2.0.0
5
+ torchvision>=0.15.0
6
+ # Note: CUDA toolkit is pre-installed on HF Spaces GPU instances
7
+ # You can verify with: torch.cuda.is_available() and torch.cuda.get_device_name(0)
8
+
9
+ # Core dependencies
10
+ transformers>=4.30.0
11
+ accelerate>=0.20.0
12
+ torchinfo
13
+ einops
14
+ scipy
15
+ sentencepiece
16
+ wandb[media]
17
+ torchmetrics[image]
18
+ simple_parsing
19
+ opencv-python
20
+ psutil
21
+ pyyaml
22
+ av==12.3.0
23
+
24
+ # Gradio for web interface
25
+ gradio>=4.0.0
26
+
27
+ # Git dependencies
28
+ git+https://github.com/KeKsBoTer/torch-dwt
29
+ git+https://github.com/huggingface/diffusers.git
30
+
31
+ # Note: decord and webdataset are optional and may not be needed for inference
32
+ # If needed, install via: pip install decord webdataset
33
+
sample.py ADDED
@@ -0,0 +1,379 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ #!/usr/bin/env python3
6
+ """
7
+ Scalable Transformer Autoregressive Flow (STARFlow) Sampling Script
8
+
9
+ This script provides functionality for sampling from trained transformer autoregressive flow models.
10
+ Supports both image and video generation with various conditioning options.
11
+
12
+ Usage:
13
+ python sample.py --model_config_path config.yaml --checkpoint_path model.pth --caption "A cat"
14
+ """
15
+
16
+ import argparse
17
+ import copy
18
+ import pathlib
19
+ import time
20
+ from typing import Dict, List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.data
26
+ import torchvision as tv
27
+ import tqdm
28
+ import yaml
29
+ from einops import repeat
30
+ from PIL import Image
31
+
32
+ # Local imports
33
+ import transformer_flow
34
+ import utils
35
+ from dataset import aspect_ratio_to_image_size
36
+ from train import get_tarflow_parser
37
+ from utils import process_denoising, save_samples_unified, load_model_config, encode_text, add_noise
38
+ from transformer_flow import KVCache
39
+ from misc import print
40
+
41
+
42
+ # Default caption templates for testing and demonstrations
43
+ DEFAULT_CAPTIONS = {
44
+ 'template1': "In the image, a corgi dog is wearing a Santa hat and is laying on a fluffy rug. The dog's tongue is sticking out and it appears to be happy. There are two pumpkins and a basket of leaves nearby, indicating that the scene takes place during the fall season. The background features a Christmas tree, further suggesting the holiday atmosphere. The image has a warm and cozy feel to it, with the dog looking adorable in its hat and the pumpkins adding a festive touch.",
45
+ 'template2': "A close-up portrait of a cheerful Corgi dog, showcasing its fluffy, sandy-brown fur and perky ears. The dog has a friendly expression with a slight smile, looking directly into the camera. Set against a soft, natural green background, the image is captured in a high-definition, realistic photography style, emphasizing the texture of the fur and the vibrant colors.",
46
+ 'template3': "A high-resolution, wide-angle selfie photograph of Albert Einstein in a garden setting. Einstein looks directly into the camera with a gentle, knowing smile. His distinctive wild white hair and bushy mustache frame a face marked by thoughtful wrinkles. He wears a classic tweed jacket over a simple shirt. In the background, lush greenery and flowering bushes under soft daylight create a serene, scholarly atmosphere. Ultra-realistic style, 4K detail.",
47
+ 'template4': 'A close-up, high-resolution selfie of a red panda perched on a tree branch, its large dark eyes looking directly into the lens. Rich reddish-orange fur with white facial markings contrasts against the lush green bamboo forest behind. Soft sunlight filters through the leaves, casting a warm, natural glow over the scene. Ultra-realistic detail, digital photograph style, 4K resolution.',
48
+ 'template5': "A realistic selfie of a llama standing in front of a classic Ivy League building on the Princeton University campus. He is smiling gently, wearing his iconic wild hair and mustache, dressed in a wool sweater and collared shirt. The photo has a vintage, slightly sepia tone, with soft natural lighting and leafy trees in the background, capturing an academic and historical vibe.",
49
+ }
50
+
51
+
52
+
53
+
54
+ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Module, Optional[torch.nn.Module], tuple]:
55
+ """Initialize and load the model, VAE, and text encoder."""
56
+ dist = utils.Distributed()
57
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
58
+
59
+ # Set random seed
60
+ utils.set_random_seed(args.seed + dist.rank)
61
+
62
+ # Setup text encoder
63
+ tokenizer, text_encoder = utils.setup_encoder(args, dist, device)
64
+
65
+ # Setup VAE if specified
66
+ vae = None
67
+ if args.vae is not None:
68
+ vae = utils.setup_vae(args, dist, device)
69
+ args.img_size = args.img_size // vae.downsample_factor
70
+ else:
71
+ args.finetuned_vae = 'none'
72
+
73
+ # Setup main transformer model
74
+ model = utils.setup_transformer(
75
+ args, dist,
76
+ txt_dim=text_encoder.config.hidden_size,
77
+ use_checkpoint=1
78
+ ).to(device)
79
+
80
+ # Load checkpoint
81
+ print(f"Loading checkpoint from local path: {args.checkpoint_path}")
82
+ state_dict = torch.load(args.checkpoint_path, map_location='cpu')
83
+ model.load_state_dict(state_dict, strict=False)
84
+ del state_dict; torch.cuda.empty_cache()
85
+
86
+ # Set model to eval mode and disable gradients
87
+ for p in model.parameters():
88
+ p.requires_grad = False
89
+ model.eval()
90
+
91
+ # Parallelize model for multi-GPU sampling
92
+ _, model = utils.parallelize_model(args, model, dist, device)
93
+
94
+ return model, vae, (tokenizer, text_encoder, dist, device)
95
+
96
+
97
+ def prepare_captions(args: argparse.Namespace, dist) -> Tuple[List[str], List[int], int, str]:
98
+ """Prepare captions for sampling from file or template."""
99
+ if args.caption.endswith('.txt'):
100
+ with open(args.caption, 'r') as f:
101
+ lines = [line.strip() for line in f.readlines()]
102
+
103
+ num_samples = len(lines)
104
+ fixed_y = lines[dist.rank:][::dist.world_size]
105
+ fixed_idxs = list(range(len(lines)))[dist.rank:][::dist.world_size]
106
+ caption_name = args.caption.split('/')[-1][:-4]
107
+ else:
108
+ caption_text = DEFAULT_CAPTIONS.get(args.caption, args.caption)
109
+ fixed_y = [caption_text] * args.sample_batch_size
110
+ fixed_idxs = []
111
+ num_samples = args.sample_batch_size * dist.world_size
112
+ caption_name = args.caption
113
+
114
+ return fixed_y, fixed_idxs, num_samples, caption_name
115
+
116
+
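A small illustration (toy values, not from the repo) of the rank-sharded slicing used above: each rank takes every world_size-th caption starting at its own offset, so together the shards cover the caption file exactly once.

lines = [f"caption {i}" for i in range(10)]
world_size = 4
shards = [lines[rank:][::world_size] for rank in range(world_size)]
assert sorted(sum(shards, [])) == sorted(lines)           # every caption assigned exactly once
assert shards[1] == ["caption 1", "caption 5", "caption 9"]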
117
+ def get_noise_shape(args: argparse.Namespace, vae) -> callable:
118
+ """Generate noise tensor with appropriate shape for sampling."""
119
+ def _get_noise_func(b: int, x_shape: tuple) -> torch.Tensor:
120
+ rand_shape = [args.channel_size, x_shape[0], x_shape[1]]
121
+ if len(x_shape) == 3:
122
+ rand_shape = [x_shape[2]] + rand_shape
123
+
124
+ if vae is not None:
125
+ if args.vid_size is not None:
126
+ rand_shape[0] = (rand_shape[0] - 1) // vae.temporal_downsample_factor + 1
127
+ rand_shape[-2] //= vae.downsample_factor
128
+ rand_shape[-1] //= vae.downsample_factor
129
+
130
+ return torch.randn(b, *rand_shape)
131
+
132
+ return _get_noise_func
133
+
134
+
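The shape arithmetic above, replayed with illustrative numbers (the downsample factors and channel count are assumptions for the example, not values fixed by this script):

channel_size = 48                       # e.g. latent channels from the model config (assumption)
frames, h, w = 17, 512, 512
spatial_ds, temporal_ds = 8, 4          # assumed VAE downsample factors

rand_shape = [frames, channel_size, h, w]                 # video case: [T, C, H, W]
rand_shape[0] = (rand_shape[0] - 1) // temporal_ds + 1    # 17 frames -> 5 latent frames
rand_shape[-2] //= spatial_ds                             # 512 -> 64
rand_shape[-1] //= spatial_ds                             # 512 -> 64
assert rand_shape == [5, 48, 64, 64]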
135
+ def prepare_input_image(args: argparse.Namespace, x_shape: tuple, vae, device: torch.device, noise_std: float) -> Optional[torch.Tensor]:
136
+ """Load and preprocess input image for conditional generation."""
137
+ input_image = Image.open(args.input_image).convert('RGB')
138
+
139
+ # Resize and crop to target shape
140
+ scale = max(x_shape[0] / input_image.height, x_shape[1] / input_image.width)
141
+ transform = tv.transforms.Compose([
142
+ tv.transforms.Resize((int(input_image.height * scale), int(input_image.width * scale))),
143
+ tv.transforms.CenterCrop(x_shape[:2]),
144
+ tv.transforms.ToTensor(),
145
+ tv.transforms.Normalize([0.5]*3, [0.5]*3)
146
+ ])
147
+
148
+ input_image = transform(input_image).unsqueeze(0).to(device)
149
+
150
+ # Encode with VAE if available
151
+ with torch.no_grad():
152
+ if vae is not None:
153
+ input_image = vae.encode(input_image)
154
+
155
+ # Add noise
156
+ input_image = add_noise(input_image, noise_std)[0]
157
+ return input_image
158
+
159
+
160
+ def build_sampling_kwargs(args: argparse.Namespace, caption_name: str) -> dict:
161
+ """Build sampling keyword arguments based on configuration."""
162
+ sampling_kwargs = {
163
+ 'guidance': args.cfg,
164
+ 'guide_top': args.guide_top,
165
+ 'verbose': not caption_name.endswith('/'),
166
+ 'return_sequence': args.return_sequence,
167
+ 'jacobi': args.jacobi,
168
+ 'context_length': args.context_length
169
+ }
170
+
171
+ if args.jacobi:
172
+ sampling_kwargs.update({
173
+ 'jacobi_th': args.jacobi_th,
174
+ 'jacobi_block_size': args.jacobi_block_size,
175
+ 'jacobi_max_iter': args.jacobi_max_iter
176
+ })
177
+ else:
178
+ sampling_kwargs.update({
179
+ 'attn_temp': args.attn_temp,
180
+ 'annealed_guidance': False
181
+ })
182
+
183
+ return sampling_kwargs
184
+
185
+
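A quick usage sketch of the builder above in Jacobi mode; it only reads the listed fields, so a bare Namespace suffices (assumes sample.py and its dependencies import cleanly):

import argparse
from sample import build_sampling_kwargs

ns = argparse.Namespace(cfg=3.0, guide_top=None, return_sequence=0,
                        jacobi=1, context_length=None,
                        jacobi_th=0.005, jacobi_block_size=64, jacobi_max_iter=32)
kwargs = build_sampling_kwargs(ns, caption_name="demo")
assert kwargs["jacobi_block_size"] == 64 and "attn_temp" not in kwargs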
186
+ def main(args: argparse.Namespace) -> None:
187
+ """Main sampling function."""
188
+ # Load model configuration and merge with command line args
189
+ trainer_args = load_model_config(args.model_config_path)
190
+ trainer_dict = vars(trainer_args)
191
+ trainer_dict.update(vars(args))
192
+ args = argparse.Namespace(**trainer_dict)
193
+
194
+ # Handle target length configuration for video
195
+ if args.target_length is not None:
196
+ assert args.vid_size is not None, "it must be a video model to use target_length"
197
+ assert args.jacobi == 1, "target_length is only supported with jacobi sampling"
198
+ if args.target_length == 1: # generate single image
199
+ args.vid_size = None
200
+ args.out_fps = 0
201
+ else:
202
+ args.local_attn_window = (int(args.vid_size.split(':')[0]) - 1) // 4 + 1
203
+ args.vid_size = f"{args.target_length}:16"
204
+ if args.context_length is None:
205
+ args.context_length = args.local_attn_window - 1
206
+
207
+ # Override some settings for sampling
208
+ args.fsdp = 1 # sampling using FSDP if available.
209
+ if args.use_pretrained_lm is not None:
210
+ args.text = args.use_pretrained_lm
211
+
212
+ # Setup model and components
213
+ model, vae, (tokenizer, text_encoder, dist, device) = setup_model_and_components(args)
214
+
215
+ # Setup output directory
216
+ model_name = pathlib.Path(args.checkpoint_path).stem
217
+ sample_dir: pathlib.Path = args.logdir / f'{model_name}'
218
+ if dist.local_rank == 0:
219
+ sample_dir.mkdir(parents=True, exist_ok=True)
220
+ dist.barrier()
221
+
222
+ print(f'{" Load ":-^80} {model_name}')
223
+
224
+ # Prepare captions and sampling parameters
225
+ fixed_y, fixed_idxs, num_samples, caption_name = prepare_captions(args, dist)
226
+ print(f'Sampling {num_samples} from {args.caption} on {dist.world_size} GPU(s)')
227
+
228
+ get_noise = get_noise_shape(args, vae)
229
+ sampling_kwargs = build_sampling_kwargs(args, caption_name)
230
+ noise_std = args.target_noise_std if args.target_noise_std else args.noise_std
231
+
232
+ # Start sampling
233
+ print(f'Starting sampling with global batch size {args.sample_batch_size}x{dist.world_size} GPUs')
234
+ torch.cuda.synchronize()
235
+ start_time = time.time()
236
+
237
+ with torch.no_grad():
238
+ with torch.autocast(device_type='cuda', dtype=torch.float32):
239
+ for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
240
+ # Determine aspect ratio and image shape
241
+ x_aspect = args.aspect_ratio if args.mix_aspect else None
242
+ if x_aspect == "random":
243
+ x_aspect = np.random.choice([
244
+ "1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4", "21:9", "9:21"
245
+ ])
246
+
247
+ x_shape = aspect_ratio_to_image_size(
248
+ args.img_size * vae.downsample_factor, x_aspect,
249
+ multiple=vae.downsample_factor * args.patch_size
250
+ )
251
+
252
+ # Setup text encoder kwargs
253
+ text_encoder_kwargs = dict(
254
+ aspect_ratio=x_aspect,
255
+ fps=args.out_fps if args.fps_cond else None,
256
+ noise_std=noise_std if args.cond_noise_level else None
257
+ )
258
+
259
+ # Handle video dimensions
260
+ if args.vid_size is not None:
261
+ vid_size = tuple(map(int, args.vid_size.split(':')))
262
+ out_fps = args.out_fps if args.fps_cond else vid_size[1]
263
+ num_frames = vid_size[0]
264
+ x_shape = (x_shape[0], x_shape[1], num_frames)
265
+ else:
266
+ out_fps = args.out_fps
267
+
268
+ # Prepare batch and captions
269
+ b = args.sample_batch_size
270
+ y = fixed_y[i * b : (i + 1) * b]
271
+ y_caption = copy.deepcopy(y)
272
+
273
+ # Add null captions for CFG
274
+ if args.cfg > 0:
275
+ y += [""] * len(y)
276
+
277
+ # Prepare text & noise
278
+ y = encode_text(text_encoder, tokenizer, y, args.txt_size, device, **text_encoder_kwargs)
279
+ noise = get_noise(len(y_caption), x_shape).to(device)
280
+
281
+ # Prepare input image if specified
282
+ if args.input_image is not None:
283
+ input_image = prepare_input_image(args, x_shape, vae, device, noise_std)
284
+ input_image = repeat(input_image, '1 c h w -> b c h w', b=b)
285
+
286
+ assert args.cfg > 0, "CFG is required for image conditioned generation"
287
+ kv_caches = model(input_image.unsqueeze(1), y, context=True)
288
+ else:
289
+ input_image, kv_caches = None, None
290
+
291
+ # Generate samples
292
+ samples = model(noise, y, reverse=True, kv_caches=kv_caches, **sampling_kwargs)
293
+ del kv_caches; torch.cuda.empty_cache() # free up memory
294
+
295
+ # Apply denoising if enabled
296
+ samples = process_denoising(
297
+ samples, y_caption, args, model, text_encoder,
298
+ tokenizer, text_encoder_kwargs, noise_std
299
+ )
300
+
301
+ # Decode with VAE if available
302
+ if args.vae is not None:
303
+ dec_fn = vae.decode
304
+ else:
305
+ dec_fn = lambda x: x
306
+
307
+ if isinstance(samples, list):
308
+ samples = torch.cat([dec_fn(s) for s in samples], dim=-1)
309
+ else:
310
+ samples = dec_fn(samples)
311
+
312
+ # Save samples using unified function
313
+ print(f' Saving samples ... {sample_dir}')
314
+
315
+ # Determine save mode based on args
316
+ if args.save_folder and args.caption.endswith('.txt'):
317
+ grid_mode = "individual" # Save individual files when using caption file
318
+ else:
319
+ grid_mode = "auto" # Use automatic grid arrangement
320
+
321
+ save_samples_unified(
322
+ samples=samples,
323
+ save_dir=sample_dir,
324
+ filename_prefix=caption_name[:200] if len(caption_name) > 0 else "samples",
325
+ epoch_or_iter=i,
326
+ fps=out_fps,
327
+ dist=dist,
328
+ wandb_log=False, # Let sample.py handle its own wandb logging
329
+ grid_arrangement=grid_mode
330
+ )
331
+
332
+ # Print timing statistics
333
+ torch.cuda.synchronize()
334
+ elapsed_time = time.time() - start_time
335
+ print(f'{model_name} cfg {args.cfg:.2f}, bsz={args.sample_batch_size}x{dist.world_size}, '
336
+ f'time={elapsed_time:.2f}s, speed={num_samples / elapsed_time:.2f} images/s')
337
+
338
+
339
+ if __name__ == '__main__':
340
+ parser = argparse.ArgumentParser()
341
+
342
+ # Model config
343
+ parser.add_argument('--model_config_path', required=True, type=str, help='path to YAML config file or directory containing config file')
344
+ parser.add_argument('--checkpoint_path', required=True, type=str, help='path to local checkpoint file (required when using model_config_path)')
345
+ parser.add_argument('--save_folder', default=0, type=int)
346
+
347
+ # Caption, condition
348
+ parser.add_argument('--caption', type=str, required=True, help='Caption input (required)')
349
+ parser.add_argument('--input_image', default=None, type=str, help='path to the input image for image-conditioned generation')
350
+ parser.add_argument('--aspect_ratio', default="1:1", type=str, choices=["random", "1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4", "21:9", "9:21"])
351
+ parser.add_argument('--out_fps', default=8, type=int, help='fps for video datasets, only useful if fps_cond is set to 1')
352
+
353
+ # Sampling parameters
354
+ parser.add_argument('--seed', default=191, type=int)
355
+ parser.add_argument('--denoising_batch_size', default=1, type=int)
356
+ parser.add_argument('--self_denoising_lr', default=1, type=float)
357
+ parser.add_argument('--disable_learnable_denoiser', default=0, type=int)
358
+ parser.add_argument('--attn_temp', default=1, type=float)
359
+ parser.add_argument('--jacobi_th', default=0.005, type=float)
360
+ parser.add_argument('--jacobi', default=0, type=int)
361
+ parser.add_argument('--jacobi_block_size', default=64, type=int)
362
+ parser.add_argument('--jacobi_max_iter', default=32, type=int)
363
+ parser.add_argument('--num_samples', default=50000, type=int)
364
+ parser.add_argument('--sample_batch_size', default=16, type=int)
365
+ parser.add_argument('--return_sequence', default=0, type=int)
366
+ parser.add_argument('--cfg', default=5, type=float)
367
+ parser.add_argument('--guide_top', default=None, type=int)
368
+ parser.add_argument('--finetuned_vae', default="px82zaheuu", type=str)
369
+ parser.add_argument('--vae_adapter', default=None)
370
+ parser.add_argument('--target_noise_std', default=None, help="option to use different noise_std from the config")
371
+
372
+ # Video-specific parameters
373
+ parser.add_argument('--target_length', default=None, type=int, help="target video length; may be longer than the length used during training")
374
+ parser.add_argument('--context_length', default=16, type=int, help="context length used for consecutive sampling")
375
+ args = parser.parse_args()
376
+
377
+ if args.input_image and args.input_image == 'none':
378
+ args.input_image = None
379
+ main(args)
transformer_flow.py ADDED
@@ -0,0 +1,1356 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import copy
6
+ import tqdm
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ from typing import List, Tuple
12
+ from misc.pe import VisionRotaryEmbeddingFast, apply_rope, get_positions
13
+ from misc import print
14
+ from functools import partial
15
+ from einops import rearrange, repeat
16
+ from torch.utils.checkpoint import checkpoint
17
+
18
+ INV_SOFTPLUS_1 = 0.541324854612918
19
+
20
+ def modulate(x, shift, scale):
21
+ if shift is None:
22
+ return x * (1 + scale)
23
+ return x * (1 + scale) + shift
24
+
25
+
26
+ def stable_neg_log_softplus(x):
27
+ return torch.where(
28
+ x > 20, # softplus(x) ≈ x → log ≈ log(x)
29
+ -x.log(), # so -log(softplus(x)) ≈ -log(x)
30
+ -F.softplus(x).log()
31
+ )
32
+
33
+
34
+ class KVCache:
35
+
36
+ def __init__(self):
37
+ self._is_empty = True
38
+ self.prefix_cache = None
39
+ self.meta_data = {}
40
+
41
+ def initialize(self, num_blocks, *size):
42
+ self._is_empty = False
43
+ self.num_blocks = num_blocks
44
+ self.size = size
45
+ self.kv_caches = [torch.zeros(2, *size) for _ in range(num_blocks)]
46
+ self.kv_index = [0] * num_blocks
47
+
48
+ def register_prefix_cache(self, prefix_cache):
49
+ self.prefix_cache = prefix_cache
50
+
51
+ @property
52
+ def is_empty(self):
53
+ return self._is_empty
54
+
55
+ @property
56
+ def is_full(self):
57
+ if self.is_empty:
58
+ return False
59
+ return all(index == self.size[2] for index in self.kv_index)
60
+
61
+ def delete(self):
62
+ if not self.is_empty:
63
+ self._is_empty = True
64
+ del self.kv_caches
65
+ del self.kv_index
66
+
67
+ def to(self, device, dtype=torch.bfloat16):
68
+ for i in range(self.num_blocks):
69
+ self.kv_caches[i] = self.kv_caches[i].to(device=device, dtype=dtype)
70
+
71
+ def extend_length(self, length):
72
+ assert not self.is_empty, "KVCache is empty, cannot extend length"
73
+ self.size = (self.size[0], self.size[1], self.size[2] + length, self.size[3])
74
+ for i in range(self.num_blocks):
75
+ pad = self.kv_caches[i].new_zeros((2, *self.size))
76
+ pad[:, :, :, :self.kv_caches[i].size(3)] = self.kv_caches[i]
77
+ self.kv_caches[i] = pad
78
+
79
+ def expand_batch(self, ratio=2):
80
+ self.size = (self.size[0] * ratio, *self.size[1:])
81
+ for i in range(self.num_blocks):
82
+ self.kv_caches[i] = torch.cat([self.kv_caches[i] for _ in range(ratio)], dim=1)
83
+
84
+ def remove_negative_cache(self):
85
+ self.size = (self.size[0] // 2, *self.size[1:])
86
+ for i in range(self.num_blocks):
87
+ self.kv_caches[i] = self.kv_caches[i].chunk(2, dim=1)[0]
88
+
89
+ def backward_in_time(self, l):
90
+ for i in range(self.num_blocks):
91
+ self.kv_index[i] = max(0, self.kv_index[i] - l)
92
+
93
+ def reset_kv_index(self):
94
+ for i in range(self.num_blocks):
95
+ self.kv_index[i] = 0
96
+
97
+ def __call__(self, block_idx, k, v):
98
+ assert block_idx < self.num_blocks, f'block_idx {block_idx} out of range {self.num_blocks}'
99
+ # write cache
100
+ l = k.size(2)
101
+ kv_index = self.kv_index[block_idx]
102
+
103
+ if kv_index + l > self.size[2]:
104
+ raise NotImplementedError("Overflow mode is not implemented")
105
+
106
+ self.kv_caches[block_idx][0][:, :, kv_index: kv_index+l] = k
107
+ self.kv_caches[block_idx][1][:, :, kv_index: kv_index+l] = v
108
+ self.kv_index[block_idx] = kv_index + l
109
+
110
+ # read cache
111
+ kv_index = self.kv_index[block_idx]
112
+ return self.kv_caches[block_idx][0][:, :, :kv_index], self.kv_caches[block_idx][1][:, :, :kv_index]
113
+
114
+
115
+ class Permutation(torch.nn.Module):
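A hedged usage sketch of the cache above (assuming transformer_flow is importable): one block, batch 1, 4 heads, head dim 8, capacity 16 tokens; a second write returns the concatenation of everything cached so far.

import torch
from transformer_flow import KVCache

cache = KVCache()
cache.initialize(1, 1, 4, 16, 8)                 # num_blocks, then size = (B, heads, max_len, head_dim)
k1, v1 = torch.randn(1, 4, 3, 8), torch.randn(1, 4, 3, 8)
k_all, v_all = cache(0, k1, v1)
assert k_all.shape == (1, 4, 3, 8)
k2, v2 = torch.randn(1, 4, 2, 8), torch.randn(1, 4, 2, 8)
k_all, v_all = cache(0, k2, v2)
assert k_all.shape == (1, 4, 5, 8) and torch.equal(k_all[:, :, :3], k1)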
116
+
117
+ def __init__(self, seq_length: int):
118
+ super().__init__()
119
+ self.seq_length = seq_length
120
+ self.input_shape = None
121
+
122
+ def forward(self, x: torch.Tensor | List[torch.Tensor], dim: int = 1, inverse: bool = False):
123
+ if not inverse:
124
+ self.input_shape = x.shape
125
+ x = rearrange(x, 'b t h w c -> b (t h w) c' if x.dim() == 5 else 'b h w c -> b (h w) c')
126
+ x = self.permute(x, dim, self.input_shape, inverse=False)
127
+ else:
128
+ x = self.permute(x, dim, self.input_shape, inverse=True)
129
+ x = x.reshape(-1, *self.input_shape[1:])
130
+ return x
131
+
132
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
133
+ raise NotImplementedError('Overload me')
134
+
135
+
136
+ class PermutationIdentity(Permutation):
137
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
138
+ return x.clone()
139
+
140
+
141
+ class PermutationFlip(Permutation):
142
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
143
+ return x.flip(dims=[dim])
144
+
145
+
146
+ class PermutationFlipInBlock(Permutation):
147
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
148
+ assert shape is not None, "shape must be provided for PermutationFlipInBlock"
149
+ if len(shape) == 5:
150
+ assert dim == 1, "dim must be 1 for 5D tensor in PermutationFlipInBlock"
151
+ # flip the token order within each temporal block (shape[1] blocks); the block order itself is unchanged
152
+ x = x.view(x.size(0), shape[1], -1, x.size(-1)).flip(dims=[2]).view_as(x)
153
+ else:
154
+ x = x.flip(dims=[dim])
155
+ return x
156
+
157
+
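A round-trip check for the flip permutation above (assuming transformer_flow is importable): forward flattens the spatial grid and reverses the token order, and inverse=True restores the original tensor.

import torch
from transformer_flow import PermutationFlip

perm = PermutationFlip(seq_length=16)
x = torch.randn(2, 4, 4, 8)                     # (b, h, w, c)
z = perm(x)                                     # (b, 16, 8), tokens in reversed order
assert torch.equal(z[:, 0], x[:, 3, 3])         # the last spatial position comes first
x_back = perm(z, inverse=True)
assert torch.equal(x_back, x)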
158
+ class RMSNorm(torch.nn.Module):
159
+
160
+ def __init__(
161
+ self,
162
+ dim: int,
163
+ eps: float = 1e-6,
164
+ add_unit_offset: bool = True,
165
+ ):
166
+ super().__init__()
167
+ self.eps = eps
168
+ self.add_unit_offset = add_unit_offset
169
+ self.weight = torch.nn.Parameter(torch.zeros(dim))
170
+
171
+ def _norm(self, x):
172
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
173
+
174
+ def forward(self, x):
175
+ # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
176
+ # See https://github.com/huggingface/transformers/pull/29402
177
+ output = self._norm(x.float())
178
+ if self.add_unit_offset:
179
+ output = output * (1 + self.weight.float())
180
+ else:
181
+ output = output * self.weight.float()
182
+ return output.type_as(x)
183
+
184
+
185
+ class Attention(torch.nn.Module):
186
+ def __init__(self, in_channels: int, head_channels: int, norm_type: str = "layer_norm",
187
+ num_heads=None, num_kv_heads=None, use_qk_norm=False,
188
+ use_post_norm=False, use_bias=True, hf_style_rope=False, non_causal=False):
189
+ super().__init__()
190
+ if norm_type == "layer_norm":
191
+ self.norm = torch.nn.LayerNorm(in_channels)
192
+ elif norm_type == "rms_norm":
193
+ self.norm = RMSNorm(in_channels)
194
+ else:
195
+ self.norm = torch.nn.Identity()
196
+ self.head_channels = head_channels
197
+ self.num_heads = num_heads if num_heads is not None else in_channels // head_channels
198
+ self.num_kv_heads = num_kv_heads if num_kv_heads is not None else self.num_heads # GQA
199
+ self.q_size = self.num_heads * head_channels
200
+ self.kv_size = self.num_kv_heads * head_channels
201
+ self.qkv = torch.nn.Linear(in_channels, self.q_size + 2 * self.kv_size, bias=use_bias)
202
+ self.proj = torch.nn.Linear(self.q_size, in_channels, bias=use_bias)
203
+ self.query_norm = (RMSNorm(self.head_channels) if use_qk_norm else None)
204
+ self.key_norm = (RMSNorm(self.head_channels) if use_qk_norm else None)
205
+ self.post_norm = (RMSNorm(in_channels) if use_post_norm else None)
206
+ self.sqrt_scale = head_channels ** (-0.25)
207
+ self.hf_style_rope = hf_style_rope
208
+ self.non_causal = non_causal
209
+
210
+ def apply_rope(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
211
+ if self.hf_style_rope:
212
+ return rearrange(apply_rope(rearrange(x, '... (u d) -> ... (d u)', u=2), freqs_cis), '... (d u) -> ... (u d)', u=2)
213
+ return apply_rope(x, freqs_cis)
214
+
215
+ def prepare_for_attention(self, x: torch.Tensor, freqs_cis=None, kv_cache=None):
216
+ B, T, _ = x.size()
217
+ q, k, v = self.qkv(self.norm(x)).split([self.q_size, self.kv_size, self.kv_size], dim=-1)
218
+ q = q.view(B, T, self.num_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
219
+ k = k.view(B, T, self.num_kv_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
220
+ v = v.view(B, T, self.num_kv_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
221
+ if self.query_norm is not None and self.key_norm is not None:
222
+ q, k = self.query_norm(q), self.key_norm(k)
223
+
224
+ if kv_cache is not None:
225
+ k, v = kv_cache(k, v)
226
+
227
+ if freqs_cis is not None:
228
+ lq, lk = q.size(2), k.size(2)
229
+ q, k = self.apply_rope(q, freqs_cis[lk-lq:lk]), self.apply_rope(k, freqs_cis[:lk])
230
+
231
+ if self.num_kv_heads != self.num_heads: # GQA (b, h, t, d)
232
+ k = torch.repeat_interleave(k, self.num_heads // self.num_kv_heads, dim=1)
233
+ v = torch.repeat_interleave(v, self.num_heads // self.num_kv_heads, dim=1)
234
+
235
+ return q.to(x.dtype), k.to(x.dtype), v.to(x.dtype)
236
+
237
+ def output_after_attention(self, x: torch.Tensor):
238
+ B, _, T, _ = x.shape
239
+ x = x.transpose(1, 2).reshape(B, T, self.q_size)
240
+ x = self.proj(x)
241
+ if self.post_norm is not None:
242
+ x = self.post_norm(x)
243
+ return x
244
+
245
+ def apply_attention(self, q, k, v, mask=None, temp=1.0):
246
+ scale = self.sqrt_scale**2 / temp
247
+ is_causal = not self.non_causal
248
+ if is_causal and q.size(2) < k.size(2) and mask is None:
249
+ prefix_len = k.size(2) - q.size(2)
250
+ mask = torch.tril(torch.ones(q.size(2), k.size(2), device=q.device, dtype=torch.bool), diagonal=prefix_len)
251
+
252
+ if mask is not None:
253
+ mask = mask.bool()
254
+ is_causal = False
255
+
256
+ # scaled dot-product attention (SDPA)
257
+ x = torch.nn.functional.scaled_dot_product_attention(
258
+ q, k, v, attn_mask=mask, is_causal=is_causal, scale=scale)
259
+ return x
260
+
261
+ def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None, temp: float = 1.0, freqs_cis=None, kv_cache=None,
262
+ ) -> torch.Tensor:
263
+ q, k, v = self.prepare_for_attention(x, freqs_cis, kv_cache)
264
+ x = self.apply_attention(q, k, v, mask, temp)
265
+ x = self.output_after_attention(x)
266
+ return x
267
+
268
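+ # [Added sketch, not part of the original model code] Grouped-query attention (GQA) as used in
+ # Attention.prepare_for_attention above: when num_kv_heads < num_heads, each KV head is repeated
+ # so SDPA sees matching head counts. Shapes below are illustrative assumptions.
+ def _gqa_expansion_sketch():
+     k = torch.randn(1, 2, 10, 64)                       # (batch, kv_heads=2, seq, head_dim)
+     k_full = torch.repeat_interleave(k, 8 // 2, dim=1)  # expand to num_heads=8 -> (1, 8, 10, 64)
+     return k_full.shape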
+
269
+ class MLP(torch.nn.Module):
270
+ def __init__(self, channels: int, expansion: float, use_swiglu=False, norm_type="layer_norm", use_post_norm=False, use_bias=True):
271
+ super().__init__()
272
+ if norm_type == "layer_norm":
273
+ self.norm = torch.nn.LayerNorm(channels)
274
+ elif norm_type == "rms_norm":
275
+ self.norm = RMSNorm(channels)
276
+ else:
277
+ self.norm = torch.nn.Identity()
278
+ self.post_norm = (RMSNorm(channels) if use_post_norm else None)
279
+ self.use_swiglu = use_swiglu
280
+
281
+ intermediate_channels = int(channels * expansion)
282
+ if use_swiglu:
283
+ self.gate_proj = torch.nn.Linear(channels, intermediate_channels, bias=use_bias)
284
+ self.up_proj = torch.nn.Linear(channels, intermediate_channels, bias=use_bias)
285
+ self.down_proj = torch.nn.Linear(intermediate_channels, channels, bias=use_bias)
286
+ else:
287
+ self.main = torch.nn.Sequential(
288
+ torch.nn.Linear(channels, intermediate_channels, bias=use_bias),
289
+ torch.nn.GELU(), torch.nn.Linear(intermediate_channels, channels, bias=use_bias)
290
+ )
291
+
292
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
293
+ if self.use_swiglu:
294
+ x = self.norm(x)
295
+ x = self.down_proj(F.gelu(self.gate_proj(x), approximate='tanh') * self.up_proj(x))
296
+ else:
297
+ x = self.main(self.norm(x))
298
+ return self.post_norm(x) if self.post_norm is not None else x
299
+
300
+
301
+ class AttentionBlock(torch.nn.Module):
302
+ def __init__(self, channels: int, head_channels: int, expansion: float = 4, use_adaln: bool = False,
303
+ use_swiglu=False, norm_type="layer_norm", num_heads=None, num_kv_heads=None,
304
+ use_qk_norm=False, use_post_norm=False, use_bias=True, hf_style_rope=False, non_causal=False):
305
+ super().__init__()
306
+ if use_adaln:
307
+ self.adaLN_modulation = torch.nn.Sequential(
308
+ torch.nn.SiLU(),
309
+ torch.nn.Linear(channels, 4 * channels, bias=True),
310
+ )
311
+ self.norm1 = torch.nn.LayerNorm(channels, elementwise_affine=False, eps=1e-6)
312
+ self.norm2 = torch.nn.LayerNorm(channels, elementwise_affine=False, eps=1e-6)
313
+
314
+ torch.nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
315
+ torch.nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
316
+
317
+ # adaLN applies its own pre-norms (norm1/norm2) above, so force norm_type to "none" inside the sub-modules
318
+ norm_type = 'none'
319
+ else:
320
+ self.adaLN_modulation = None
321
+
322
+ self.attention = Attention(channels, head_channels, norm_type, num_heads, num_kv_heads, use_qk_norm, use_post_norm, use_bias, hf_style_rope, non_causal)
323
+ self.mlp = MLP(channels, expansion, use_swiglu, norm_type, use_post_norm, use_bias)
324
+
325
+ def forward(
326
+ self, x: torch.Tensor, y: torch.Tensor | None = None, attn_mask: torch.Tensor | None = None,
327
+ attn_temp: float = 1.0, c=None, freqs_cis=None, kv_cache=None,
328
+ checkpoint_attn: bool = False, checkpoint_mlp: bool = False
329
+ ) -> torch.Tensor:
330
+ assert (x is not None) or (y is not None), "x or y must be provided"
331
+ z = torch.cat([y, x], 1) if (x is not None) and (y is not None) else x if x is not None else y
332
+ if self.adaLN_modulation is not None and c is not None:
333
+ shift_msa, scale_msa, shift_mlp, scale_mlp = self.adaLN_modulation(c).chunk(4, dim=-1)
334
+ z = z + self._forward_attention(z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn, shift_msa, scale_msa)
335
+ z = z + self._forward_mlp(z, checkpoint_mlp, shift_mlp, scale_mlp)
336
+ else:
337
+ z = z + self._forward_attention(z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn)
338
+ z = z + self._forward_mlp(z, checkpoint_mlp)
339
+ x, y = (z[:, y.size(1):], z[:, :y.size(1)]) if (x is not None) and (y is not None) \
340
+ else (z, None) if x is not None else (None, z)
341
+ return x, y
342
+
343
+ def _forward_attention(self, z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn, shift=None, scale=None):
344
+ def attn_fn(z_in):
345
+ if shift is not None and scale is not None:
346
+ z_in = modulate(self.norm1(z_in), shift, scale)
347
+ return self.attention(z_in, attn_mask, attn_temp, freqs_cis, kv_cache)
348
+
349
+ return checkpoint(attn_fn, z, use_reentrant=False) if checkpoint_attn and self.training else attn_fn(z)
350
+
351
+ def _forward_mlp(self, z, checkpoint_mlp, shift=None, scale=None):
352
+ def mlp_fn(z_in):
353
+ if shift is not None and scale is not None:
354
+ z_in = modulate(self.norm2(z_in), shift, scale)
355
+ return self.mlp(z_in)
356
+
357
+ return checkpoint(mlp_fn, z, use_reentrant=False) if checkpoint_mlp and self.training else mlp_fn(z)
358
+
359
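+ # [Added note, not in the original code] Data flow of AttentionBlock.forward above when both
+ # modalities are present: text tokens y are prepended to image tokens x,
+ #     z = torch.cat([y, x], dim=1)   # (B, Ty + Tx, C)
+ # the joint sequence goes through attention and the MLP with residual connections, and is then
+ # split back via x = z[:, y.size(1):], y = z[:, :y.size(1)].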
+
360
+ class MetaBlock(torch.nn.Module):
361
+ attn_mask: torch.Tensor
362
+
363
+ def __init__(
364
+ self,
365
+ in_channels: int,
366
+ channels: int,
367
+ img_size: int,
368
+ permutation: Permutation,
369
+ pt_seq_len: int | None = None,
370
+ num_layers: int = 1,
371
+ head_dim: int = 64,
372
+ num_heads: None | int = None,
373
+ num_kv_heads: None | int = None,
374
+ txt_size: int = 0,
375
+ txt_dim: int = 0,
376
+ expansion: float = 4,
377
+ use_rope: bool = False,
378
+ use_sos: bool = False,
379
+ use_softplus: bool = False,
380
+ use_swiglu: bool = False,
381
+ use_qk_norm: bool = False,
382
+ use_post_norm: bool = False,
383
+ use_final_norm: bool = False,
384
+ use_bias: bool = True,
385
+ use_proj_txt: bool = True,
386
+ hf_style_rope: bool = False,
387
+ norm_type: str = "layer_norm",
388
+ use_mm_attn: bool = False,
389
+ use_checkpoint: int = 0,
390
+ use_checkpoint_mlp: int | None = None,
391
+ soft_clip: float = 0,
392
+ local_attn_window: int = None,
393
+ ):
394
+ super().__init__()
395
+ out_channels = in_channels * 2
396
+
397
+ self.proj_in = torch.nn.Linear(in_channels, channels)
398
+ self.proj_out = torch.nn.Linear(channels, out_channels)
399
+ if use_sos:
400
+ self.sos_embed = torch.nn.Parameter(torch.randn(1, 1, in_channels))
401
+ torch.nn.init.constant_(self.proj_out.weight, 0)
402
+
403
+ self.txt_size = txt_size
404
+ self.img_size = img_size
405
+ self.txt_dim = txt_dim
406
+ self.pt_seq_len = pt_seq_len or img_size
407
+
408
+ # KV cache configurations
409
+ num_kv_heads = num_kv_heads or (num_heads or channels // head_dim)
410
+ self.kv_cache_size = [num_kv_heads, head_dim]
411
+
412
+ if not use_rope:
413
+ self.pos_embed = torch.nn.Parameter(torch.randn(img_size ** 2, channels) * 1e-2)
414
+ else:
415
+ self.pos_embed = None
416
+
417
+ if txt_dim > 0:
418
+ self.proj_txt = torch.nn.Linear(txt_dim, channels) if use_proj_txt else torch.nn.Identity()
419
+ assert use_proj_txt or (txt_dim == channels), 'text dimension must equal channels when not using projection'
420
+
421
+ self.attn_blocks = torch.nn.ModuleList(
422
+ [AttentionBlock(channels, head_dim, expansion, False, use_swiglu,
423
+ norm_type, num_heads, num_kv_heads, use_qk_norm, use_post_norm, use_bias, hf_style_rope)
424
+ for _ in range(num_layers)])
425
+ self.use_final_norm = use_final_norm
426
+ if use_final_norm:
427
+ self.final_norm = RMSNorm(channels)
428
+
429
+ self.use_softplus = use_softplus
430
+ self.permutation = permutation
431
+ self.use_checkpoint = use_checkpoint
432
+ self.use_checkpoint_mlp = use_checkpoint_mlp
433
+ self.use_sos = use_sos
434
+ self.soft_clip = soft_clip
435
+ self.local_attn_window = local_attn_window
436
+ self.block_masks = {} # for local attention
437
+
438
+ # ---- DEPRECATED: kept only for checkpoint compatibility; do not pass this mask, so flash attention stays enabled ---- #
439
+ self.register_buffer('attn_mask', torch.tril(torch.ones(pt_seq_len ** 2 + txt_size, pt_seq_len ** 2 + txt_size)))
440
+
441
+ def get_freqs_cis(self, x, y, rope):
442
+ # get the input shape
443
+ h, w = x.size(-3), x.size(-2)
444
+ d = x.size(1) if x.dim() == 5 else 0
445
+ txt_size = y.size(1) if self.txt_size > 0 and y is not None else 0
446
+
447
+ if not rope.is_1d: # prepare 2D RoPE
448
+ if self.txt_size > 0 or d > 0: # prepare 3D RoPE
449
+ if self.txt_dim > 0: # text is conditioned
450
+ pos = get_positions(h, w, txt_size, rope.pt_seq_len, d, mode='3d')
451
+ else: # text is not conditioned
452
+ pos = get_positions(h, w, 0, rope.pt_seq_len, d, mode='3d')
453
+ else:
454
+ pos = get_positions(h, w, 0, rope.pt_seq_len, mode='2d')
455
+ else: # prepare 1D RoPE
456
+ pos = get_positions(h, w, txt_size, rope.pt_seq_len, mode='1d')
457
+ return rope(pos.type_as(x))
458
+
459
+ def get_sos_embed(self, x):
460
+ sos_embed = self.sos_embed.expand(x.size(0), -1, -1)
461
+ return sos_embed
462
+
463
+ def get_prepared(self, x):
464
+ # input, output, freqs_cis
465
+ x_in = x.clone()
466
+ if self.use_sos: # add SOS token, predict the first token sos->x_in[0]
467
+ x = torch.cat([self.get_sos_embed(x), x[:, :-1]], dim=1)
468
+ return x_in, x
469
+
470
+ def get_proj_in(self, x):
471
+ x = self.proj_in(x)
472
+ return x
473
+
474
+ def get_proj_out(self, x):
475
+ x = self.proj_out(x)
476
+ if hasattr(self, "soft_clip") and self.soft_clip > 0:
477
+ x = self.soft_clip * torch.tanh(x / self.soft_clip)
478
+ return x
479
+
480
+ def get_local_window_mask(self, x, y):
481
+ _, T, H, W, _ = x.shape
482
+ L = y.size(1) if y is not None else 0
483
+ B = H * W
484
+ N = T * B
485
+ S = L + N
486
+ G = self.local_attn_window
487
+
488
+ def mask(q, k):
489
+ return (k <= q) & ((k < L) | ((k - L) // B > (q - L) // B - G))
490
+
491
+ return mask(torch.arange(S, device=x.device)[:, None], torch.arange(S, device=x.device)[None, :])
492
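+ # [Added note, not in the original code] The mask above is causal (k <= q) and additionally lets
+ # every query attend to all L text tokens plus only those keys whose frame index (computed as
+ # (k - L) // B with B patches per frame) lies within the last `local_attn_window` frames; this is
+ # also what bounds the KV-cache length in initialize_kv_cache below.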
+
493
+ def initialize_kv_cache(self, kv_cache, x, freqs_cis, reuse_kv_cache=False):
494
+ if self.local_attn_window is not None and self.local_attn_window > 0:
495
+ video_frame_size = x.size(-3) * x.size(-2)
496
+ kv_cache_length = self.local_attn_window * video_frame_size
497
+ kv_cache_length += self.txt_size if self.txt_dim > 0 else 0
498
+ kv_cache.meta_data.update(
499
+ {"frame_size": video_frame_size, "txt_size": self.txt_size + 1 if self.txt_dim > 0 else 0})
500
+ else:
501
+ kv_cache_length = freqs_cis.size(0)
502
+
503
+ kv_cache_size = (x.size(0), self.kv_cache_size[0], kv_cache_length, self.kv_cache_size[1])
504
+ if kv_cache.is_empty:
505
+ kv_cache.initialize(len(self.attn_blocks), *kv_cache_size)
506
+ kv_cache.to(x.device, x.dtype)
507
+ else:
508
+ target_size = kv_cache_size[-2]
509
+ if reuse_kv_cache:
510
+ target_size = target_size - kv_cache.kv_index[0]
511
+ kv_cache.extend_length(target_size)
512
+ return kv_cache
513
+
514
+ def forward(self, x: torch.Tensor | List[torch.Tensor], y: torch.Tensor | None = None, rope=None, kv_cache=None, guidance=None):
515
+ freqs_cis = self.get_freqs_cis(x, y, rope) if rope is not None else None
516
+ attn_mask = None
517
+ if kv_cache is not None:
518
+ kv_cache = self.initialize_kv_cache(kv_cache, x, freqs_cis)
519
+
520
+ x = self.permutation(x)
521
+ pos_embed = self.permutation(self.pos_embed, dim=0) if self.pos_embed is not None else None
522
+
523
+ # prepare input
524
+ x_in, x = self.get_prepared(x)
525
+ if kv_cache is not None:
526
+ kv_cache.register_prefix_cache(x_in)
527
+
528
+ # input projection
529
+ x = self.get_proj_in(x)
530
+ if pos_embed is not None:
531
+ x = x + pos_embed
532
+
533
+ # conditioning
534
+ if self.txt_dim > 0:
535
+ y = self.proj_txt(y)
536
+ else:
537
+ y = None
538
+
539
+ # main block
540
+ for it, block in enumerate(self.attn_blocks):
541
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
542
+
543
+ # Frequency-based checkpointing strategy:
544
+ # - Checkpoint attention every use_checkpoint blocks (if use_checkpoint > 0)
545
+ # - Checkpoint MLP every use_checkpoint_mlp blocks (if provided), otherwise every use_checkpoint blocks
546
+ checkpoint_attn = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
547
+ if self.use_checkpoint_mlp is not None:
548
+ checkpoint_mlp = self.training and self.use_checkpoint_mlp > 0 and ((it + 1) % self.use_checkpoint_mlp == 0)
549
+ else:
550
+ checkpoint_mlp = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
551
+
552
+ x, y = block(x, y, attn_mask, 1.0, None, freqs_cis, _kv_cache,
553
+ checkpoint_attn=checkpoint_attn,
554
+ checkpoint_mlp=checkpoint_mlp)
555
+
556
+ # final norm
557
+ if self.use_final_norm:
558
+ x, y = self.final_norm(x), (self.final_norm(y) if y is not None else None)
559
+
560
+ x = self.get_proj_out(x)
561
+ if not self.use_sos: # no SOS token, we need to shift the sequence
562
+ x = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1)
563
+ xa, xb = x.chunk(2, dim=-1)
564
+
565
+ # Store original dtype for output conversion
566
+ original_dtype = xa.dtype
567
+
568
+ # Convert to fp32 for numerical stability
569
+ xa, xb, x_in = xa.float(), xb.float(), x_in.float()
570
+ if not self.use_softplus:
571
+ xa = xa.exp()
572
+ else:
573
+ xa = F.softplus(xa + INV_SOFTPLUS_1)
574
+ if guidance is not None and guidance > 0:
575
+ xb, xa = self.guidance(xa, xb, guidance, 1.0, 'ab')
576
+
577
+ # NOTE: this "scale" is in fact 1/sigma, not sigma
578
+ x = self.permutation((x_in - xb) / xa, inverse=True)
579
+ logdet = -torch.log(xa) # keep all the dimensions
580
+
581
+ # Convert back to original precision
582
+ x = x.to(original_dtype)
583
+ return x, y, logdet
584
+
585
+ def guidance(self, za, zb, guidance, r=1.0, guide_what='ab'):
586
+ za, za_u = [torch.cat([a, a]) for a in za.chunk(2, dim=0)]
587
+ zb, zb_u = [torch.cat([a, a]) for a in zb.chunk(2, dim=0)]
588
+ g = r * guidance
589
+
590
+ def logits_guided(mu_c, sigma_c, mu_u, sigma_u, w):
591
+ # inspired from: (1+w) * logP_cond - w * logP_uncond
592
+ # sigma_c = torch.minimum(sigma_c, sigma_u)
593
+ s = (sigma_c / sigma_u).clip(max=1.0).square()
594
+ sigma_eff = sigma_c / (1 + w - w * s).sqrt()
595
+ mu_eff = ((1 + w) * mu_c - (w * s) * mu_u) / (1 + w - w * s)
596
+ return mu_eff, sigma_eff
597
+
598
+ def original_guidance(mu_c, sigma_c, mu_u, sigma_u, w):
599
+ if 'a' in guide_what:
600
+ sigma_c = sigma_c + g * (sigma_c - sigma_u)
601
+ if 'b' in guide_what:
602
+ mu_c = mu_c + g * (mu_c - mu_u)
603
+ return mu_c, sigma_c
604
+
605
+ #zb, za = original_guidance(zb, za, zb_u, za_u, guidance)
606
+ zb, za = logits_guided(zb, za, zb_u, za_u, guidance)
607
+ return zb, za
608
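+ # [Added note, not in the original code] Why `logits_guided` above has this form: for Gaussian
+ # densities, (1 + w) * log N(x; mu_c, sigma_c^2) - w * log N(x; mu_u, sigma_u^2) is, up to a
+ # constant, again a Gaussian log-density with
+ #     1 / sigma_eff^2 = (1 + w) / sigma_c^2 - w / sigma_u^2
+ #     mu_eff / sigma_eff^2 = (1 + w) * mu_c / sigma_c^2 - w * mu_u / sigma_u^2
+ # Writing s = (sigma_c / sigma_u)^2 (clipped at 1 so the effective variance stays positive) gives
+ # exactly sigma_eff = sigma_c / sqrt(1 + w - w * s) and
+ # mu_eff = ((1 + w) * mu_c - w * s * mu_u) / (1 + w - w * s), as implemented above.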
+
609
+ def reverse_step(
610
+ self, x: torch.Tensor, t: int, kv_cache: KVCache,
611
+ pos_embed: torch.Tensor | None = None, y: torch.Tensor | None = None,
612
+ attn_temp: float = 1.0, freqs_cis=None
613
+ ) -> tuple[torch.Tensor, torch.Tensor]:
614
+ # Store original dtype for sampling tensor
615
+ original_dtype = x.dtype
616
+
617
+ if self.use_sos: # get i-th patch but keep the sequence dimension
618
+ x_in = self.get_sos_embed(x[:, :1]) if t == 0 else x[:, t - 1 : t]
619
+ else:
620
+ x_in = x[:, t : t + 1]
621
+
622
+ # Convert to model's dtype for neural network computation
623
+ if hasattr(self.proj_in, 'weight'):
624
+ target_dtype = self.proj_in.weight.dtype
625
+ x_in = x_in.to(target_dtype)
626
+
627
+ x = self.get_proj_in(x_in)
628
+
629
+ # if positional embedding
630
+ if pos_embed is not None:
631
+ x = x + pos_embed[t: t+1]
632
+
633
+ # main block
634
+ for i, block in enumerate(self.attn_blocks):
635
+ x, _ = block(x, None, attn_temp=attn_temp, freqs_cis=freqs_cis, kv_cache=partial(kv_cache, i))
636
+
637
+ # final norm
638
+ if self.use_final_norm:
639
+ x = self.final_norm(x)
640
+
641
+ x = self.get_proj_out(x)
642
+ xa, xb = x.chunk(2, dim=-1)
643
+
644
+ # Convert back to original dtype for sampling computations
645
+ return xa.to(original_dtype), xb.to(original_dtype)
646
+
647
+ def reverse_step_condition(self, y, kv_cache, pos_embed=None, attn_temp: float = 1.0, freqs_cis=None):
648
+ # Convert to model's dtype for neural network computation
649
+ if hasattr(self.proj_txt, 'weight'):
650
+ target_dtype = self.proj_txt.weight.dtype
651
+ y = y.to(target_dtype)
652
+
653
+ y = self.proj_txt(y)
654
+ for i, block in enumerate(self.attn_blocks):
655
+ _, y = block(None, y, attn_temp=attn_temp, freqs_cis=freqs_cis, kv_cache=partial(kv_cache, i))
656
+ return y
657
+
658
+ def reverse(
659
+ self,
660
+ z: torch.Tensor,
661
+ y: torch.Tensor | None = None,
662
+ guidance: float = 0,
663
+ guide_what: str = 'ab',
664
+ attn_temp: float = 1.0,
665
+ annealed_guidance: bool = False,
666
+ rope=None,
667
+ verbose=False,
668
+ kv_cache: KVCache=KVCache(),
669
+ **unused_kwargs
670
+ ) -> torch.Tensor:
671
+ # Ensure sampling tensors are in float32 for numerical stability
672
+ original_dtype = z.dtype
673
+ z = z.float()
674
+
675
+ freqs_cis = self.get_freqs_cis(z, y, rope) if rope is not None else None
676
+ if guidance > 0:
677
+ z = torch.cat([z, z], 0)
678
+
679
+ # kv cache
680
+ reuse_kv_cache = kv_cache.prefix_cache is not None and kv_cache.kv_index[0] > 0
681
+ kv_cache = self.initialize_kv_cache(kv_cache, z, freqs_cis, reuse_kv_cache)
682
+
683
+ # permute the input
684
+ z = self.permutation(z)
685
+ pos_embed = self.permutation(self.pos_embed, dim=0) if self.pos_embed is not None else None
686
+
687
+ # run additional text condition, results will be used in KV cache.
688
+ if self.txt_dim > 0:
689
+ if not reuse_kv_cache:
690
+ self.reverse_step_condition(y, kv_cache, pos_embed, attn_temp, freqs_cis)
691
+ txt_size = y.size(1) if self.txt_dim > 0 else 0
692
+
693
+ # run the reverse process
694
+ x = z.clone()
695
+ if reuse_kv_cache:
696
+ x[:, :kv_cache.prefix_cache.size(1)] = kv_cache.prefix_cache # fill the prefix cache
697
+
698
+ T = x.size(1) - 1 if not self.use_sos else x.size(1)
699
+ for t in tqdm.trange(T, disable=not verbose, desc='Sub-flow Sampling', leave=False):
700
+ if reuse_kv_cache and kv_cache.kv_index[0] > t + txt_size:
701
+ continue
702
+ za, zb = self.reverse_step(x, t, kv_cache, pos_embed, y, attn_temp, freqs_cis)
703
+ # Ensure sampling computations stay in float32
704
+ za, zb = za.float(), zb.float()
705
+ if not self.use_softplus:
706
+ za, zb = za.exp().squeeze(1), zb.squeeze(1)
707
+ else:
708
+ za, zb = F.softplus(za + INV_SOFTPLUS_1).squeeze(1), zb.squeeze(1)
709
+
710
+ if guidance > 0 and guide_what:
711
+ r = (t + 1) / T if annealed_guidance else 1.0
712
+ zb, za = self.guidance(za, zb, guidance, r, guide_what)
713
+ if self.use_sos:
714
+ x[:, t] = z[:, t] * za + zb
715
+ else:
716
+ x[:, t + 1] = z[:, t + 1] * za + zb
717
+
718
+ if guidance > 0:
719
+ x = x.chunk(2, dim=0)[0]
720
+ kv_cache.remove_negative_cache() # remove the second half of the cache
721
+
722
+ x = self.permutation(x, inverse=True)
723
+ # Convert back to original dtype if needed
724
+ return x.to(original_dtype)
725
+
726
+ def jacobi(self,
727
+ z: torch.Tensor,
728
+ y: torch.Tensor | None = None,
729
+ guidance: float = 0,
730
+ rope=None,
731
+ kv_cache=None,
732
+ verbose=False,
733
+ jacobi_block_size: int = 32,
734
+ jacobi_max_iter: int = 32,
735
+ jacobi_th: float = 0.001,
736
+ context_length: int = None,
737
+ **unused_kwargs) -> torch.Tensor:
738
+ assert self.use_sos, "Jacobi iteration requires SOS token to be used"
739
+ assert self.pos_embed is None, "Jacobi iteration does not support positional embedding"
740
+
741
+ # Ensure sampling tensors are in float32 for numerical stability
742
+ original_dtype = z.dtype
743
+ z = z.float()
744
+
745
+ freqs_cis = self.get_freqs_cis(z, y, rope) if rope is not None else None
746
+ if guidance > 0:
747
+ z = torch.cat([z, z], 0)
748
+ # kv cache
749
+ reuse_kv_cache = kv_cache.prefix_cache is not None and kv_cache.kv_index[0] > 0
750
+ kv_cache = self.initialize_kv_cache(kv_cache, z, freqs_cis, reuse_kv_cache)
751
+ video_length = z.size(1) if z.dim() == 5 else 1
752
+
753
+ # permute the input
754
+ z = self.permutation(z)
755
+
756
+ # prepare input
757
+ x_full = torch.cat([self.get_sos_embed(z), z.clone()], dim=1)
758
+ if reuse_kv_cache:
759
+ x_full[:, 1: kv_cache.prefix_cache.size(1) + 1] = kv_cache.prefix_cache # fill the prefix cache
760
+
761
+ # conditioning
762
+ if self.txt_dim > 0:
763
+ if not reuse_kv_cache:
764
+ self.reverse_step_condition(y, kv_cache, freqs_cis=freqs_cis)
765
+
766
+ txt_size = y.size(1) if self.txt_dim > 0 else 0
767
+ video_frame_size = z.size(1) // video_length
768
+ start_idx = 0
769
+ if reuse_kv_cache:
770
+ start_idx = kv_cache.kv_index[0] - txt_size # start from the last cached index
771
+ prog_bar = tqdm.tqdm(total=z.size(1), disable=not verbose, desc='Block-wise Jacobi Iteration', leave=False)
772
+ prog_bar.update(start_idx)
773
+
774
+ local_attn_window = self.local_attn_window * video_frame_size if self.local_attn_window is not None else None
775
+ target_frame_size = z.size(1) if local_attn_window is None else min(z.size(1), local_attn_window)
776
+ context_size = None if local_attn_window is None else context_length * video_frame_size
777
+ while target_frame_size <= z.size(1):
778
+ while start_idx < target_frame_size:
779
+ chunk_size = jacobi_block_size if start_idx <= video_frame_size else jacobi_block_size * 4
780
+ local_done = torch.zeros((), dtype=torch.bool, device=x_full.device)
781
+ for i in tqdm.tqdm(range(jacobi_max_iter), disable=True, desc='Jacobi Iteration', leave=False):
782
+ if start_idx + chunk_size >= target_frame_size:
783
+ chunk_size = target_frame_size - start_idx
784
+ if i == 0 and start_idx > video_frame_size: # optionally initialize the current chunk from the corresponding tokens of the previous frame
785
+ x = x_full[:, start_idx - video_frame_size: start_idx + chunk_size - video_frame_size]
786
+ else:
787
+ x = x_full[:, start_idx: start_idx + chunk_size]
788
+
789
+ # main forward - convert to model dtype for neural network computation
790
+ if hasattr(self.proj_in, 'weight'):
791
+ target_dtype = self.proj_in.weight.dtype
792
+ x = x.to(target_dtype)
793
+
794
+ x = self.get_proj_in(x)
795
+ for it, block in enumerate(self.attn_blocks):
796
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
797
+ x = block(x, None, freqs_cis=freqs_cis, kv_cache=_kv_cache)[0]
798
+
799
+ if self.use_final_norm:
800
+ x = self.final_norm(x)
801
+ x = self.get_proj_out(x)
802
+ xa, xb = x.chunk(2, dim=-1)
803
+
804
+ # Convert back to float32 for sampling computations
805
+ xa, xb = xa.float(), xb.float()
806
+ if not self.use_softplus:
807
+ xa = xa.exp()
808
+ else:
809
+ xa = F.softplus(xa + INV_SOFTPLUS_1)
810
+ if guidance > 0:
811
+ xb, xa = self.guidance(xa, xb, guidance, 1.0, 'ab')
812
+
813
+ # compute the Jacobi Iteration - all in float32
814
+ new_x = xb + xa * z[:, start_idx: start_idx+chunk_size]
815
+ diff = ((new_x - x_full[:, start_idx+1: start_idx+1+chunk_size]) ** 2).mean() / (new_x ** 2).mean()
816
+ x_full[:, start_idx+1: start_idx+1+chunk_size] = new_x
817
+ if diff < jacobi_th or i == jacobi_max_iter - 1: # converged, or final iteration reached: mark this rank as done
818
+ local_done.fill_(1)
819
+ global_done = local_done.clone()
820
+ torch.distributed.all_reduce(global_done, op=torch.distributed.ReduceOp.MIN)
821
+ if int(global_done.item()) == 1:
822
+ break
823
+
824
+ kv_cache.backward_in_time(chunk_size)
825
+ start_idx += chunk_size
826
+ prog_bar.update(chunk_size)
827
+
828
+ if target_frame_size >= z.size(1):
829
+ break
830
+
831
+ target_frame_size += local_attn_window - context_size if local_attn_window is not None else video_frame_size
832
+ target_frame_size = min(target_frame_size, z.size(1))
833
+
834
+ # re-encode the context with attention blocks
835
+ print(f're-encoding the context {start_idx+1-context_size}:{start_idx+1}')
836
+ kv_cache.reset_kv_index()
837
+ if self.txt_dim > 0:
838
+ self.reverse_step_condition(y, kv_cache, freqs_cis=freqs_cis)
839
+ x_context = x_full[:, start_idx+1-context_size: start_idx+1]
840
+ x_context_in, x_context = self.get_prepared(x_context)
841
+ x_context = self.get_proj_in(x_context)
842
+ for it, block in enumerate(self.attn_blocks):
843
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
844
+ x_context = block(x_context, None, freqs_cis=freqs_cis, kv_cache=_kv_cache)[0]
845
+
846
+ x = x_full[:, 1:] # drop the SOS token
847
+ if guidance > 0:
848
+ x = x.chunk(2, dim=0)[0] # keep only the conditional half (drop the CFG duplicate)
849
+ x = self.permutation(x, inverse=True)
850
+ # Convert back to original dtype if needed
851
+ return x.to(original_dtype)
852
+
853
+
854
+ class IdentityBlock(MetaBlock):
855
+ def __init__(self, *args, **kwargs):
856
+ super(MetaBlock, self).__init__()
857
+
858
+ def forward(self, x, y=None, rope=None, **unused):
859
+ return x, y, x.new_zeros(x.size(0))
860
+
861
+ def reverse(self,
862
+ z: torch.Tensor,
863
+ y: torch.Tensor | None = None,
864
+ guidance: float = 0,
865
+ guide_what: str = 'ab',
866
+ attn_temp: float = 1.0,
867
+ annealed_guidance: bool = False,
868
+ rope=None,
869
+ verbose=False,
870
+ kv_cache: KVCache=KVCache(), **unused):
871
+ # Preserve original dtype
872
+ return z
873
+
874
+ def jacobi(self,
875
+ z: torch.Tensor,
876
+ y: torch.Tensor | None = None,
877
+ guidance: float = 0,
878
+ rope=None,
879
+ kv_cache=None,
880
+ verbose=False,
881
+ jacobi_block_size: int = 64,
882
+ jacobi_th: float = 0.005, **unused_kwargs) -> torch.Tensor:
883
+ return z
884
+
885
+
886
+ class NonCausalBlock(MetaBlock):
887
+ def __init__(
888
+ self,
889
+ in_channels: int,
890
+ channels: int,
891
+ img_size: int,
892
+ pt_seq_len: int | None = None,
893
+ num_layers: int = 8,
894
+ head_dim: int = 64,
895
+ num_heads: None | int = None,
896
+ num_kv_heads: None | int = None,
897
+ txt_size: int = 0,
898
+ txt_dim: int = 0,
899
+ expansion: float = 4,
900
+ use_rope: bool = False,
901
+ use_swiglu: bool = False,
902
+ use_qk_norm: bool = False,
903
+ use_post_norm: bool = False,
904
+ use_final_norm: bool = False,
905
+ use_bias: bool = True,
906
+ hf_style_rope: bool = False,
907
+ norm_type: str = "layer_norm",
908
+ use_checkpoint: int = 0,
909
+ use_checkpoint_mlp: int | None = None,
910
+ block_causal: int = 0,
911
+ window: int = None,
912
+ **unused_kwargs,
913
+ ):
914
+ super(MetaBlock, self).__init__()
915
+ out_channels = in_channels
916
+ self.proj_in = torch.nn.Linear(in_channels, channels)
917
+ self.proj_out = torch.nn.Linear(channels, out_channels)
918
+ torch.nn.init.constant_(self.proj_out.weight, 0)
919
+
920
+ self.txt_size = txt_size
921
+ self.img_size = img_size
922
+ self.txt_dim = txt_dim
923
+ self.pt_seq_len = pt_seq_len or img_size
924
+ self.block_causal = block_causal
925
+ self.window = window
926
+
927
+ # KV cache configurations
928
+ num_kv_heads = num_kv_heads or (num_heads or channels // head_dim)
929
+ self.kv_cache_size = [num_kv_heads, head_dim]
930
+ if txt_dim > 0:
931
+ self.proj_txt = torch.nn.Linear(txt_dim, channels)
932
+
933
+ self.attn_blocks = torch.nn.ModuleList(
934
+ [AttentionBlock(channels, head_dim, expansion, False, use_swiglu, norm_type, num_heads, num_kv_heads,
935
+ use_qk_norm, use_post_norm, use_bias, hf_style_rope, non_causal=True) for _ in range(num_layers)])
936
+ self.use_final_norm = use_final_norm
937
+ if use_final_norm:
938
+ self.final_norm = RMSNorm(channels)
939
+ self.use_checkpoint = use_checkpoint
940
+ self.use_checkpoint_mlp = use_checkpoint_mlp
941
+ self.block_masks = {} # for local attention
942
+
943
+ def get_local_window_mask(self, x, y):
944
+ _, T, H, W, _ = x.shape
945
+ L = y.size(1) if y is not None else 0
946
+ B = H * W
947
+ N = T * B
948
+ S = L + N
949
+ A = self.block_causal
950
+ G = self.window if self.window is not None else 10000
951
+
952
+ def mask(q, k):
953
+ return (k < L) | (
954
+ ((k - L) // B >= (q - L) // B + A - 1 - G) &
955
+ ((k - L) // B <= torch.relu(q - L) // B + A - 1)
956
+ )
957
+
958
+ return mask(torch.arange(S, device=x.device)[:, None], torch.arange(S, device=x.device)[None, :])
959
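+ # [Added note, not in the original code] Interpretation of the denoiser mask above: all L text
+ # tokens are always visible, and a query in frame f can additionally see keys in frames
+ # [f + block_causal - 1 - window, f + block_causal - 1], i.e. a sliding window of frames that
+ # ends (block_causal - 1) frames ahead of the query (effectively unbounded when `window` is None).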
+
960
+ def forward(self, x, y, rope, **unused):
961
+ freqs_cis = self.get_freqs_cis(x, y, rope) if rope is not None else None
962
+ if self.block_causal > 0 and x.dim() == 5:
963
+ attn_mask = self.get_local_window_mask(x, y if self.txt_dim > 0 else None)
964
+ else:
965
+ attn_mask = None
966
+
967
+ if x.dim() == 5: # video input
968
+ N, H, W, x = x.size(1), x.size(2), x.size(3), rearrange(x, 'b t h w c -> b (t h w) c') # flatten x
969
+ else:
970
+ N, H, W, x = 0, x.size(1), x.size(2), rearrange(x, 'b h w c -> b (h w) c') # flatten x
971
+
972
+ x = self.get_proj_in(x)
973
+ y = self.proj_txt(y) if self.txt_dim > 0 else None
974
+
975
+ for it, block in enumerate(self.attn_blocks):
976
+ # Frequency-based checkpointing strategy:
977
+ # - Checkpoint attention every use_checkpoint blocks (if use_checkpoint > 0)
978
+ # - Checkpoint MLP every use_checkpoint_mlp blocks (if provided), otherwise every use_checkpoint blocks
979
+ checkpoint_attn = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
980
+ if self.use_checkpoint_mlp is not None:
981
+ checkpoint_mlp = self.training and self.use_checkpoint_mlp > 0 and ((it + 1) % self.use_checkpoint_mlp == 0)
982
+ else:
983
+ checkpoint_mlp = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
984
+
985
+ x, y = block(x, y, attn_mask, 1.0, None, freqs_cis,
986
+ checkpoint_attn=checkpoint_attn, checkpoint_mlp=checkpoint_mlp)
987
+
988
+ if self.use_final_norm:
989
+ x = self.final_norm(x)
990
+ x = self.get_proj_out(x)
991
+ if N > 0:
992
+ x = rearrange(x, 'b (t h w) d -> b t h w d', t=N, h=H, w=W)
993
+ else:
994
+ x = rearrange(x, 'b (h w) d -> b h w d', h=H, w=W)
995
+ return x
996
+
997
+
998
+ class Model(torch.nn.Module):
999
+ def __init__(
1000
+ self,
1001
+ in_channels: int,
1002
+ img_size: int,
1003
+ patch_size: int,
1004
+ channels: int,
1005
+ num_blocks: int,
1006
+ layers_per_block: List[int],
1007
+ head_dim: int = 64,
1008
+ num_heads: None | int = None,
1009
+ num_kv_heads: None | int = None,
1010
+ rope: bool = False,
1011
+ pt_seq_len: None | int = None,
1012
+ sos: bool = False,
1013
+ txt_size: int = 0,
1014
+ txt_dim: int = 0,
1015
+ cond_top_only: bool = False,
1016
+ use_softplus: bool = False,
1017
+ use_swiglu: bool = False,
1018
+ use_bias: bool = True,
1019
+ use_qk_norm: bool = False,
1020
+ use_post_norm: bool = False,
1021
+ use_final_norm: bool = False,
1022
+ hf_style_rope: bool = False,
1023
+ norm_type: str = "layer_norm",
1024
+ use_checkpoint: int = 0,
1025
+ use_checkpoint_mlp: int | None = None,
1026
+ use_pretrained_lm: str | None = None,
1027
+ use_mm_attn: bool = False,
1028
+ soft_clip: float = 0,
1029
+ seq_order: str = "R2L",
1030
+ learnable_self_denoiser: bool = False,
1031
+ conditional_denoiser: bool = False,
1032
+ temporal_causal: int = 0,
1033
+ top_block_channels: int = None, # If specified, top block uses different size
1034
+ shallow_block_local: bool = False, # If True, shallow blocks only constrained within a frame
1035
+ denoiser_window: int = None, # If specified, use local attention in the denoiser with given window size
1036
+ local_attn_window: int = None, # If specified, use local attention in all blocks with given window size
1037
+ **unused_kwargs,
1038
+ ):
1039
+ super().__init__()
1040
+ self.img_size = img_size
1041
+ self.in_channels = in_channels
1042
+ self.patch_size = patch_size
1043
+ self.pt_seq_len = pt_seq_len or img_size // patch_size
1044
+ self.num_patches = self.pt_seq_len ** 2
1045
+ self.use_rope = rope
1046
+ self.use_sos = sos
1047
+ self.use_softplus = use_softplus
1048
+ self.cond_top_only = cond_top_only
1049
+ self.seq_order = seq_order
1050
+ self.temporal_causal = temporal_causal
1051
+ self.top_block_channels = top_block_channels or channels
1052
+ self.shallow_block_local = shallow_block_local
1053
+ self.expansion_init_std = 0.02
1054
+ assert (not local_attn_window) or shallow_block_local, 'local_attn_window requires shallow_block_local'
1055
+ assert (not shallow_block_local) or self.cond_top_only, 'shallow_block_local requires cond_top_only'
1056
+ assert (not self.cond_top_only) or (txt_size > 0), 'cond_top_only requires txt_size > 0'
1057
+ assert (seq_order == 'L2R') or (temporal_causal == 0), 'seq_order must be L2R if temporal causal is True'
1058
+ permutations = [PermutationIdentity(self.num_patches), PermutationFlip(self.num_patches)] if temporal_causal == 0 else \
1059
+ [PermutationIdentity(self.num_patches), PermutationFlipInBlock(self.num_patches)]
1060
+
1061
+ blocks = []
1062
+ if len(layers_per_block) == 1:
1063
+ layers_per_block = [layers_per_block[0]] * num_blocks
1064
+
1065
+ base_kwargs = dict(
1066
+ in_channels=in_channels * patch_size**2,
1067
+ channels=channels,
1068
+ img_size=img_size // patch_size,
1069
+ pt_seq_len=self.pt_seq_len,
1070
+ txt_size=txt_size,
1071
+ use_rope=self.use_rope, hf_style_rope=hf_style_rope, use_sos=self.use_sos,
1072
+ use_softplus=self.use_softplus,
1073
+ use_swiglu=use_swiglu, use_qk_norm=use_qk_norm,
1074
+ use_post_norm=use_post_norm, use_final_norm=use_final_norm,
1075
+ use_bias=use_bias, norm_type=norm_type, num_heads=num_heads,
1076
+ num_kv_heads=num_kv_heads, head_dim=head_dim,
1077
+ use_checkpoint=use_checkpoint,
1078
+ use_checkpoint_mlp=use_checkpoint_mlp,
1079
+ soft_clip=soft_clip,
1080
+ )
1081
+ # bottom blocks
1082
+ for i in range(num_blocks-1):
1083
+ permutation = permutations[i % 2] if seq_order == 'R2L' else permutations[(i+1) % 2]
1084
+ Block = IdentityBlock if layers_per_block[i] == 0 else MetaBlock
1085
+ blocks.append(Block(permutation=permutation, num_layers=layers_per_block[i], txt_dim=0 if cond_top_only else txt_dim, **base_kwargs))
1086
+
1087
+ # top block
1088
+ gen_kwargs = copy.deepcopy(base_kwargs)
1089
+ if self.top_block_channels != channels:
1090
+ gen_kwargs['channels'] = self.top_block_channels
1091
+ if num_heads is None:
1092
+ gen_kwargs['num_heads'] = self.top_block_channels // head_dim
1093
+ if use_pretrained_lm is not None:
1094
+ gen_kwargs.update(eval(f"{use_pretrained_lm}_kwargs"))
1095
+ if use_mm_attn:
1096
+ gen_kwargs.update({"use_mm_attn": True}) # only top block will receive this
1097
+ else:
1098
+ gen_kwargs.update({"num_layers": layers_per_block[-1]})
1099
+
1100
+ permutation = permutations[(num_blocks-1) % 2] if seq_order == 'R2L' else permutations[(num_blocks) % 2]
1101
+ top_block = MetaBlock(permutation=permutation, txt_dim=txt_dim, local_attn_window=local_attn_window, **gen_kwargs)
1102
+ blocks.append(top_block)
1103
+
1104
+ # put together
1105
+ self.blocks = torch.nn.ModuleList(blocks)
1106
+
1107
+ # Self-denoiser
1108
+ if learnable_self_denoiser:
1109
+ self.learnable_self_denoiser = NonCausalBlock(
1110
+ num_layers=8, block_causal=temporal_causal, window=denoiser_window,
1111
+ txt_dim=0 if not conditional_denoiser else txt_dim,
1112
+ **base_kwargs)
1113
+
1114
+ # setup rotary embeddings
1115
+ if self.use_rope:
1116
+ self.feat_rope = VisionRotaryEmbeddingFast(
1117
+ dim=base_kwargs['head_dim'] // 2, pt_seq_len=base_kwargs['pt_seq_len'], latent_len=txt_size)
1118
+
1119
+ if use_pretrained_lm is not None: # using standard 1D RoPE
1120
+ self.feat_rope_gen = VisionRotaryEmbeddingFast(
1121
+ dim=gen_kwargs['head_dim'] // 2, pt_seq_len=gen_kwargs['pt_seq_len'], no_buffer=True, is_1d=True)
1122
+ else:
1123
+ self.feat_rope_gen = VisionRotaryEmbeddingFast(
1124
+ dim=gen_kwargs['head_dim'] // 2, pt_seq_len=gen_kwargs['pt_seq_len'], latent_len=txt_size, no_buffer=True)
1125
+ else:
1126
+ self.feat_rope = self.feat_rope_gen = None
1127
+
1128
+ # ----- DEPRECATED: not useful -------
1129
+ self.register_buffer('var', torch.ones(self.num_patches, in_channels * patch_size**2))
1130
+
1131
+ def patchify(self, x: List[torch.Tensor] | torch.Tensor, p: int | None = None) -> torch.Tensor:
1132
+ """Convert an image (N,C',H,W) to a sequence of patches (N,T,C')"""
1133
+ if len(x.shape) < 4:
1134
+ return x # no need patchify
1135
+ H, W = x.shape[-2], x.shape[-1]
1136
+ p = self.patch_size * p if p is not None else self.patch_size
1137
+ assert H % p == 0 and W % p == 0, "H and W must be divisible by patch_size"
1138
+ x = rearrange(x, '... c (h p1) (w p2) -> ... h w (p1 p2 c)', p1=p, p2=p)
1139
+ return x
1140
+
1141
+ def unpatchify(self, x: List[torch.Tensor] | torch.Tensor, p: int | None = None) -> torch.Tensor:
1142
+ """Convert a sequence of patches (N,T,C) to an image (N,C',H,W)"""
1143
+ if len(x.shape) < 4:
1144
+ return x # no need unpatchify
1145
+ p = self.patch_size * p if p is not None else self.patch_size
1146
+ H, W = x.shape[-3], x.shape[-2]
1147
+ return rearrange(x, '... h w (p1 p2 c) -> ... c (h p1) (w p2)', h=H, w=W, p1=p, p2=p)
1148
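+ # [Added note, not in the original code] Shape example for patchify/unpatchify above, with
+ # illustrative sizes: an input of shape (N, C, H, W) = (2, 16, 32, 32) and patch_size p = 2
+ # becomes (N, H/p, W/p, p*p*C) = (2, 16, 16, 64); unpatchify inverts this exactly.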
+
1149
+ def get_loss(self,
1150
+ z: torch.Tensor | List[torch.Tensor],
1151
+ logdets: torch.Tensor | List[torch.Tensor],
1152
+ weights: torch.Tensor | None = None,
1153
+ drop_first=False) -> dict[str, torch.Tensor]:
1154
+ if drop_first:
1155
+ z, logdets = z[:, 1:], [logdet[:, 1:] for logdet in logdets]
1156
+ loss_z = 0.5 * z.pow(2).mean(dim=tuple(range(1, z.dim())))
1157
+ loss_logdet = -sum([logdet.mean(dim=tuple(range(1, logdet.dim()))) for logdet in logdets])
1158
+ loss = loss_z + loss_logdet
1159
+ if weights is not None:
1160
+ loss = loss * weights
1161
+ loss = loss.mean()
1162
+ return {'loss': loss, 'loss_z': loss_z.detach().mean(), 'loss_logdet': loss_logdet.detach().mean()}
1163
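+ # [Added note, not in the original code] get_loss above is the standard normalizing-flow negative
+ # log-likelihood (per dimension, up to the constant 0.5 * log(2 * pi)):
+ #     loss = 0.5 * E[z^2] - E[sum_k log|det dz_k/dx|]
+ # i.e. a standard-Gaussian prior term on the final latent plus the accumulated per-block
+ # log-determinants stored in `logdets`.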
+
1164
+ def forward(
1165
+ self, x: torch.Tensor, y: torch.Tensor | None = None,
1166
+ reverse=False, kv_caches=None, denoiser=False, context=False, **kwargs
1167
+ ) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
1168
+ if context:
1169
+ return self.forward_context(x, y, kv_caches=kv_caches, **kwargs)
1170
+
1171
+ if reverse: # inference mode
1172
+ return self.reverse(x, y, kv_caches=kv_caches, **kwargs)
1173
+
1174
+ if denoiser: # forward with self-denoiser
1175
+ x = self.patchify(x)
1176
+ x = self.learnable_self_denoiser(x, y, self.feat_rope, **kwargs)
1177
+ return self.unpatchify(x)
1178
+
1179
+ logdets, outputs = [], []
1180
+ guidance = kwargs.get('guidance', 0)
1181
+
1182
+ # Bottom blocks
1183
+ x = self.patchify(x)
1184
+ outputs += [x]
1185
+ for it, block in enumerate(self.blocks[:-1]):
1186
+ if self.shallow_block_local and x.dim() == 5: # video input
1187
+ x = rearrange(x, 'b t h w c -> (b t) 1 h w c')
1188
+ x, _, logdet = block(x, y.chunk(2, dim=0)[0] if self.cond_top_only and guidance > 0 else y,
1189
+ self.feat_rope, kv_cache=kv_caches[-(it+1)] if kv_caches is not None else None)
1190
+ if self.shallow_block_local and x.dim() == 5: # video input
1191
+ x = rearrange(x, '(b t) 1 h w c -> b t h w c', b=outputs[0].size(0), t=outputs[0].size(1))
1192
+ logdet = rearrange(logdet, '(b t) l c -> b t l c', b=outputs[0].size(0), t=outputs[0].size(1))
1193
+ logdets += [logdet]
1194
+ outputs += x if isinstance(x, list) else [x]
1195
+
1196
+ # Top block
1197
+ x, y, logdet = self.blocks[-1](x, y, self.feat_rope_gen,
1198
+ kv_cache=kv_caches[0] if kv_caches is not None else None,
1199
+ guidance=guidance)
1200
+ outputs += [x]
1201
+ x = self.unpatchify(x)
1202
+ logdets += [logdet]
1203
+ return x, y, outputs, logdets
1204
+
1205
+ def forward_context(self, x: torch.Tensor, y: torch.Tensor | None = None, kv_caches: List[KVCache] | None = None, **kwargs):
1206
+ if kv_caches is None:
1207
+ kv_caches = [KVCache() for _ in range(len(self.blocks))]
1208
+ use_cfg = (x.size(0) * 2 == y.size(0)) if (y is not None and self.cond_top_only) else False
1209
+ if use_cfg:
1210
+ x = torch.cat([x, x], 0) # duplicate for classifier-free guidance generation
1211
+
1212
+ self.forward(x, y, kv_caches=kv_caches, **kwargs) # run once to fill the cache
1213
+
1214
+ if use_cfg:
1215
+ for kv in kv_caches[1:]:
1216
+ kv.remove_negative_cache() # remove negative cache except for the first block
1217
+ kv.prefix_cache = kv.prefix_cache.chunk(2, dim=0)[0] if kv.prefix_cache is not None else None
1218
+ return kv_caches
1219
+
1220
+ def reverse_deep(self,
1221
+ x: List[torch.Tensor] | torch.Tensor,
1222
+ y: torch.Tensor | None = None,
1223
+ guidance: float = 0,
1224
+ verbose: bool = False,
1225
+ kv_caches: List[KVCache] | None = None,
1226
+ jacobi: bool = False,
1227
+ need_caches: bool = False,
1228
+ seq: List[torch.Tensor] = [],
1229
+ **sampling_kwargs,):
1230
+ x = self.patchify(x)
1231
+ x = (self.blocks[-1].jacobi if jacobi else self.blocks[-1].reverse)(
1232
+ x, y, guidance, rope=self.feat_rope_gen, kv_cache=kv_caches[0], verbose=verbose, **sampling_kwargs)
1233
+ x = self.unpatchify(x)
1234
+ if not need_caches:
1235
+ kv_caches[0].delete()
1236
+ seq.append(x)
1237
+ return x
1238
+
1239
+ def reverse_shallow(self,
1240
+ x: List[torch.Tensor] | torch.Tensor,
1241
+ y: torch.Tensor | None = None,
1242
+ guidance: float = 0,
1243
+ verbose: bool = False,
1244
+ kv_caches: List[KVCache] | None = None,
1245
+ jacobi: bool = False,
1246
+ need_caches: bool = False,
1247
+ seq: List[torch.Tensor] = [],
1248
+ **sampling_kwargs,):
1249
+ x = self.patchify(x)
1250
+ for it, block in enumerate(reversed(self.blocks[:-1])):
1251
+ if self.shallow_block_local and x.dim() == 5: # video input
1252
+ x = rearrange(x, 'b t h w c -> (b t) 1 h w c')
1253
+ kv_caches[it+1]._is_empty = True
1254
+ kv_caches[it+1].prefix_cache = None
1255
+ x = (block.jacobi if jacobi else block.reverse)(
1256
+ x, y, guidance, rope=self.feat_rope, kv_cache=kv_caches[it+1], verbose=verbose, **sampling_kwargs)
1257
+ if self.shallow_block_local and x.dim() == 5: # video input
1258
+ x = rearrange(x, '(b t) 1 h w c -> b t h w c', b=seq[0].size(0), t=seq[0].size(1))
1259
+ seq.append(self.unpatchify(x))
1260
+ if not need_caches:
1261
+ kv_caches[it+1].delete()
1262
+ x = self.unpatchify(x)
1263
+ return x
1264
+
1265
+ def reverse(
1266
+ self,
1267
+ x: List[torch.Tensor] | torch.Tensor,
1268
+ y: torch.Tensor | None = None,
1269
+ guidance: float = 0,
1270
+ guide_top: int | None = None,
1271
+ return_sequence: bool = False,
1272
+ verbose: bool = False,
1273
+ kv_caches: List[KVCache] | None = None,
1274
+ jacobi: bool = False,
1275
+ **sampling_kwargs,
1276
+ ) -> torch.Tensor | list[torch.Tensor]:
1277
+ seq, need_caches, kv_caches = [x], (kv_caches is not None), kv_caches or [KVCache() for _ in range(len(self.blocks))]
1278
+
1279
+ # run the deep block first
1280
+ x = self.reverse_deep(x, y, guidance, verbose, kv_caches, jacobi, need_caches, seq, **sampling_kwargs)
1281
+
1282
+ # remove guidance if bottom is unconditional
1283
+ if (guide_top is not None or self.cond_top_only) and guidance > 0:
1284
+ guidance, y = 0, y.chunk(2, dim=0)[0]
1285
+
1286
+ # run the shallow blocks
1287
+ x = self.reverse_shallow(x, y, guidance, verbose, kv_caches, jacobi, need_caches, seq, **sampling_kwargs)
1288
+ return seq if return_sequence else x
1289
+
1290
+
1291
+ #################################################################################
1292
+ # TARFLow Configs #
1293
+ #################################################################################
1294
+
1295
+ def TarFlow_XL_1(**kwargs):
1296
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,10,10],
1297
+ channels=2048, patch_size=1, head_dim=64, rope=1, **kwargs)
1298
+
1299
+ def TarFlow_XL_2(**kwargs):
1300
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,10,10],
1301
+ channels=2048, patch_size=2, head_dim=64, rope=1, **kwargs)
1302
+
1303
+ def TarFlow_XXL_1(**kwargs):
1304
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,13,13],
1305
+ channels=3072, patch_size=1, head_dim=64, rope=1, **kwargs)
1306
+
1307
+ def TarFlow_XLv2_1(**kwargs): # 1.4B
1308
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,18],
1309
+ channels=2048, patch_size=1, head_dim=64, rope=1, **kwargs)
1310
+
1311
+ def TarFlow_XXLv2_1(**kwargs): # 4B
1312
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,24],
1313
+ channels=3072, patch_size=1, head_dim=64, rope=1, **kwargs)
1314
+
1315
+ def TarFlow_Gemma2B(**kwargs): # 2B
1316
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,26],
1317
+ channels=2304, patch_size=1, rope=1,
1318
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1319
+ use_swiglu=True, use_qk_norm=False, use_post_norm=True,
1320
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1321
+ num_heads=8, num_kv_heads=4, head_dim=256, **kwargs)
1322
+
1323
+
1324
+ # Pre-trained model configs
1325
+ pre_model_configs = {
1326
+ "TarFlow_XL_1": TarFlow_XL_1,
1327
+ "TarFlow_XLv2_1": TarFlow_XLv2_1,
1328
+ "TarFlow_XL_2": TarFlow_XL_2,
1329
+ "TarFlow_XXL_1": TarFlow_XXL_1,
1330
+ "TarFlow_XXLv2_1": TarFlow_XXLv2_1,
1331
+ }
1332
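+ # [Added note, not in the original code] The factory functions above only pin down the
+ # architecture; data-dependent arguments still come from the training config. A hypothetical
+ # call (all values below are illustrative assumptions, not defaults from this repo):
+ #     model = pre_model_configs["TarFlow_XLv2_1"](
+ #         in_channels=16, img_size=32, txt_size=128, txt_dim=2304, sos=True)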
+
1333
+
1334
+ #################################################################################
1335
+ # Pretrained LLMs #
1336
+ #################################################################################
1337
+ gemma3_4b_kwargs = dict(
1338
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1339
+ use_swiglu=True, use_qk_norm=True, use_post_norm=True,
1340
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1341
+ num_heads=8, num_kv_heads=4, head_dim=256, channels=2560,
1342
+ num_layers=34, use_proj_txt=False)
1343
+
1344
+ gemma3_1b_kwargs = dict(
1345
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1346
+ use_swiglu=True, use_qk_norm=True, use_post_norm=True,
1347
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1348
+ num_heads=4, num_kv_heads=1, head_dim=256, channels=1152, expansion=6,
1349
+ num_layers=26, use_proj_txt=False)
1350
+
1351
+ gemma2_2b_kwargs = dict(
1352
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1353
+ use_swiglu=True, use_qk_norm=False, use_post_norm=True,
1354
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1355
+ num_heads=8, num_kv_heads=4, head_dim=256, channels=2304,
1356
+ num_layers=26, use_proj_txt=False)
utils/__init__.py ADDED
@@ -0,0 +1,96 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ STARFlow utilities package.
7
+
8
+ This package contains various utilities for STARFlow training and inference,
9
+ organized by functionality for better maintainability.
10
+ """
11
+
12
+ # Import everything from the original utils.py for backward compatibility
13
+ import warnings
14
+ warnings.filterwarnings('ignore', category=FutureWarning)
15
+
16
+ # Re-export everything from the original utils.py to maintain compatibility
17
+ import sys
18
+ import pathlib
19
+
20
+ # Add the parent directory to path to import the original utils
21
+ parent_dir = pathlib.Path(__file__).parent.parent
22
+ sys.path.insert(0, str(parent_dir))
23
+
24
+ # Import from new modular structure
25
+ from .common import (
26
+ load_model_config, preprocess_text, encode_text, drop_label, add_noise,
27
+ get_data, save_samples_unified, read_tsv, set_random_seed
28
+ )
29
+ from .model_setup import (
30
+ setup_transformer, setup_vae, VAE, setup_encoder,
31
+ LookupTableTokenizer, TextEmbedder, LabelEmbdder
32
+ )
33
+ from .training import (
34
+ CosineLRSchedule, Distributed, get_local_rank, parallelize_model,
35
+ save_model, save_optimizer, sync_ctx
36
+ )
37
+ from .inference import (
38
+ FID, IS, CLIP, Metrics,
39
+ self_denoise, apply_denoising, process_denoising, simple_denoising
40
+ )
41
+
42
+ # Define what gets exported when someone does "from utils import *"
43
+ __all__ = [
44
+ # Configuration
45
+ 'load_model_config',
46
+
47
+ # Text processing
48
+ 'preprocess_text',
49
+ 'encode_text',
50
+ 'drop_label',
51
+
52
+ # Noise
53
+ 'add_noise',
54
+
55
+ # Denoising
56
+ 'self_denoise',
57
+ 'apply_denoising',
58
+ 'process_denoising',
59
+ 'simple_denoising',
60
+
61
+ # Saving
62
+ 'save_samples_unified',
63
+
64
+ # Training
65
+ 'CosineLRSchedule',
66
+ 'Distributed',
67
+ 'set_random_seed',
68
+
69
+ # Metrics
70
+ 'FID',
71
+ 'IS',
72
+ 'CLIP',
73
+ 'Metrics',
74
+
75
+ # Models
76
+ 'setup_transformer',
77
+ 'setup_vae',
78
+ 'VAE',
79
+
80
+ # Encoders
81
+ 'setup_encoder',
82
+ 'LookupTableTokenizer',
83
+ 'TextEmbedder',
84
+ 'LabelEmbdder',
85
+ 'read_tsv',
86
+
87
+ # Distributed
88
+ 'parallelize_model',
89
+ 'save_model',
90
+ 'save_optimizer',
91
+ 'get_local_rank',
92
+ 'sync_ctx',
93
+
94
+ # Data
95
+ 'get_data',
96
+ ]
utils/common.py ADDED
@@ -0,0 +1,346 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Core utility functions for STARFlow.
7
+
8
+ This module contains essential functions for model configuration, text processing,
9
+ noise injection, and data handling. All functions are self-contained.
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import pathlib
16
+ import argparse
17
+ import yaml
18
+ import random
19
+ import numpy as np
20
+ import csv
21
+ from typing import List, Optional, Union, Dict, Any
22
+ from einops import rearrange
23
+ from misc import dividable
24
+
25
+ import torchvision as tv
26
+ import wandb
27
+
28
+
29
+ # ==== Configuration Functions ====
30
+
31
+ def load_model_config(config_path: str) -> argparse.Namespace:
32
+ """Load model configuration from YAML file and merge with trainer arguments."""
33
+ from train import get_tarflow_parser # Import here to avoid circular imports
34
+
35
+ with open(config_path, 'r') as f:
36
+ model_configs = yaml.safe_load(f)
37
+
38
+ trainer_parser = get_tarflow_parser()
39
+ trainer_args = ""
40
+ for conf in model_configs['arguments']:
41
+ for key in conf:
42
+ trainer_args += f"--{key} {conf[key]} "
43
+
44
+ return trainer_parser.parse_args(trainer_args.split())
45
+
46
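+ # [Added note, not in the original code] load_model_config above expects the YAML to contain an
+ # `arguments` list of single-key mappings; each entry is rendered as "--key value" and fed to the
+ # trainer's argparse parser. A hypothetical config (flag names are illustrative only):
+ #     arguments:
+ #       - model: TarFlow_XLv2_1
+ #       - img_size: 256
+ # would be parsed as if the trainer were launched with "--model TarFlow_XLv2_1 --img_size 256".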
+
47
+ # ==== Text Processing Functions ====
48
+
49
+ def preprocess_text(text, use_template=False, aspect_ratio=None, fps=None, noise_std=None):
50
+ """Preprocess text with templates, aspect ratios, fps, and noise levels."""
51
+ modes = ['an image'] * len(text)
52
+ if fps is not None:
53
+ if isinstance(fps, torch.Tensor):
54
+ fps = [int(f) for f in fps.tolist()]
55
+ elif isinstance(fps, int):
56
+ fps = [fps] * len(text)
57
+ modes = ['a video' if f > 0 else 'an image' for f in fps]
58
+ text = [f"A video with {f} fps:\n{txt}\n" if f > 0 else f"An image:\n{txt}\n"
59
+ for txt, f in zip(text, fps)]
60
+
61
+ if noise_std is not None:
62
+ if isinstance(noise_std, torch.Tensor):
63
+ noise_std = [int(n * 1000) for n in noise_std.view(-1).tolist()]
64
+ elif isinstance(noise_std, float):
65
+ noise_std = [int(noise_std * 1000)] * len(text)
66
+ text = [f'Noise Level {n}:\n{txt}' for n, txt in zip(noise_std, text)]
67
+
68
+ if aspect_ratio is not None:
69
+ text = [f"{txt}\n in a {aspect_ratio} aspect ratio.\n" for txt in text]
70
+
71
+ if use_template:
72
+ TEMPLATE = "<start_of_turn>user\nPlease generate {mode} about: {prompt}<end_of_turn>\n"
73
+ TEMPLATE = TEMPLATE + "<start_of_turn>model\n"
74
+ text = [TEMPLATE.format(prompt=txt, mode=mode) for txt, mode in zip(text, modes)]
75
+ return text
76
+
77
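+ # [Added sketch, not part of the original utilities] A minimal example of what preprocess_text
+ # above produces; the prompt, fps and aspect ratio below are illustrative assumptions.
+ def _preprocess_text_example():
+     text = preprocess_text(["a cat playing piano"], fps=[8], aspect_ratio="16:9")
+     # -> ['A video with 8 fps:\na cat playing piano\n\n in a 16:9 aspect ratio.\n']
+     return text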
+
78
+ # Define helper classes that will be needed
79
+ class LookupTableTokenizer:
80
+ def __init__(self, vocab_file):
81
+ self.vocab = {l[0]: i for i, l in enumerate(read_tsv(f'configs/dataset/{vocab_file}'))}
82
+ self.empty_id = len(self.vocab)
83
+
84
+ def __len__(self):
85
+ return len(self.vocab)
86
+
87
+ def __call__(self, text):
88
+ return {'input_ids': torch.tensor([[self.vocab.get(t, self.empty_id)] for t in text], dtype=torch.long)}
89
+
90
+
91
+ class TextEmbedder(nn.Module):
92
+ def __init__(self, config):
93
+ super().__init__()
94
+ if hasattr(config, "text_config"): # Gemma3
95
+ self.config = config.text_config
96
+ self.vocab_size = config.image_token_index
97
+ else:
98
+ self.config = config
99
+ self.vocab_size = config.vocab_size
100
+ self.text_token_embedder = nn.Embedding(
101
+ self.vocab_size, self.config.hidden_size)
102
+ self.text_token_embedder.weight.requires_grad = False
103
+ self.normalizer = float(self.config.hidden_size) ** 0.5
104
+
105
+
106
+ class LabelEmbdder(nn.Module):
107
+ def __init__(self, num_classes):
108
+ super().__init__()
109
+ self.num_classes = num_classes
110
+ self.config = type('Config', (), {'hidden_size': num_classes + 1})()
111
+ self.Embedding = nn.Parameter(torch.eye(num_classes+1), requires_grad=False)
112
+
113
+ def forward(self, y):
114
+ return F.embedding(y, self.Embedding)
115
+
116
+
117
+ @torch.no_grad()
118
+ def encode_text(text_encoder, tokenizer, text, max_length, device, return_tokens=False, **kwargs):
119
+ """Encode text using the text encoder with preprocessing."""
120
+ text = preprocess_text(text, use_template=isinstance(text_encoder, TextEmbedder), **kwargs)
121
+ if isinstance(tokenizer, LookupTableTokenizer):
122
+ assert max_length == 1, "label embedding only supports max_length=1"
123
+ tokenized_outputs = tokenizer(text)
124
+ else:
125
+ tokenized_outputs = tokenizer(
126
+ text, padding="max_length", truncation=True, return_tensors="pt", max_length=max_length)
127
+ tokenized_outputs = {key: val.to(device) for key, val in tokenized_outputs.items()}
128
+ if isinstance(text_encoder, TextEmbedder) or isinstance(text_encoder, LabelEmbdder):
129
+ y = text_encoder(tokenized_outputs['input_ids'])
130
+ else:
131
+ y = text_encoder(**tokenized_outputs).last_hidden_state
132
+ y = y * tokenized_outputs['attention_mask'].unsqueeze(-1) # mask out padding
133
+ if return_tokens:
134
+ return y, tokenized_outputs
135
+ return y
136
+
137
+
138
+ # ==== Noise Functions ====
139
+
140
+ @torch.no_grad()
141
+ def add_noise(x, noise_std=0.3, noise_type='gaussian', cond_noise_level=False):
142
+ """Add noise to input tensor."""
143
+ if isinstance(x, list):
144
+ return zip(*[add_noise(xi, noise_std, noise_type) for xi in x])
145
+
146
+ # inject noise over images
147
+ if noise_type == 'gaussian':
148
+ noise = noise_std * torch.randn_like(x)
149
+ x = x + noise
150
+ elif noise_type == 'uniform':
151
+ # Uniform dequantization following standard normalizing flow practice
152
+ noise = torch.rand_like(x)
153
+ x = ((x + 1) * (255 / 2) + noise) / 256 * 2 - 1
154
+ else:
155
+ raise NotImplementedError
156
+ return x, noise
157
+
158
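+ # [Added note, not in the original code] Typical uses of add_noise above:
+ #     x_noisy, noise = add_noise(x, noise_std=0.3, noise_type='gaussian')  # additive Gaussian noise
+ #     x_deq, u = add_noise(x, noise_type='uniform')  # uniform dequantization of 8-bit data in [-1, 1]
+ # Both return the perturbed tensor together with the noise that was drawn.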
+
159
+ def drop_label(y, drop_prob=0.1):
160
+ """Randomly drop labels for classifier-free guidance training."""
161
+ return ["" if random.random() < drop_prob else yi for yi in y]
162
+
163
+
164
+ def save_samples_unified(samples: torch.Tensor,
165
+ save_dir: pathlib.Path,
166
+ filename_prefix: str = "samples",
167
+ epoch_or_iter: Optional[int] = None,
168
+ fps: int = 8,
169
+ dist=None,
170
+ wandb_log: bool = False,
171
+ wandb_step: Optional[int] = None,
172
+ grid_arrangement: str = "auto") -> None:
173
+ """
174
+ Unified function to save samples as images or videos.
175
+
176
+ Samples are expected to be roughly in [-1, 1]; they are clamped to that range and rescaled to [0, 1] before saving.
177
+
178
+ Args:
179
+ samples: Tensor with samples to save (can be [0,1] or [-1,1] range)
180
+ save_dir: Directory to save files
181
+ filename_prefix: Prefix for filename (e.g., "train_samples", "inference")
182
+ epoch_or_iter: Epoch or iteration number for filename
183
+ fps: FPS for video files
184
+ dist: Distributed training context (if available)
185
+ wandb_log: Whether to log to wandb
186
+ wandb_step: Step for wandb logging
187
+ grid_arrangement: How to arrange samples ("auto", "grid", "individual")
188
+ """
189
+ # Handle distributed gathering
190
+ if dist is not None:
191
+ samples = dist.gather_concat(samples.contiguous().detach())
192
+ should_save = dist.local_rank == 0
193
+ wandb_should_log = wandb_log and dist.rank == 0
194
+ else:
195
+ should_save = True
196
+ wandb_should_log = wandb_log
197
+
198
+ if not should_save:
199
+ return
200
+
201
+ # Create save directory
202
+ save_dir.mkdir(parents=True, exist_ok=True)
203
+ samples = samples.detach().cpu()
204
+ if samples.dim() == 5 and samples.size(1) == 1:
205
+ # If single-frame video, squeeze time dimension
206
+ samples = samples[:, 0]
207
+ normalized_samples = (samples.clamp(-1, 1) + 1) * 0.5
208
+
209
+ # Generate filename
210
+ if samples.dim() == 5:
211
+ filename = f"{filename_prefix}_{samples.size(1)}x{samples.size(3)}x{samples.size(4)}"
212
+ else:
213
+ filename = f"{filename_prefix}_{samples.size(2)}x{samples.size(3)}"
214
+ if epoch_or_iter is not None:
215
+ filename += f"_{epoch_or_iter:03d}" # suffix shared by image and video outputs
216
+ if samples.dim() == 5: # Video
217
+ filename += ".mp4"
218
+ else: # Image
219
+ filename += ".png"
220
+ file_path = save_dir / filename
221
+
222
+ if samples.dim() == 5: # Video: (B, T, C, H, W)
223
+ if grid_arrangement == "individual":
224
+ # Save individual videos
225
+ for idx in range(samples.size(0)):
226
+ video_data = (normalized_samples[idx] * 255).to(torch.uint8)
227
+ # torchvision.io.write_video expects (T, H, W, C)
228
+ # video_data shape is (T, C, H, W), so permute to (T, H, W, C)
229
+ video_data = video_data.permute(0, 2, 3, 1)
230
+ individual_path = save_dir / f"{filename_prefix}_video_{idx:03d}.mp4"
231
+ tv.io.write_video(str(individual_path), video_data, fps=fps)
232
+ else:
233
+ # Create video grid
234
+ grid_a = dividable(samples.size(0))
235
+ samples_grid = rearrange(
236
+ normalized_samples, '(a b) t c h w -> t (a h) (b w) c',
237
+ a=grid_a
238
+ )
239
+
240
+ tv.io.write_video(
241
+ str(file_path), (samples_grid * 255).to(torch.uint8),
242
+ fps=fps, video_codec='libx264', options={'crf': '10', 'preset': 'slow'}
243
+ )
244
+
245
+ # Wandb logging for video
246
+ if wandb_should_log:
247
+ wandb.log({f"{filename_prefix}_video": wandb.Video(str(file_path))}, step=wandb_step)
248
+
249
+ else: # Image: (B, C, H, W)
250
+ if grid_arrangement == "individual":
251
+ # Save individual images
252
+ for idx in range(samples.size(0)):
253
+ image_path = save_dir / f"{filename_prefix}_{idx:03d}.jpg"
254
+ tv.utils.save_image(
255
+ normalized_samples[idx:idx+1],
256
+ str(image_path), normalize=False
257
+ )
258
+ else:
259
+ # Save as grid
260
+ tv.utils.save_image(
261
+ normalized_samples,
262
+ str(file_path), normalize=False, nrow=dividable(samples.size(0))
263
+ )
264
+
265
+ # Wandb logging for image
266
+ if wandb_should_log:
267
+ wandb.log({f"{filename_prefix}": wandb.Image(str(file_path))}, step=wandb_step)
268
+
269
+ print(f'Saved samples to {file_path}')
270
+
271
+
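A hedged usage sketch for save_samples_unified, assuming this module is importable as utils.common (the layout implied by the relative imports in utils/inference.py and utils/model_setup.py): a 4-D batch in [-1, 1] is written as images, a 5-D (B, T, C, H, W) batch as video.

import pathlib
import torch
from utils.common import save_samples_unified   # assumed module path

fake = torch.rand(4, 3, 64, 64) * 2 - 1          # 4 images in [-1, 1]
save_samples_unified(fake, pathlib.Path("outputs/demo"),
                     filename_prefix="demo", epoch_or_iter=0,
                     grid_arrangement="individual")   # one JPG per sample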
272
+ # ==== Data and Utility Functions ====
273
+
274
+ def get_data(args, dist):
275
+ """
276
+ Get data loader using dummy dataset for open source release.
277
+
278
+ Args:
279
+ args: Training arguments
280
+ dist: Distributed training context
281
+
282
+ Returns:
283
+ Data loader with dummy synthetic data
284
+ """
285
+ try:
286
+ from dataset import create_dummy_dataloader
287
+ except ImportError:
288
+ raise ImportError("dataset.py not found or missing create_dummy_dataloader function")
289
+
290
+ local_batch_size = args.batch_size // dist.world_size // getattr(args, "acc", 1)
291
+
292
+ # Determine multiple based on VAE type
293
+ if "Wan2.2" in args.vae:
294
+ multiple = 16
295
+ else:
296
+ multiple = 8
297
+
298
+ # Calculate number of samples per rank
299
+ total_samples = getattr(args, 'epoch_length', 50000) # Default to 50k samples
300
+ samples_per_rank = total_samples // dist.world_size if dist.world_size > 0 else total_samples
301
+
302
+ # Create primary dataloader
303
+ data_loader = create_dummy_dataloader(
304
+ dataset_name=args.dataset,
305
+ img_size=args.img_size,
306
+ vid_size=getattr(args, 'vid_size', None),
307
+ batch_size=local_batch_size,
308
+ use_mixed_aspect=getattr(args, 'mix_aspect', False),
309
+ multiple=multiple * args.patch_size,
310
+ num_samples=samples_per_rank,
311
+ infinite=False
312
+ )
313
+
314
+ # Create secondary dataloader if specified
315
+ if getattr(args, 'secondary_dataset', None) is not None:
316
+ secondary_samples = getattr(args, 'secondary_epoch_length', total_samples // 4)
317
+ secondary_samples_per_rank = secondary_samples // dist.world_size if dist.world_size > 0 else secondary_samples
318
+
319
+ data_loader.secondary_loader = create_dummy_dataloader(
320
+ dataset_name=args.secondary_dataset,
321
+ img_size=getattr(args, 'secondary_img_size', args.img_size),
322
+ vid_size=getattr(args, 'secondary_vid_size', None),
323
+ batch_size=getattr(args, 'secondary_batch_size', local_batch_size),
324
+ use_mixed_aspect=getattr(args, 'mix_aspect', False),
325
+ multiple=multiple * args.patch_size,
326
+ num_samples=secondary_samples_per_rank,
327
+ infinite=True # Secondary loader is typically infinite
328
+ )
329
+
330
+ return data_loader
331
+
332
+
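A worked example of the per-rank batch-size arithmetic in get_data above: with a global batch of 256, 8 ranks, and 4 gradient-accumulation steps (args.acc), each rank processes 8 samples per forward pass. The values below are illustrative.

batch_size, world_size, acc = 256, 8, 4
local_batch_size = batch_size // world_size // acc
assert local_batch_size == 8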
333
+ def read_tsv(filename: str):
334
+ """Simple TSV reader for compatibility."""
335
+ with open(filename, 'r', newline='') as tsvfile:
336
+ reader = csv.reader(tsvfile, delimiter='\t')
337
+ return [row for row in reader]
338
+
339
+
340
+ def set_random_seed(seed: int) -> None:
341
+ """Set random seed for reproducibility."""
342
+ random.seed(seed)
343
+ np.random.seed(seed)
344
+ torch.manual_seed(seed)
345
+ torch.cuda.manual_seed(seed)
346
+ torch.cuda.manual_seed_all(seed)
utils/inference.py ADDED
@@ -0,0 +1,277 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Inference utilities for STARFlow.
7
+ """
8
+
9
+ import torch
10
+ import datetime
11
+ from typing import List
12
+ from torchmetrics.image.fid import FrechetInceptionDistance, _compute_fid
13
+ from torchmetrics.image.inception import InceptionScore
14
+ from torchmetrics.multimodal.clip_score import CLIPScore
15
+ from torchmetrics.utilities.data import dim_zero_cat
16
+
17
+ # Import Distributed from training module
18
+ from .training import Distributed
19
+
20
+
21
+ # ==== Metrics ====
22
+
23
+ class FID(FrechetInceptionDistance):
24
+ def __init__(self, feature=2048, reset_real_features=True, normalize=False, input_img_size=..., **kwargs):
25
+ super().__init__(feature, reset_real_features, normalize, input_img_size, **kwargs)
26
+ self.reset_real_features = reset_real_features
27
+
28
+ def add_state(self, name, default, *args, **kwargs):
29
+ self.register_buffer(name, default)
30
+
31
+ def manual_compute(self, dist):
32
+ # manually gather the features
33
+ self.fake_features_num_samples = dist.reduce(self.fake_features_num_samples)
34
+ self.fake_features_sum = dist.reduce(self.fake_features_sum)
35
+ self.fake_features_cov_sum = dist.reduce(self.fake_features_cov_sum)
36
+
37
+ if self.reset_real_features:
38
+ self.real_features_num_samples = dist.reduce(self.real_features_num_samples)
39
+ self.real_features_sum = dist.reduce(self.real_features_sum)
40
+ self.real_features_cov_sum = dist.reduce(self.real_features_cov_sum)
41
+
42
+ print(f'Gathered {self.fake_features_num_samples} samples for FID computation')
43
+
44
+ # compute FID
45
+ mean_real = (self.real_features_sum / self.real_features_num_samples).unsqueeze(0)
46
+ mean_fake = (self.fake_features_sum / self.fake_features_num_samples).unsqueeze(0)
47
+ cov_real_num = self.real_features_cov_sum - self.real_features_num_samples * mean_real.t().mm(mean_real)
48
+ cov_real = cov_real_num / (self.real_features_num_samples - 1)
49
+ cov_fake_num = self.fake_features_cov_sum - self.fake_features_num_samples * mean_fake.t().mm(mean_fake)
50
+ cov_fake = cov_fake_num / (self.fake_features_num_samples - 1)
51
+
52
+ if dist.rank == 0:
53
+ fid_score = _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake).to(
54
+ dtype=self.orig_dtype, device=self.real_features_sum.device)
55
+ print(f'FID: {fid_score.item()} DONE')
56
+ else:
57
+ fid_score = torch.tensor(0.0, dtype=self.orig_dtype, device=self.real_features_sum.device)
58
+ dist.barrier()
59
+
60
+ # reset the state
61
+ self.fake_features_num_samples *= 0
62
+ self.fake_features_sum *= 0
63
+ self.fake_features_cov_sum *= 0
64
+
65
+ if self.reset_real_features:
66
+ self.real_features_num_samples *= 0
67
+ self.real_features_sum *= 0
68
+ self.real_features_cov_sum *= 0
69
+
70
+ return fid_score
71
+
72
+
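A standalone sketch (not from the committed file) of why reducing the running sums above is enough: the feature sum and the sum of outer products are additive across ranks, so the global mean and unbiased covariance can be rebuilt after a single all-reduce.

import torch

x = torch.randn(100, 4)                 # all features, pretend they are split over 4 ranks
chunks = x.chunk(4)
n = sum(c.shape[0] for c in chunks)     # reduced sample count
s = sum(c.sum(0) for c in chunks)       # reduced feature sum
ss = sum(c.t().mm(c) for c in chunks)   # reduced sum of outer products
mean = (s / n).unsqueeze(0)
cov = (ss - n * mean.t().mm(mean)) / (n - 1)
assert torch.allclose(cov, torch.cov(x.t()), atol=1e-4)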
73
+ class IS(InceptionScore):
74
+ def __init__(self, **kwargs):
75
+ super().__init__(**kwargs)
76
+
77
+ def manual_compute(self, dist):
78
+ # manually gather the features
79
+ self.features = dim_zero_cat(self.features)
80
+ features = dist.gather_concat(self.features)
81
+ print(f'Gathered {features.shape[0]} samples for IS computation')
82
+
83
+ if dist.rank == 0:
84
+ idx = torch.randperm(features.shape[0])
85
+ features = features[idx]
86
+
87
+ # calculate probs and logits
88
+ prob = features.softmax(dim=1)
89
+ log_prob = features.log_softmax(dim=1)
90
+
91
+ # split into groups
92
+ prob = prob.chunk(self.splits, dim=0)
93
+ log_prob = log_prob.chunk(self.splits, dim=0)
94
+
95
+ # calculate score per split
96
+ mean_prob = [p.mean(dim=0, keepdim=True) for p in prob]
97
+ kl_ = [p * (log_p - m_p.log()) for p, log_p, m_p in zip(prob, log_prob, mean_prob)]
98
+ kl_ = [k.sum(dim=1).mean().exp() for k in kl_]
99
+ kl = torch.stack(kl_)
100
+
101
+ mean = kl.mean()
102
+ std = kl.std()
103
+
104
+ else:
105
+ mean = torch.tensor(0.0, device=self.features.device)
106
+ std = torch.tensor(0.0, device=self.features.device)
107
+
108
+ dist.barrier()
109
+
110
+ return mean, std
111
+
112
+
113
+ class CLIP(CLIPScore):
114
+ def __init__(self, **kwargs):
115
+ super().__init__(**kwargs)
116
+
117
+ def manual_compute(self, dist):
118
+ # manually gather the features
119
+ self.n_samples = dist.reduce(self.n_samples)
120
+ self.score = dist.reduce(self.score)
121
+
122
+ print(f'Gathered {self.n_samples} samples for CLIP computation')
123
+
124
+ # compute CLIP
125
+ clip_score = torch.max(self.score / self.n_samples, torch.zeros_like(self.score))
126
+ print(f'CLIP: {clip_score.item()} DONE')
127
+ # reset the state
128
+ self.n_samples *= 0
129
+ self.score *= 0
130
+ return clip_score
131
+
132
+
133
+ class Metrics:
134
+ def __init__(self):
135
+ self.metrics: dict[str, list[float]] = {}
136
+
137
+ def update(self, metrics: dict[str, torch.Tensor | float]):
138
+ for k, v in metrics.items():
139
+ if isinstance(v, torch.Tensor):
140
+ v = v.item()
141
+ if k in self.metrics:
142
+ self.metrics[k].append(v)
143
+ else:
144
+ self.metrics[k] = [v]
145
+
146
+ def compute(self, dist: Distributed | None) -> dict[str, float]:
147
+ out: dict[str, float] = {}
148
+ for k, v in self.metrics.items():
149
+ v = sum(v) / len(v)
150
+ if dist is not None:
151
+ v = dist.gather_concat(torch.tensor(v, device='cuda').view(1)).mean().item()
152
+ out[k] = v
153
+ return out
154
+
155
+ @staticmethod
156
+ def print(metrics: dict[str, float], epoch: int):
157
+ print(f'Epoch {epoch} Time {datetime.datetime.now()}')
158
+ print('\n'.join((f'\t{k:40s}: {v: .4g}' for k, v in sorted(metrics.items()))))
159
+
160
+
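A minimal usage sketch for the Metrics accumulator above, assuming utils.inference imports cleanly in the target environment (it pulls in torchmetrics and the FSDP2 APIs via .training); compute(dist=None) takes the single-process path with no all-gather.

from utils.inference import Metrics   # assumed module path

m = Metrics()
m.update({'loss': 0.9, 'bpd': 3.2})
m.update({'loss': 0.7, 'bpd': 3.0})
averaged = m.compute(None)             # averages: loss -> 0.8, bpd -> 3.1
Metrics.print(averaged, epoch=0)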
161
+ # ==== Denoising Functions (from starflow_utils.py) ====
162
+
163
+ def apply_denoising(model, x_chunk: torch.Tensor, y_batch,
164
+ text_encoder, tokenizer, args,
165
+ text_encoder_kwargs: dict, sigma_curr: float, sigma_next: float = 0) -> torch.Tensor:
166
+ """Apply denoising to a chunk of data."""
167
+ from .common import encode_text # Import here to avoid circular imports
168
+
169
+ noise_std_const = 0.3 # a constant used for noise levels.
170
+
171
+ # Handle both encoded tensors and raw captions
172
+ if isinstance(y_batch, torch.Tensor):
173
+ y_ = y_batch
174
+ elif y_batch is not None:
175
+ y_ = encode_text(text_encoder, tokenizer, y_batch, args.txt_size,
176
+ text_encoder.device, **text_encoder_kwargs)
177
+ else:
178
+ y_ = None
179
+
180
+ if getattr(args, 'disable_learnable_denoiser', False) or not hasattr(model, 'learnable_self_denoiser'):
181
+ return self_denoise(
182
+ model, x_chunk, y_,
183
+ noise_std=sigma_curr,
184
+ steps=1,
185
+ disable_learnable_denoiser=getattr(args, 'disable_learnable_denoiser', False)
186
+ )
187
+ else:
188
+ # Learnable denoiser
189
+ if sigma_curr is not None and isinstance(y_batch, (list, type(None))):
190
+ text_encoder_kwargs['noise_std'] = sigma_curr
191
+ denoiser_output = model(x_chunk, y_, denoiser=True)
192
+ return x_chunk - denoiser_output * noise_std_const * (sigma_curr - sigma_next) / sigma_curr
193
+
194
+
195
+ def self_denoise(model, samples, y, noise_std=0.1, lr=1, steps=1, disable_learnable_denoiser=False):
196
+ """Self-denoising function - same as in train.py"""
197
+ if steps == 0:
198
+ return samples
199
+
200
+ outputs = []
201
+ x = samples.clone()
202
+ lr = noise_std ** 2 * lr
203
+ with torch.enable_grad():
204
+ x.requires_grad = True
205
+ model.train()
206
+ z, _, _, logdets = model(x, y)
207
+ loss = model.get_loss(z, logdets)['loss'] * 65536
208
+ grad = float(samples.numel()) / 65536 * torch.autograd.grad(loss, [x])[0]
209
+ outputs += [(x - grad * lr).detach()]
210
+ x = torch.cat(outputs, -1)
211
+ return x
212
+
213
+
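Unpacking the constants in self_denoise above (a reading, assuming model.get_loss returns a per-dimension average NLL): the 65536 applied to the loss is divided back out of the gradient, so it only acts as loss scaling for the backward pass, and with lr = noise_std**2 the update is

    x_denoised = x - noise_std**2 * grad_x(total NLL) = x + noise_std**2 * grad_x(log p(x)),

i.e. a single Tweedie/score-style denoising step on the noisy sample.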
214
+ def process_denoising(samples: torch.Tensor, y: List[str], args,
215
+ model, text_encoder, tokenizer, text_encoder_kwargs: dict,
216
+ noise_std: float) -> torch.Tensor:
217
+ """Process samples through denoising if enabled."""
218
+ if not (args.finetuned_vae == 'none' and
219
+ getattr(args, 'vae_adapter', None) is None and
220
+ getattr(args, 'return_sequence', 0) == 0):
221
+ # Denoising not enabled or not applicable
222
+ return samples
223
+
224
+ torch.cuda.empty_cache()
225
+ assert isinstance(samples, torch.Tensor)
226
+ samples = samples.cpu()
227
+
228
+ # Use smaller batch size for training to avoid memory issues
229
+ b = samples.size(0)
230
+ db = min(getattr(args, 'denoising_batch_size', 1), b)
231
+ denoised_samples = []
232
+ is_video = samples.dim() == 5
233
+
234
+ for j in range((b + db - 1) // db): # ceil division so a trailing partial batch is also denoised
235
+ x_all = torch.clone(samples[j * db : (j + 1) * db]).detach().cuda()
236
+ y_batch = y[j * db : (j + 1) * db] if y is not None else None
237
+
238
+ if is_video:
239
+ # Chunk-wise denoising for videos
240
+ s_idx, overlap = 0, 0
241
+ steps = x_all.size(1) if getattr(args, 'local_attn_window', None) is None else args.local_attn_window
242
+
243
+ while s_idx < x_all.size(1):
244
+ x_chunk = x_all[:, s_idx : s_idx + steps].detach().clone()
245
+ x_denoised = apply_denoising(
246
+ model, x_chunk, y_batch, text_encoder, tokenizer,
247
+ args, text_encoder_kwargs, noise_std
248
+ )
249
+ x_all[:, s_idx + overlap: s_idx + steps] = x_denoised[:, overlap:]
250
+ overlap = steps - 1 if getattr(args, 'denoiser_window', None) is None else args.denoiser_window
251
+ s_idx += steps - overlap
252
+ else:
253
+ # Process entire batch for images
254
+ x_all = apply_denoising(
255
+ model, x_all, y_batch, text_encoder, tokenizer,
256
+ args, text_encoder_kwargs, noise_std
257
+ )
258
+
259
+ torch.cuda.empty_cache()
260
+ denoised_samples.append(x_all.detach().cpu())
261
+
262
+ return torch.cat(denoised_samples, dim=0).cuda()
263
+
264
+
265
+ def simple_denoising(model, samples: torch.Tensor, y_encoded,
266
+ text_encoder, tokenizer, args, noise_std: float) -> torch.Tensor:
267
+ """Simplified denoising for training - reuses apply_denoising without chunking."""
268
+ if args.finetuned_vae != 'none' and args.finetuned_vae is not None:
269
+ return samples
270
+
271
+ # Reuse apply_denoising - it now handles both encoded tensors and raw captions
272
+ text_encoder_kwargs = {}
273
+ return apply_denoising(
274
+ model, samples, y_encoded, text_encoder, tokenizer,
275
+ args, text_encoder_kwargs, noise_std, sigma_next=0
276
+ )
277
+
utils/model_setup.py ADDED
@@ -0,0 +1,405 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Model setup utilities for STARFlow.
7
+ Includes: transformer setup, VAE setup, text encoders.
8
+ """
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import pathlib
14
+ import os
15
+ import numpy as np
16
+ from collections import OrderedDict
17
+ from typing import Optional, Tuple, Union
18
+ from einops import rearrange
19
+
20
+ from transformer_flow import pre_model_configs, Model
21
+ from diffusers.models import AutoencoderKL, AutoencoderKLWan
22
+ from diffusers import DiTPipeline
23
+ from misc.wan_vae2 import video_vae2 as AutoencoderKLWan2
24
+ from transformers import AutoTokenizer, AutoModel, AutoConfig, T5Tokenizer, T5EncoderModel
25
+
26
+
27
+ # ==== Model Setup Functions ====
28
+
29
+ def setup_transformer(args, dist, **other_kwargs):
30
+ """Setup transformer model with given arguments."""
31
+ common_kwargs = dict(
32
+ in_channels=args.channel_size,
33
+ img_size=args.img_size,
34
+ txt_size=args.txt_size,
35
+ sos=args.sos, # sos_token
36
+ cond_top_only=args.cond_top_only,
37
+ use_softplus=args.use_softplus,
38
+ use_pretrained_lm=args.use_pretrained_lm,
39
+ use_mm_attn=args.use_mm_attn,
40
+ use_final_norm=args.use_final_norm,
41
+ soft_clip=args.soft_clip,
42
+ seq_order=args.seq_order,
43
+ learnable_self_denoiser=args.learnable_self_denoiser,
44
+ conditional_denoiser=args.conditional_denoiser,
45
+ noise_embed_denoiser=args.noise_embed_denoiser,
46
+ temporal_causal=args.temporal_causal,
47
+ shallow_block_local=args.shallow_block_local,
48
+ denoiser_window=args.denoiser_window,
49
+ local_attn_window=args.local_attn_window,
50
+ top_block_channels=getattr(args, 'top_block_channels', None),
51
+ )
52
+ common_kwargs.update(other_kwargs)
53
+
54
+ if getattr(args, "model_type", None) is not None:
55
+ model = pre_model_configs[args.model_type](**common_kwargs)
56
+ else:
57
+ # generic model initialization
58
+ model = Model(
59
+ patch_size=args.patch_size,
60
+ channels=args.channels,
61
+ num_blocks=args.blocks if len(args.layers_per_block) == 1 else len(args.layers_per_block),
62
+ layers_per_block=args.layers_per_block,
63
+ rope=args.rope,
64
+ pt_seq_len=args.pt_seq_len,
65
+ head_dim=args.head_dim,
66
+ num_heads=args.num_heads,
67
+ num_kv_heads=args.num_kv_heads,
68
+ use_swiglu=args.use_swiglu,
69
+ use_bias=args.use_bias,
70
+ use_qk_norm=args.use_qk_norm,
71
+ use_post_norm=args.use_post_norm,
72
+ norm_type=args.norm_type,
73
+ **common_kwargs)
74
+
75
+ if args.use_pretrained_lm: # Note: pretrained model download removed
76
+ model_name = args.use_pretrained_lm
77
+ assert model_name in ['gemma3_4b', 'gemma2_2b', 'gemma3_1b'], f'{model_name} not supported'
78
+
79
+ # Note: Pretrained LM weights are no longer automatically downloaded
80
+ # Users should provide their own pretrained weights if needed
81
+ local_path = pathlib.Path(args.logdir) / model_name / 'gemma_meta_block.pth'
82
+ if local_path.exists():
83
+ model.blocks[-1].load_state_dict(torch.load(local_path, map_location='cpu'), strict=False)
84
+ print(f'Load top block with pretrained LLM weights from {model_name}')
85
+ else:
86
+ print(f"Warning: Pretrained LM weights for {model_name} not found at {local_path}")
87
+ print("Please provide pretrained weights manually or disable use_pretrained_lm")
88
+
89
+ return model
90
+
91
+
92
+ class VAE(nn.Module):
93
+ def __init__(self, model_name, dist, adapter=None):
94
+ super().__init__()
95
+ self.model_name = model_name
96
+ self.video_vae = False
97
+ self.dist = dist
98
+ model_name, extra = model_name.split(':') if ':' in model_name else (model_name, None)
99
+
100
+ if 'Wan-AI/Wan2.1' in model_name:
101
+ self.vae = AutoencoderKLWan.from_pretrained(model_name, subfolder="vae", torch_dtype=torch.bfloat16)
102
+ self.latents_std = self.vae.config.latents_std
103
+ self.latents_mean = self.vae.config.latents_mean
104
+ self.downsample_factor = 2 ** (len(self.vae.config.dim_mult) - 1)
105
+ self.temporal_downsample_factor = 2 ** sum(self.vae.config.temperal_downsample)
106
+ self.video_vae = True # this is a Video VAE
107
+
108
+ elif 'Wan-AI/Wan2.2' in model_name:
109
+ filename = "/tmp/Wan2.2_VAE.pth" # Use local temp path, download if not exists. WAN2.2 has no diffusers
110
+ if not os.path.exists(filename):
111
+ if dist.local_rank == 0:
112
+ print("Downloading Wan2.2 VAE weights...")
113
+ os.system(f"wget https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B/resolve/main/Wan2.2_VAE.pth -O {filename}")
114
+ dist.barrier() # Ensure only one process downloads
115
+
116
+ self.vae = AutoencoderKLWan2(pretrained_path=filename)
117
+ self.downsample_factor = 16
118
+ self.video_vae = True
119
+ self.latents_std = self.vae.std
120
+ self.latents_mean = self.vae.mean
121
+ self.temporal_downsample_factor = 4
122
+ self.temporal_scale = float(extra) if extra is not None else 1
123
+
124
+ else:
125
+ if 'sd-vae' in model_name or 'sdxl-vae' in model_name:
126
+ self.vae = AutoencoderKL.from_pretrained(model_name)
127
+ self.scaling_factor = self.vae.config.scaling_factor
128
+ else:
129
+ self.vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae", torch_dtype=torch.bfloat16)
130
+ self.scaling_factor = self.vae.config.scaling_factor
131
+ self.downsample_factor = 2 ** (len(self.vae.config.down_block_types) - 1)
132
+ self.temporal_downsample_factor = 1 # this is an Image VAE, no temporal downsample
133
+
134
+ # self.vae.load_state_dict(self.vae.state_dict(), strict=False) # what is this?
135
+ self.use_adapter = adapter is not None
136
+ if self.use_adapter: # adapter is dit #
137
+ self.dit_pipe = DiTPipeline.from_pretrained(adapter, torch_dtype=torch.bfloat16)
138
+
139
+ def to(self, device):
140
+ if self.use_adapter:
141
+ self.dit_pipe.to(device)
142
+ return super().to(device)
143
+
144
+ def _encode(self, x):
145
+ return self.vae.encode(x)
146
+
147
+ def _decode(self, z):
148
+ return self.vae.decode(z)
149
+
150
+ def encode(self, x):
151
+ if self.video_vae: # video VAE
152
+ if 'Wan-AI/Wan2.2' in self.model_name:
153
+ if x.dim() == 5:
154
+ z = rearrange(self.vae.sample(rearrange(x, 'b t c h w -> b c t h w'), self.vae.scale), 'b c t h w -> b t c h w')
155
+ if self.temporal_scale != 1:
156
+ z[:, 1:] = z[:, 1:] * self.temporal_scale # scale the temporal latent
157
+ else:
158
+ z = rearrange(self.vae.sample(rearrange(x, 'b c h w -> b c 1 h w'), self.vae.scale), 'b c 1 h w -> b c h w')
159
+ else:
160
+ if x.dim() == 5:
161
+ z = rearrange(self._encode(rearrange(x, 'b t c h w -> b c t h w')).latent_dist.sample(), 'b c t h w -> b t c h w')
162
+ else:
163
+ z = rearrange(self._encode(rearrange(x, 'b c h w -> b c 1 h w')).latent_dist.sample(), 'b c 1 h w -> b c h w')
164
+ shape = [1, 1, -1, 1, 1] if z.dim() == 5 else [1, -1, 1, 1]
165
+
166
+ scale, shift = torch.tensor(self.latents_std, device=x.device).view(*shape), torch.tensor(self.latents_mean, device=x.device).view(*shape)
167
+ z = (z - shift) / scale
168
+ else: # image VAE
169
+ if x.dim() == 5:
170
+ z = rearrange(self._encode(rearrange(x, 'b t c h w -> (b t) c h w')).latent_dist.sample(), '(b t) c h w -> b t c h w', t=x.shape[1])
171
+ else:
172
+ z = self._encode(x).latent_dist.sample()
173
+ z = z * self.scaling_factor
174
+ return z
175
+
176
+ def decode(self, z, total_steps=100, noise_std=0.3):
177
+ if self.use_adapter:
178
+ z = self.adapter_denoise(z, total_steps, noise_std)
179
+
180
+ if self.video_vae: # video VAE
181
+ if 'Wan-AI/Wan2.2' in self.model_name:
182
+ if z.dim() == 5:
183
+ if self.temporal_scale != 1:
184
+ z = z.clone()
185
+ z[:, 1:] = z[:, 1:] / self.temporal_scale
186
+ x = rearrange(self.vae.decode(rearrange(z, 'b t c h w -> b c t h w'), self.vae.scale), 'b c t h w -> b t c h w')
187
+ else:
188
+ x = rearrange(self.vae.decode(rearrange(z, 'b c h w -> b c 1 h w'), self.vae.scale), 'b c 1 h w -> b c h w')
189
+ else:
190
+ shape = [1, 1, -1, 1, 1] if z.dim() == 5 else [1, -1, 1, 1]
191
+ scale = torch.tensor(self.latents_std, device=z.device).view(*shape)
192
+ shift = torch.tensor(self.latents_mean, device=z.device).view(*shape)
193
+ z = z * scale + shift
194
+ if z.dim() == 5:
195
+ x = rearrange(self._decode(rearrange(z, 'b t c h w -> b c t h w')).sample, 'b c t h w -> b t c h w')
196
+ else:
197
+ x = rearrange(self._decode(rearrange(z, 'b c h w -> b c 1 h w')).sample, 'b c 1 h w -> b c h w')
198
+ else:
199
+ z = z / self.scaling_factor
200
+ if z.dim() == 5: # (b, t, c, h, w)
201
+ x = rearrange(self._decode(rearrange(z, 'b t c h w -> (b t) c h w')).sample, '(b t) c h w -> b t c h w', t=z.shape[1])
202
+ else:
203
+ x = self._decode(z).sample
204
+ return x
205
+
206
+ @torch.no_grad()
207
+ def adapter_denoise(self, z, total_steps=100, noise_std=0.3):
208
+ self.dit_pipe.scheduler.set_timesteps(total_steps)
209
+ timesteps = self.dit_pipe.scheduler.timesteps
210
+ one = torch.ones(z.shape[0], device=z.device)
211
+ target_alpha2 = 1 / (1 + noise_std ** 2)
212
+ target_t = (torch.abs(self.dit_pipe.scheduler.alphas_cumprod - target_alpha2)).argmin().item()
213
+ z = z * np.sqrt(target_alpha2) # normalize the latent
214
+ for it in range(len(timesteps)):
215
+ if timesteps[it] > target_t: continue
216
+ noise_pred = self.dit_pipe.transformer(z, one * timesteps[it], class_labels=one.long() * 1000).sample
217
+ model_output = torch.split(noise_pred, self.dit_pipe.transformer.config.in_channels, dim=1)[0]
218
+ z = self.dit_pipe.scheduler.step(model_output, timesteps[it], z).prev_sample
219
+ return z
220
+
221
+
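A standalone sketch (not from the committed file) of the per-channel latent normalization used by encode/decode above for the video VAEs: encode applies (z - mean) / std with the statistics broadcast over [1, 1, -1, 1, 1], and decode inverts it exactly.

import torch

z = torch.randn(2, 4, 16, 8, 8)                     # (b, t, c, h, w) latents, 16 channels
mean, std = torch.randn(16), torch.rand(16) + 0.5   # per-channel statistics
shape = [1, 1, -1, 1, 1]
z_norm = (z - mean.view(*shape)) / std.view(*shape)
z_back = z_norm * std.view(*shape) + mean.view(*shape)
assert torch.allclose(z, z_back, atol=1e-5)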
222
+ def setup_vae(args, dist, device='cuda'):
223
+ """Setup VAE model with given arguments."""
224
+ print(f'Loading VAE {args.vae}...')
225
+ # setup VAE
226
+ vae = VAE(args.vae, dist=dist, adapter=getattr(args, "vae_adapter", None)).to(device)
227
+
228
+ # (optional) load pretrained VAE
229
+ if getattr(args, "finetuned_vae", None) is not None and args.finetuned_vae != 'none':
230
+ vae_task_id = args.finetuned_vae
231
+ local_folder = args.logdir / 'vae'
232
+ local_folder.mkdir(parents=True, exist_ok=True)
233
+
234
+ # Try to load from local path first
235
+ if vae_task_id == "px82zaheuu":
236
+ local_path = local_folder / "pytorch_model.bin"
237
+ if local_path.exists():
238
+ finetuned_vae_state = torch.load(local_path, map_location="cpu", weights_only=False)
239
+ renamed_state = OrderedDict()
240
+ for key in finetuned_vae_state:
241
+ new_key = key.replace("encoder.0", "encoder").replace("encoder.1", "quant_conv").replace("decoder.0", "post_quant_conv").replace("decoder.1", "decoder")
242
+ renamed_state[new_key] = finetuned_vae_state[key]
243
+ vae.vae.load_state_dict(renamed_state)
244
+ print(f'Loaded finetuned VAE {vae_task_id}')
245
+ else:
246
+ print(f"Warning: Finetuned VAE weights for {vae_task_id} not found at {local_path}")
247
+ print("Please provide finetuned VAE weights manually or set finetuned_vae to 'none'")
248
+ else:
249
+ # Try to load general task weights
250
+ local_path = local_folder / f"{vae_task_id}.pth"
251
+ if local_path.exists():
252
+ vae.load_state_dict(torch.load(local_path, map_location='cpu', weights_only=False))
253
+ print(f'Loaded finetuned VAE {vae_task_id}')
254
+ else:
255
+ print(f"Warning: Finetuned VAE weights for {vae_task_id} not found at {local_path}")
256
+ print("Please provide finetuned VAE weights manually or set finetuned_vae to 'none'")
257
+
258
+ return vae
259
+
260
+
261
+ # ==== Text Encoder Classes and Setup ====
262
+
263
+ class LookupTableTokenizer:
264
+ """Simple lookup table tokenizer for label-based datasets."""
265
+
266
+ def __init__(self, vocab_file):
267
+ from .common import read_tsv
268
+ self.vocab = {l[0]: i for i, l in enumerate(read_tsv(f'configs/dataset/{vocab_file}'))}
269
+ self.empty_id = len(self.vocab)
270
+
271
+ def __len__(self):
272
+ return len(self.vocab)
273
+
274
+ def __call__(self, text):
275
+ return {'input_ids': torch.tensor([[self.vocab.get(t, self.empty_id)] for t in text], dtype=torch.long)}
276
+
277
+
278
+ class LabelEmbdder(nn.Module):
279
+ """Simple label embedder for classification-style conditioning."""
280
+
281
+ def __init__(self, num_classes):
282
+ super().__init__()
283
+ self.num_classes = num_classes
284
+ self.config = type('Config', (), {'hidden_size': num_classes + 1})()
285
+ self.Embedding = nn.Parameter(torch.eye(num_classes+1), requires_grad=False)
286
+
287
+ def forward(self, y):
288
+ return F.embedding(y, self.Embedding)
289
+
290
+
291
+ class TextEmbedder(nn.Module):
292
+ """Text embedder for large language models like Gemma."""
293
+
294
+ def __init__(self, config):
295
+ super().__init__()
296
+ if hasattr(config, "text_config"): # Gemma3
297
+ self.config = config.text_config
298
+ self.vocab_size = config.image_token_index
299
+ else:
300
+ self.config = config
301
+ self.vocab_size = config.vocab_size
302
+ self.text_token_embedder = nn.Embedding(
303
+ self.vocab_size, self.config.hidden_size)
304
+ self.text_token_embedder.weight.requires_grad = False
305
+ self.normalizer = float(self.config.hidden_size) ** 0.5
306
+
307
+ def forward(self, x):
308
+ x = self.text_token_embedder(x)
309
+ return (x * self.normalizer).to(x.dtype)
310
+
311
+ @torch.no_grad()
312
+ def sample(
313
+ self,
314
+ hidden_states: torch.Tensor,
315
+ temperatures: Union[float, None] = 1.0,
316
+ top_ps: float = 0.95,
317
+ top_ks: int = 64,
318
+ embedding_bias: Optional[torch.Tensor] = None,
319
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
320
+
321
+ device = hidden_states.device
322
+ batch_size = hidden_states.shape[0]
323
+ temperatures = None if not temperatures else torch.FloatTensor(
324
+ [temperatures] * batch_size).to(device)
325
+ top_ps = torch.FloatTensor([top_ps] * batch_size).to(device)
326
+ top_ks = torch.LongTensor([top_ks] * batch_size).to(device)
327
+
328
+ # Select the last element for each sequence.
329
+ hidden_states = hidden_states[:, -1]
330
+ embedding = self.text_token_embedder.weight
331
+ logits = torch.matmul(hidden_states, embedding.t())
332
+ if embedding_bias is not None:
333
+ logits += embedding_bias
334
+
335
+ if hasattr(self.config, 'final_logit_softcapping') and self.config.final_logit_softcapping is not None:
336
+ logits = logits / self.config.final_logit_softcapping
337
+ logits = torch.tanh(logits)
338
+ logits = logits * self.config.final_logit_softcapping
339
+
340
+ if temperatures is None:
341
+ return torch.argmax(logits, dim=-1).squeeze(dim=-1), logits
342
+
343
+ # Apply temperature scaling.
344
+ logits.div_(temperatures.unsqueeze(dim=1))
345
+
346
+ # Apply top-k and top-p filtering (simplified version)
347
+ probs = F.softmax(logits, dim=-1)
348
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(dim=-1)
349
+
350
+ return next_tokens, logits
351
+
352
+
353
+ def setup_encoder(args, dist, device='cuda'):
354
+ """Setup text encoder based on arguments."""
355
+ assert args.txt_size > 0, 'txt_size must be set'
356
+ print(f'Loading text encoder {args.text}...')
357
+
358
+ if args.text.endswith('.vocab'): # caption -> label
359
+ tokenizer = LookupTableTokenizer(args.text)
360
+ text_encoder = LabelEmbdder(len(tokenizer)).to(device)
361
+ block_name = 'Embedding'
362
+
363
+ elif args.text == 't5xxl':
364
+ tokenizer = T5Tokenizer.from_pretrained("THUDM/CogView3-Plus-3B", subfolder="tokenizer")
365
+ text_encoder = T5EncoderModel.from_pretrained("THUDM/CogView3-Plus-3B",
366
+ subfolder="text_encoder", torch_dtype=torch.bfloat16).to(device)
367
+ block_name = 'T5Block'
368
+
369
+ elif args.text == 't5xl' or args.text.startswith('google'):
370
+ tokenizer = AutoTokenizer.from_pretrained(args.text)
371
+ text_encoder = AutoModel.from_pretrained(args.text, add_cross_attention=False).encoder.to(device)
372
+ block_name = 'T5Block'
373
+
374
+ elif args.text == "gemma" or args.text.startswith("Alpha-VLLM"):
375
+ tokenizer = AutoTokenizer.from_pretrained(args.text, subfolder="tokenizer")
376
+ text_encoder = AutoModel.from_pretrained(args.text, subfolder="text_encoder", torch_dtype=torch.bfloat16).to(device)
377
+ block_name = 'GemmaDecoderLayer'
378
+
379
+ elif args.text in ["gemma3_4b", "gemma3_1b", "gemma2_2b"]: # NOTE: special text embedder
380
+ model_name = args.text
381
+ repo_name = {"gemma3_4b": "google/gemma-3-4b-it",
382
+ "gemma3_1b": "google/gemma-3-1b-it",
383
+ "gemma2_2b": "google/gemma-2-2b-it"}[model_name]
384
+ tokenizer = AutoTokenizer.from_pretrained(repo_name)
385
+ config = AutoConfig.from_pretrained(repo_name)
386
+
387
+ text_encoder = TextEmbedder(config).to(device)
388
+ block_name = "Embedding"
389
+
390
+ # Try to load embedding layer
391
+ local_path = pathlib.Path(args.logdir) / model_name
392
+ local_path.mkdir(parents=True, exist_ok=True)
393
+ local_path = local_path / 'gemma_text_embed.pth'
394
+ if local_path.exists():
395
+ text_encoder.load_state_dict(torch.load(local_path, map_location='cpu'))
396
+ print(f'Loaded text encoder weights for {model_name}')
397
+ else:
398
+ print(f"Warning: Text encoder weights for {model_name} not found at {local_path}")
399
+ print("Please provide text encoder weights manually or use a different text encoder")
400
+
401
+ else:
402
+ raise NotImplementedError(f'Unknown text encoder {args.text}')
403
+
404
+ text_encoder.base_block_name = block_name
405
+ return tokenizer, text_encoder
utils/training.py ADDED
@@ -0,0 +1,232 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Training utilities for STARFlow.
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.distributed
13
+ import torch.distributed.checkpoint as dcp
14
+ from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy
15
+ from torch.distributed._tensor import DeviceMesh
16
+ from torch.distributed.device_mesh import init_device_mesh
17
+ import datetime
18
+ import math
19
+ import os
20
+ import random
21
+ import numpy as np
22
+ import contextlib
23
+ import typing as t
24
+ from typing import Any, Dict, List, Union, Optional
25
+ from collections import defaultdict, OrderedDict
26
+ from fnmatch import fnmatch
27
+
28
+
29
+ # ==== Learning Rate Schedule ====
30
+
31
+ class CosineLRSchedule(torch.nn.Module):
32
+ counter: torch.Tensor
33
+
34
+ def __init__(self, optimizer, warmup_steps: int, total_steps: int, min_lr: float, max_lr: float):
35
+ super().__init__()
36
+ self.register_buffer('counter', torch.zeros(()))
37
+ self.warmup_steps = warmup_steps
38
+ self.total_steps = total_steps
39
+ self.optimizer = optimizer
40
+ self.min_lr = min_lr
41
+ self.start_lr = min(min_lr, 1e-6)
42
+ self.max_lr = max_lr
43
+ self.set_lr(min_lr)
44
+
45
+ def set_lr(self, lr: float) -> float:
46
+ if self.min_lr <= lr <= self.max_lr:
47
+ for pg in self.optimizer.param_groups:
48
+ pg['lr'] = lr
49
+ return self.optimizer.param_groups[0]['lr'] # report the lr actually in effect on the optimizer
50
+
51
+ def step(self) -> float:
52
+ with torch.no_grad():
53
+ counter = self.counter.add_(1).item()
54
+ if self.counter <= self.warmup_steps:
55
+ new_lr = self.start_lr + counter / self.warmup_steps * (self.max_lr - self.start_lr)
56
+ return self.set_lr(new_lr)
57
+
58
+ t = (counter - self.warmup_steps) / (self.total_steps - self.warmup_steps)
59
+ new_lr = self.min_lr + 0.5 * (1 + math.cos(math.pi * t)) * (self.max_lr - self.min_lr)
60
+ return self.set_lr(new_lr)
61
+
62
+
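A minimal usage sketch for CosineLRSchedule, assuming utils.training imports cleanly (it requires a recent torch build with the FSDP2 composable APIs): the schedule warms up linearly to max_lr over warmup_steps, then follows a half-cosine down to min_lr, with step() called once per optimizer step.

import torch
from utils.training import CosineLRSchedule   # assumed module path

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
sched = CosineLRSchedule(opt, warmup_steps=10, total_steps=100, min_lr=1e-5, max_lr=1e-3)
lrs = []
for _ in range(100):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])     # rises for 10 steps, then decays toward min_lr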
63
+ # ==== Distributed Training ====
64
+
65
+ class Distributed:
66
+ timeout: float = 72000
67
+
68
+ def __init__(self):
69
+ if os.environ.get('MASTER_PORT'): # When running with torchrun
70
+ self.rank = int(os.environ['RANK'])
71
+ self.local_rank = int(os.environ['LOCAL_RANK'])
72
+ self.world_size = int(os.environ['WORLD_SIZE'])
73
+ self.distributed = True
74
+ torch.distributed.init_process_group(
75
+ backend='nccl',
76
+ init_method='env://',
77
+ world_size=self.world_size,
78
+ timeout=datetime.timedelta(seconds=self.timeout),
79
+ rank=self.rank,
80
+ )
81
+ else: # When running with python for debugging
82
+ self.rank, self.local_rank, self.world_size = 0, 0, 1
83
+ self.distributed = False
84
+ torch.cuda.set_device(self.local_rank)
85
+ self.barrier()
86
+
87
+ def barrier(self) -> None:
88
+ if self.distributed:
89
+ torch.distributed.barrier()
90
+
91
+ def gather_concat(self, x: torch.Tensor) -> torch.Tensor:
92
+ if not self.distributed:
93
+ return x
94
+ x_list = [torch.empty_like(x) for _ in range(self.world_size)]
95
+ torch.distributed.all_gather(x_list, x)
96
+ return torch.cat(x_list)
97
+
98
+ def reduce(self, x):
99
+ if not self.distributed:
100
+ return x
101
+ torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
102
+ return x
103
+
104
+ def __del__(self):
105
+ if self.distributed:
106
+ torch.distributed.destroy_process_group()
107
+
108
+
109
+ def get_local_rank() -> int:
110
+ if os.environ.get('MASTER_PORT'): # When running with torchrun
111
+ return int(os.environ['LOCAL_RANK'])
112
+ return 0
113
+
114
+
115
+ def get_device_mesh(dp_size: int, tp_size: int = 1) -> DeviceMesh:
116
+ """Create DeviceMesh based on tensor and data parallelism configuration."""
117
+ # by default, I will use TP=1 for simplicity
118
+ mesh_shape = (dp_size, tp_size)
119
+ names = ("dp", "tp")
120
+ return init_device_mesh("cuda", mesh_shape=mesh_shape, mesh_dim_names=names)
121
+
122
+
123
+ def wrap_matching_layers(
124
+ model: nn.Module,
125
+ layer_patterns: t.List[str],
126
+ wrapper_fn: t.Callable[[nn.Module], nn.Module],
127
+ ):
128
+ """
129
+ Recursively wraps submodules in the order they appear in layer_patterns.
130
+ For each pattern (in order), we do a pass over the model and wrap matches.
131
+ """
132
+ def _wrap_single_pattern(mod: nn.Module, pattern: str):
133
+ """
134
+ Recurse over mod, wrapping submodules that match `pattern`.
135
+ We do a post-order traversal so children get wrapped before the parent.
136
+ """
137
+ for child_name, child_module in list(mod.named_children()):
138
+ # Wrap grandchildren first.
139
+ _wrap_single_pattern(child_module, pattern)
140
+
141
+ # Check if the child's class name matches the pattern.
142
+ if fnmatch(child_module.__class__.__name__, pattern):
143
+ # Replace the child in the parent.
144
+ wrapped = wrapper_fn(child_module)
145
+ setattr(mod, child_name, wrapped)
146
+
147
+ # We do a pass for each pattern in order
148
+ for pattern in layer_patterns:
149
+ _wrap_single_pattern(model, pattern)
150
+
151
+
152
+ def parallelize_model(args, model: nn.Module, dist: Distributed, device='cuda', block_names=['AttentionBlock']) -> nn.Module:
153
+ if not getattr(args, "fsdp", False): # use standard DDP
154
+ model = model.to(device=device)
155
+ if dist.distributed:
156
+ print(f"Using DDP")
157
+ model_ddp = torch.nn.parallel.DistributedDataParallel(model, device_ids=[dist.local_rank])
158
+ else:
159
+ model_ddp = model # compatible with DDP
160
+ return model, model_ddp
161
+
162
+ # Instantiate mixed precision policy from config
163
+ mp_policy = MixedPrecisionPolicy(
164
+ param_dtype=torch.bfloat16,
165
+ reduce_dtype=torch.bfloat16,
166
+ output_dtype=torch.bfloat16,
167
+ cast_forward_inputs=True
168
+ )
169
+ print(f"Using FSDP2 with: {mp_policy}")
170
+
171
+ # Apply FSDP wrapping based on specified parallel dimensions
172
+ dp_mesh = get_device_mesh(dist.world_size)["dp"]
173
+
174
+ # Configure core FSDP parameters
175
+ fsdp_config = {"mp_policy": mp_policy, "mesh": dp_mesh, "reshard_after_forward": True}
176
+
177
+ # Wrap specified layer patterns with FSDP
178
+ wrap_matching_layers(model, block_names, lambda m: fully_shard(m, **fsdp_config))
179
+
180
+ # Then wrap full model (remaining modules are captured with this)
181
+ model = fully_shard(model, **fsdp_config)
182
+ model = model.to(device=device)
183
+ return model, model # for compatibility with DDP
184
+
185
+
186
+ def save_model(args, dist, model, model_ckpt_file):
187
+ states = model.state_dict()
188
+ if not getattr(args, "fsdp", False): # save DDP checkpoints
189
+ if dist.local_rank == 0:
190
+ torch.save(states, model_ckpt_file)
191
+ else: # save FSDP checkpoints
192
+ dcp.save(states, checkpoint_id=str(model_ckpt_file))
193
+
194
+
195
+ def save_optimizer(args, dist, optimizer, lr_schedule, opt_ckpt_file):
196
+ optim_states, lr_states = optimizer.state_dict(), lr_schedule.state_dict()
197
+ if not getattr(args, "fsdp", False): # save DDP checkpoints
198
+ if dist.local_rank == 0:
199
+ torch.save({"optimizer": optim_states, "lr_schedule": lr_states}, opt_ckpt_file)
200
+ else:
201
+ filename = str(opt_ckpt_file)
202
+ dcp.save(optim_states, checkpoint_id=f"{filename}/optimizer")
203
+ torch.save(lr_states, f"{filename}/lr_schedule.bin") # lr_schedule is not fsdp
204
+
205
+
206
+ @contextlib.contextmanager
207
+ def _fsdp2_no_sync(module, sync):
208
+ # v2 APIs
209
+ module.set_requires_gradient_sync(sync, recurse=True)
210
+ try:
211
+ yield
212
+ finally:
213
+ module.set_requires_gradient_sync(True, recurse=True)
214
+
215
+
216
+ def sync_ctx(model, sync=True):
217
+ if hasattr(model, 'set_requires_gradient_sync'):
218
+ return _fsdp2_no_sync(model, sync)
219
+ elif not sync and hasattr(model, 'no_sync'):
220
+ return model.no_sync()
221
+ return contextlib.nullcontext()
222
+
223
+
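A hedged sketch of using sync_ctx for gradient accumulation with either DDP or FSDP2: gradient synchronization is suppressed on every micro-batch except the last. model_ddp, optimizer, micro_batches, and compute_loss are placeholders, not names from this repository.

for i, batch in enumerate(micro_batches):
    last = (i == len(micro_batches) - 1)
    with sync_ctx(model_ddp, sync=last):   # DDP no_sync() / FSDP2 set_requires_gradient_sync
        loss = compute_loss(model_ddp, batch)
        loss.backward()
optimizer.step()
optimizer.zero_grad()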
224
+ # ==== Utility Functions ====
225
+
226
+ def set_random_seed(seed: int) -> None:
227
+ """Set random seed for reproducibility."""
228
+ random.seed(seed)
229
+ np.random.seed(seed)
230
+ torch.manual_seed(seed)
231
+ torch.cuda.manual_seed(seed)
232
+ torch.cuda.manual_seed_all(seed)