leoeric committed
Commit 0b4562b · 0 Parent(s)

Initial commit for HF Space - code files only
app.py ADDED
@@ -0,0 +1,249 @@
+ """
+ Hugging Face Space for STARFlow
+ Text-to-Image and Text-to-Video Generation
+
+ This app allows you to run STARFlow inference on Hugging Face GPU infrastructure.
+ """
+
+ import os
+ import gradio as gr
+ import torch
+ import subprocess
+ import pathlib
+ from pathlib import Path
+
+ # Check if running on Hugging Face Spaces
+ HF_SPACE = os.environ.get("SPACE_ID") is not None
+
+ # Verify CUDA availability (will be True on HF Spaces with GPU hardware)
+ if torch.cuda.is_available():
+     print(f"✅ CUDA available! Device: {torch.cuda.get_device_name(0)}")
+     print(f" CUDA Version: {torch.version.cuda}")
+     print(f" PyTorch Version: {torch.__version__}")
+ else:
+     print("⚠️ CUDA not available. Make sure GPU hardware is selected in Space settings.")
+
+ def generate_image(prompt, aspect_ratio, cfg, seed, checkpoint_file, config_path):
+     """Generate image from text prompt."""
+     if checkpoint_file is None:
+         return None, "Error: Please upload a checkpoint file."
+
+     # Handle Gradio file object
+     if hasattr(checkpoint_file, 'name'):
+         checkpoint_path = checkpoint_file.name
+     else:
+         checkpoint_path = str(checkpoint_file)
+
+     if not os.path.exists(checkpoint_path):
+         return None, f"Error: Checkpoint file not found at {checkpoint_path}."
+
+     if not config_path or not os.path.exists(config_path):
+         return None, "Error: Config file not found. Please ensure config file exists."
+
+     try:
+         # Create output directory
+         output_dir = Path("outputs")
+         output_dir.mkdir(exist_ok=True)
+
+         # Run sampling command
+         cmd = [
+             "python", "sample.py",
+             "--model_config_path", config_path,
+             "--checkpoint_path", checkpoint_path,
+             "--caption", prompt,
+             "--sample_batch_size", "1",
+             "--cfg", str(cfg),
+             "--aspect_ratio", aspect_ratio,
+             "--seed", str(seed),
+             "--save_folder", "1",
+             "--finetuned_vae", "none",
+             "--jacobi", "1",
+             "--jacobi_th", "0.001",
+             "--jacobi_block_size", "16"
+         ]
+
+         result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+         if result.returncode != 0:
+             return None, f"Error: {result.stderr}"
+
+         # Find the generated image
+         # The sample.py script saves to logdir/model_name/...
+         # We need to find the most recent output
+         output_files = list(output_dir.glob("**/*.png")) + list(output_dir.glob("**/*.jpg"))
+         if output_files:
+             latest_file = max(output_files, key=lambda p: p.stat().st_mtime)
+             return str(latest_file), "Success! Image generated."
+         else:
+             return None, "Error: Generated image not found."
+
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ def generate_video(prompt, aspect_ratio, cfg, seed, target_length, checkpoint_file, config_path, input_image):
+     """Generate video from text prompt."""
+     if checkpoint_file is None:
+         return None, "Error: Please upload a checkpoint file."
+
+     # Handle Gradio file object
+     if hasattr(checkpoint_file, 'name'):
+         checkpoint_path = checkpoint_file.name
+     else:
+         checkpoint_path = str(checkpoint_file)
+
+     if not os.path.exists(checkpoint_path):
+         return None, f"Error: Checkpoint file not found at {checkpoint_path}."
+
+     if not config_path or not os.path.exists(config_path):
+         return None, "Error: Config file not found. Please ensure config file exists."
+
+     # Handle input image
+     input_image_path = None
+     if input_image is not None:
+         if hasattr(input_image, 'name'):
+             input_image_path = input_image.name
+         else:
+             input_image_path = str(input_image)
+
+     try:
+         # Create output directory
+         output_dir = Path("outputs")
+         output_dir.mkdir(exist_ok=True)
+
+         # Run sampling command
+         cmd = [
+             "python", "sample.py",
+             "--model_config_path", config_path,
+             "--checkpoint_path", checkpoint_path,
+             "--caption", prompt,
+             "--sample_batch_size", "1",
+             "--cfg", str(cfg),
+             "--aspect_ratio", aspect_ratio,
+             "--seed", str(seed),
+             "--out_fps", "16",
+             "--save_folder", "1",
+             "--jacobi", "1",
+             "--jacobi_th", "0.001",
+             "--finetuned_vae", "none",
+             "--disable_learnable_denoiser", "0",
+             "--jacobi_block_size", "32",
+             "--target_length", str(target_length)
+         ]
+
+         if input_image_path and os.path.exists(input_image_path):
+             cmd.extend(["--input_image", input_image_path])
+         else:
+             cmd.extend(["--input_image", "none"])
+
+         result = subprocess.run(cmd, capture_output=True, text=True, cwd=os.getcwd())
+
+         if result.returncode != 0:
+             return None, f"Error: {result.stderr}"
+
+         # Find the generated video
+         output_files = list(output_dir.glob("**/*.mp4")) + list(output_dir.glob("**/*.gif"))
+         if output_files:
+             latest_file = max(output_files, key=lambda p: p.stat().st_mtime)
+             return str(latest_file), "Success! Video generated."
+         else:
+             return None, "Error: Generated video not found."
+
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="STARFlow - Text-to-Image & Video Generation") as demo:
+     gr.Markdown("""
+     # STARFlow: Scalable Transformer Auto-Regressive Flow
+
+     Generate high-quality images and videos from text prompts using STARFlow models.
+
+     **Note**: You'll need to upload model checkpoints. Check the README for model download links.
+     """)
+
+     with gr.Tabs():
+         with gr.Tab("Text-to-Image"):
+             with gr.Row():
+                 with gr.Column():
+                     image_prompt = gr.Textbox(
+                         label="Prompt",
+                         placeholder="a film still of a cat playing piano",
+                         lines=3
+                     )
+                     image_checkpoint = gr.File(
+                         label="Model Checkpoint (.pth file)",
+                         file_types=[".pth"]
+                     )
+                     image_config = gr.Textbox(
+                         label="Config Path",
+                         value="configs/starflow_3B_t2i_256x256.yaml",
+                         placeholder="configs/starflow_3B_t2i_256x256.yaml"
+                     )
+                     image_aspect = gr.Dropdown(
+                         choices=["1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4"],
+                         value="1:1",
+                         label="Aspect Ratio"
+                     )
+                     image_cfg = gr.Slider(1.0, 10.0, value=3.6, step=0.1, label="CFG Scale")
+                     image_seed = gr.Number(value=999, label="Seed", precision=0)
+                     image_btn = gr.Button("Generate Image", variant="primary")
+
+                 with gr.Column():
+                     image_output = gr.Image(label="Generated Image")
+                     image_status = gr.Textbox(label="Status", interactive=False)
+
+             image_btn.click(
+                 fn=generate_image,
+                 inputs=[image_prompt, image_aspect, image_cfg, image_seed, image_checkpoint, image_config],
+                 outputs=[image_output, image_status],
+                 show_progress=True
+             )
+
+         with gr.Tab("Text-to-Video"):
+             with gr.Row():
+                 with gr.Column():
+                     video_prompt = gr.Textbox(
+                         label="Prompt",
+                         placeholder="a corgi dog looks at the camera",
+                         lines=3
+                     )
+                     video_checkpoint = gr.File(
+                         label="Model Checkpoint (.pth file)",
+                         file_types=[".pth"]
+                     )
+                     video_config = gr.Textbox(
+                         label="Config Path",
+                         value="configs/starflow-v_7B_t2v_caus_480p.yaml",
+                         placeholder="configs/starflow-v_7B_t2v_caus_480p.yaml"
+                     )
+                     video_aspect = gr.Dropdown(
+                         choices=["16:9", "1:1", "4:3"],
+                         value="16:9",
+                         label="Aspect Ratio"
+                     )
+                     video_cfg = gr.Slider(1.0, 10.0, value=3.5, step=0.1, label="CFG Scale")
+                     video_seed = gr.Number(value=99, label="Seed", precision=0)
+                     video_length = gr.Slider(81, 481, value=81, step=80, label="Target Length (frames)")
+                     video_input_image = gr.File(
+                         label="Input Image (optional, for image-to-video)",
+                         file_types=["image"]
+                     )
+                     video_btn = gr.Button("Generate Video", variant="primary")
+
+                 with gr.Column():
+                     video_output = gr.Video(label="Generated Video")
+                     video_status = gr.Textbox(label="Status", interactive=False)
+
+             video_btn.click(
+                 fn=generate_video,
+                 inputs=[video_prompt, video_aspect, video_cfg, video_seed, video_length,
+                         video_checkpoint, video_config, video_input_image],
+                 outputs=[video_output, video_status],
+                 show_progress=True
+             )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
+
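Note on checkpoints: the UI above expects the user to upload a `.pth` file by hand. A minimal, hedged sketch of an alternative is shown below — fetching the weights from the Hugging Face Hub at startup instead. The repo id and filename are hypothetical placeholders, not confirmed locations of the STARFlow weights; see the README for the real download links.

# Sketch only: fetch a checkpoint from the Hub instead of a manual upload.
# repo_id and filename below are hypothetical placeholders.
from huggingface_hub import hf_hub_download

def fetch_checkpoint(repo_id: str, filename: str) -> str:
    """Download (or reuse a cached copy of) a checkpoint and return its local path."""
    return hf_hub_download(repo_id=repo_id, filename=filename)

# Example (hypothetical identifiers):
# checkpoint_path = fetch_checkpoint("some-org/starflow-weights", "starflow_3B_t2i_256x256.pth")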
configs/captions/ar_video_prompts_original.txt ADDED
@@ -0,0 +1,48 @@
+ A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Historical footage of California during the gold rush.
+ A close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ The camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ A drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ An extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Step‑printing scene of a person running, cinematic film shot in 35 mm.
+ An adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
+ A Chinese man sits at a table and eats noodles with chopsticks.
+ Carefully pouring the milk into the cup, the milk flows smoothly and the cup gradually fills with milky‑white color.
+ A rally car taking a fast turn on a track.
+ Close‑up of a bright blue parrot's feathers glittering in the light, showing vibrant colors.
+ A white and orange tabby cat happily darts through a dense garden from a ground‑level perspective, cinematic warm tones and grain.
+ A FPV drone shot through a castle on a cliff.
+ Over‑the‑shoulder shot of a woman running and watching a rocket in the distance.
+ A pink pig running fast toward the camera in an alley in Tokyo.
+ In a still frame, a tranquil pond fringed by weeping cherry trees, blossoms drifting onto the glassy surface.
+ In a still frame, the Parthenon's majestic Doric columns stand atop the Acropolis, framed by the Athenian landscape.
+ A person drinking coffee in a cafe.
+ A motorcycle accelerating to gain speed.
+ A train speeding down the tracks.
+ Panda playing the guitar.
+ A slow cinematic push‑in on an ostrich standing in a 1980s kitchen.
+ A cyclone of broken glass in an urban alleyway, dynamic movement.
+ A man standing in front of a burning building giving the thumbs‑up sign.
+ A woman singing on a concert stage with a bright backlight.
+ Dragon‑toucan walking through the Serengeti.
+ Aerial view of Santorini during the blue hour, showcasing white Cycladic buildings with blue domes and the caldera.
+ Animated scene: a short fluffy monster kneels beside a melting red candle, gazing at the flame with wonder.
+ A cyclist powering up a steep hill in a road race.
+ A trio of seahorses holding onto seagrass with their tails.
+ A seal eagerly catching tossed fish from a trainer.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: historical footage of California during the gold rush.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: the camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: step‑printing scene of a person running, cinematic film shot in 35 mm.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
configs/captions/ar_video_prompts_videogen.txt ADDED
@@ -0,0 +1,36 @@
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: drone view of waves crashing against the rugged cliffs along Big Sur’s Garay Point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of Big Sur.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: historical footage of California during the gold rush.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a close‑up view of a glass sphere that has a zen garden within it. There is a small dwarf in the sphere who is raking the zen garden and creating patterns in the sand.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: extreme close‑up of a 24‑year‑old woman’s eye blinking, standing in Marrakech during magic hour, cinematic film shot in 70 mm, depth of field, vivid colors, cinematic.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: 3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: the camera follows behind a white vintage SUV with a black roof rack as it speeds up a steep dirt road surrounded by pine trees on a mountain slope; dust kicks up from its tires. The sunlight shines on the SUV as it accelerates, casting a warm glow over the scene. The dirt road curves gently into the distance, redwoods on either side.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a drone camera circles around a beautiful historic church built on a rocky outcropping along the Amalfi Coast; the view showcases intricate architectural details. Waves crash against the rocks below while the warm glow of the afternoon sun bathes the scene.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an extreme close‑up of a gray‑haired man with a beard in his 60s sits at a café in Paris, deep in thought. At the end he offers a subtle closed‑mouth smile as if he found the answer to the mystery of life. Dramatic cinematic lighting.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: step‑printing scene of a person running, cinematic film shot in 35 mm.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: an adorable happy otter confidently stands on a surfboard wearing a yellow lifejacket, riding along turquoise tropical waters near lush islands, 3D digital render art style.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a Chinese man sits at a table and eats noodles with chopsticks.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: carefully pouring the milk into the cup, the milk flows smoothly and the cup gradually fills with milky‑white color.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a rally car taking a fast turn on a track.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: close‑up of a bright blue parrot's feathers glittering in the light, showing vibrant colors.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a white and orange tabby cat happily darts through a dense garden from a ground‑level perspective, cinematic warm tones and grain.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a FPV drone shot through a castle on a cliff.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: over‑the‑shoulder shot of a woman running and watching a rocket in the distance.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a pink pig running fast toward the camera in an alley in Tokyo.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: in a still frame, a tranquil pond fringed by weeping cherry trees, blossoms drifting onto the glassy surface.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: in a still frame, the Parthenon's majestic Doric columns stand atop the Acropolis, framed by the Athenian landscape.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a person drinking coffee in a cafe.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a motorcycle accelerating to gain speed.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a train speeding down the tracks.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: panda playing the guitar.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a slow cinematic push‑in on an ostrich standing in a 1980s kitchen.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a cyclone of broken glass in an urban alleyway, dynamic movement.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a man standing in front of a burning building giving the thumbs‑up sign.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a woman singing on a concert stage with a bright backlight.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: dragon‑toucan walking through the Serengeti.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: aerial view of Santorini during the blue hour, showcasing white Cycladic buildings with blue domes and the caldera.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: animated scene: a short fluffy monster kneels beside a melting red candle, gazing at the flame with wonder.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a cyclist powering up a steep hill in a road race.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a trio of seahorses holding onto seagrass with their tails.
+ Ultra‑realistic 4K 60 fps Dolby Vision HDR cinematic video, shot on a digital IMAX camera with a 35 mm equivalent lens, ray‑traced global illumination, volumetric lighting, physically based rendering, hyper‑detailed textures, smooth Steadicam or drone tracking, shallow depth‑of‑field, immersive spatial audio. Scene: a seal eagerly catching tossed fish from a trainer.
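The caption files above hold one prompt per line. A minimal sketch of how such a file could be swept through the same `sample.py` command line that `app.py` assembles is shown below; it assumes `sample.py` accepts one `--caption` per invocation (the script may well support a caption-file flag directly, which is not shown here), and the config/checkpoint paths are just examples.

# Sketch: run sample.py once per prompt in a caption file, mirroring the flags used in app.py.
import subprocess
from pathlib import Path

def sample_from_caption_file(caption_file: str, config_path: str, checkpoint_path: str) -> None:
    for prompt in Path(caption_file).read_text(encoding="utf-8").splitlines():
        prompt = prompt.strip()
        if not prompt:
            continue  # skip blank lines
        cmd = [
            "python", "sample.py",
            "--model_config_path", config_path,
            "--checkpoint_path", checkpoint_path,
            "--caption", prompt,
            "--sample_batch_size", "1",
        ]
        subprocess.run(cmd, check=True)

# Example (paths are illustrative):
# sample_from_caption_file("configs/captions/ar_video_prompts_videogen.txt",
#                          "configs/starflow-v_7B_t2v_caus_480p.yaml", "checkpoint.pth")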
configs/captions/editing.txt ADDED
@@ -0,0 +1,118 @@
+ A red vintage bicycle leaning against a brick wall [edit] change the bicycle’s color to teal
+ A calico cat sleeping on a sunny windowsill [edit] add a small potted cactus beside the cat
+ A steaming mug of coffee on a marble countertop [edit] replace the coffee with green matcha tea
+ A wooden rowboat floating on a crystal lake [edit] change the time of day to golden sunset
+ A rustic bookshelf filled with old novels [edit] insert a glowing crystal on the middle shelf
+ A bowl of fresh strawberries on a white tablecloth [edit] remove three strawberries from the bowl
+ A snow-covered cabin in a pine forest [edit] turn the scene into midsummer with lush greenery
+ A street artist painting a mural on a brick wall [edit] make the mural feature abstract geometric shapes
+ A golden retriever running across a beach [edit] add a colorful kite flying in the sky
+ A glass terrarium containing succulents [edit] replace the succulents with blooming orchids
+ A ceramic teapot beside two porcelain cups [edit] change the teapot design to blue willow pattern
+ A mountain landscape under clear blue sky [edit] add dramatic storm clouds rolling in
+ A violin resting on velvet fabric [edit] apply a vintage sepia tone to the entire image
+ A busy city street at night with neon signs [edit] remove all people from the scene
+ A marble statue in a museum gallery [edit] add a soft spotlight highlighting the statue’s face
+ A plate of sushi arranged neatly on slate [edit] replace tuna pieces with avocado slices
+ A lighthouse overlooking a rocky shore [edit] change the lighthouse stripes to red and white
+ A cyclist on a winding countryside road [edit] turn the season to autumn with orange foliage
+ A steaming bowl of ramen on a wooden table [edit] add extra slices of boiled egg on top
+ A young sapling planted in fertile soil [edit] transform the sapling into a full-grown oak
+ An astronaut floating in outer space [edit] replace the Earth in background with Saturn
+ A quaint café terrace with empty chairs [edit] add a black cat sitting on one chair
+ A desert scene with a single cactus [edit] introduce a small oasis with palm trees in the distance
+ A handwritten love letter on parchment [edit] change the ink color to royal blue
+ A crystal chandelier hanging in a ballroom [edit] increase brightness to make it sparkle intensely
+ A surfer riding a large ocean wave [edit] add a dolphin jumping alongside the surfer
+ A clay pot spinning on a pottery wheel [edit] apply a rainbow glaze pattern to the pot
+ A quiet library with tall wooden shelves [edit] add floating dust motes in sunbeams
+ A steaming cup of herbal tea with lemon slice [edit] remove the lemon and add a sprig of mint
+ A sleek black sports car on a mountain road [edit] change the car color to bright yellow
+ A picnic blanket spread on green grass [edit] add a wicker basket filled with fruit
+ A snowy owl perched on a pine branch [edit] make the owl’s eyes a vibrant amber color
+ A cobblestone alley lined with lanterns [edit] turn the scene into daytime with clear sky
+ A chef plating gourmet pasta in a kitchen [edit] replace the pasta sauce with pesto
+ A koi pond with lily pads and flowers [edit] add five more colorful koi fish
+ A rustic wooden door with iron hinges [edit] paint the door a weathered turquoise shade
+ A bouquet of wildflowers in a glass vase [edit] remove two tallest flowers to shorten arrangement
+ A stack of pancakes topped with syrup [edit] add fresh blueberries and powdered sugar
+ A commuter train passing through countryside [edit] change the season to winter with snowfall
+ A puppy wearing a red bandana [edit] change the bandana color to pastel green
+ An artist’s desk cluttered with brushes [edit] neatly organize the brushes into a cup
+ A glowing full moon over a calm lake [edit] replace the moon with a crescent shape
+ A bamboo steamer filled with dumplings [edit] add steam rising more visibly from the dumplings
+ A violinist performing on a small stage [edit] dim the background lights for a spotlight effect
+ A vineyard at harvest time [edit] change grapes from green to deep purple
+ A rustic windmill in a sunflower field [edit] remove the windmill blades entirely
+ A glass of red wine on a wooden barrel [edit] switch the wine color to white wine
+ A paperback book open on beach sand [edit] add gentle ocean waves reaching the book edge
+ A hallway lined with ornate mirrors [edit] turn the flooring into black-and-white checkerboard
+ A basket of fresh oranges [edit] add a peeled orange with segments showing
+ A vintage typewriter on a desk [edit] change the paper to display the words “Hello World”
+ A fisherman casting a line at dawn [edit] move the sun position higher into late morning
+ A city skyline at twilight [edit] increase building window lights for a vibrant look
+ A steaming bowl of tomato soup [edit] garnish with a swirl of cream on top
+ A classic motorcycle parked by roadside [edit] add raindrops for a freshly rained-on effect
+ A painter mixing colors on a palette [edit] replace one primary color with metallic gold paint
+ A terrier wearing a raincoat [edit] change the raincoat pattern to polka dots
+ A cozy fireplace with burning logs [edit] intensify the flames for a warmer glow
+ A cappuccino with latte art heart [edit] change the latte art to a leaf design
+ A chessboard mid-game [edit] remove the black queen from the board
+ A vintage camera resting on a map [edit] add a passport beside the camera
+ A plate of macarons in pastel colors [edit] swap pink macarons to lavender hue
+ A child flying a paper airplane [edit] turn the background into a starry night sky
+ A bakery display of croissants [edit] add powdered sugar dusting on croissants
+ A hummingbird hovering near a flower [edit] freeze the wings for sharp detail
+ A busy farmer’s market stall [edit] remove price tags from all produce
+ A kayak on a tranquil river [edit] change water color to emerald green
+ A stack of vinyl records beside a turntable [edit] add a glowing neon sign saying “Now Playing”
+ A spiral staircase in an old tower [edit] brighten ambient light for clearer details
+ A latte served in a glass cup [edit] make the froth height taller by 20%
+ A soccer ball on a grassy field [edit] add morning dew drops on grass blades
+ A bookshelf with color-coded novels [edit] shuffle books into random order
+ A geisha holding a parasol [edit] change parasol pattern to cherry blossoms
+ A mountain biker on forest trail [edit] add motion blur to background for action feel
+ A garden gnome standing among tulips [edit] paint the gnome’s hat bright yellow
+ A street food cart serving tacos [edit] replace tacos with sushi rolls
+ A ceramic mug with chipped rim [edit] restore the rim to perfect condition
+ A violin bow resting beside sheet music [edit] add handwritten annotations on the sheet
+ A classic lamp post beside a foggy road [edit] turn fog into light snowfall
+ A bowl of green apples [edit] change two apples to bright red
+ A skyline reflected in calm river [edit] add gentle ripples to distort reflection
+ A barista pouring latte art [edit] slow shutter effect to create milk swirl trails
+ A polaroid photo pinned to corkboard [edit] fade colors for a vintage look
+ A snowboarder on a snowy slope [edit] increase snow spray for dynamic action
+ A rustic barn in golden wheat field [edit] turn wheat color to early spring green
+ An open sketchbook with pencil drawing [edit] add watercolor wash to the sketch
+ A chef sharpening a kitchen knife [edit] replace knife with a wooden spoon
+ A sunflower facing the sun [edit] rotate flower to face viewer instead
+ A crystal perfume bottle on silken fabric [edit] scatter light rainbow refractions around bottle
+ A laptop on a café table [edit] change screen content to code editor view
+ A violin case lying open on stage floor [edit] add velvet lining texture detail
+ A swimmer diving into a pool [edit] add underwater bubbles trailing the swimmer
+ A campfire under starry sky [edit] increase star density for a Milky Way effect
+ A gardener watering roses [edit] change watering can to a copper finish
+ A bowl of ramen with chopsticks resting [edit] replace chopsticks with a wooden spoon
+ A gothic archway in ancient ruins [edit] add creeping ivy on stone walls
+ A bicycle leaning against tree [edit] convert scene to black-and-white photograph
+ A plate of grilled salmon [edit] add lemon wedges on side
+ A pianist’s hands on grand piano keys [edit] add sheet music on stand illuminated softly
+ A street dog lying on warm pavement [edit] change pavement to cobblestone texture
+ A cratered lunar landscape [edit] add Earth rise on horizon
+ A pearl necklace on velvet pillow [edit] increase pearl luster for extra shine
+ A mason jar filled with fireflies [edit] add soft glowing light escaping lid gaps
+ A shuttle bus arriving at snowy station [edit] remove snow for a springtime setting
+ A woman practicing yoga on cliff edge [edit] shift sunrise colors to sunset palette
+ A stack of colorful gift boxes [edit] replace top box ribbon with a bow
+ A sailboat on calm turquoise sea [edit] add a distant island silhouette
+ A stack of chocolate chip cookies [edit] add melting chocolate drizzle
+ A vintage radio on windowsill [edit] change radio color to mint green
+ A sculpture of abstract metal shapes [edit] convert metal texture to brushed gold
+ A firefighter spraying water on flames [edit] increase water spray for dramatic arc
+ A desert highway stretching to horizon [edit] add tumbleweed crossing the road
+ A fluffy kitten playing with yarn ball [edit] replace yarn color with multicolor rainbow threads
+ An ancient stone bridge over river [edit] add lanterns hanging under arches
+ A scientist examining test tubes [edit] change liquid color to bright purple
+ A row of wind turbines on rolling hills [edit] turn scene into twilight with orange sky
+ A scuba diver exploring coral reef [edit] add a school of colorful fish surrounding diver
+ A cafe latte on a wooden tray [edit] stencil a star pattern in cocoa on froth
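Each line in the editing caption files pairs a source description with an edit instruction, separated by the literal token `[edit]`. A small illustrative sketch of splitting such a line for downstream use (the function name is just an example, not part of the repo):

# Sketch: split "<source caption> [edit] <instruction>" into its two parts.
def parse_edit_prompt(line: str) -> tuple[str, str]:
    source, _, instruction = line.partition("[edit]")
    return source.strip(), instruction.strip()

# parse_edit_prompt("A red vintage bicycle leaning against a brick wall [edit] change the bicycle’s color to teal")
# -> ("A red vintage bicycle leaning against a brick wall", "change the bicycle’s color to teal")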
configs/captions/editing2.txt ADDED
@@ -0,0 +1,158 @@
1
+ A jade teapot on a bamboo mat [edit] change the teapot to polished copper
2
+ A marble bust lit by soft window light [edit] rotate the bust 30° to the left
3
+ A commuter train entering a tunnel [edit] replace the train with a freight locomotive
4
+ A medieval stone bridge over misty river [edit] add flower boxes along the railing
5
+ A neon food truck at midnight market [edit] switch neon sign color to violet
6
+ A bonsai pine on a wooden shelf [edit] enlarge the bonsai roots onto a rock slab
7
+ A cyclist on a snow-dusted trail [edit] remove all snow for summer scenery
8
+ A row of lab flasks with blue liquid [edit] change liquid to gradient rainbow
9
+ A grand piano on a concert hall stage [edit] lower the key cover halfway
10
+ A lighthouse beam cutting through fog [edit] intensify light to double brightness
11
+ An astronaut planting a flag on Mars [edit] swap Mars sand for lunar regolith
12
+ A vintage typewriter beside ink bottle [edit] add dried lavender sprig on keys
13
+ A plate of macarons in pastel hues [edit] convert entire scene to monochrome
14
+ A sunflower field at golden hour [edit] turn one sunflower to face the viewer
15
+ A violin resting on velvet cloth [edit] change violin varnish from brown to deep red
16
+ A mountain climber reaching the summit [edit] add celebratory confetti burst
17
+ A glass of iced lemonade on patio table [edit] remove ice cubes completely
18
+ A ceramic mug with hand-painted flowers [edit] replace floral pattern with stripes
19
+ A koi pond with rippling water [edit] freeze the water surface perfectly still
20
+ A bookshelf of worn leather tomes [edit] insert a glowing holographic book
21
+ A city skyline reflected on wet pavement [edit] change reflection to fragmented shards
22
+ A chef garnishing sushi rolls [edit] replace sushi with colorful mochi
23
+ A freight ship in calm harbor [edit] add gentle morning fog around hull
24
+ A ballerina mid-pirouette on empty stage [edit] add motion trail behind tutu
25
+ A rustic barn in golden wheat field [edit] turn wheat into lavender plants
26
+ A steaming bowl of pho on wooden table [edit] add fresh chili slices floating
27
+ A geodesic dome in arctic landscape [edit] shift lighting to sunset pinks
28
+ An old pocket watch on velvet [edit] set time from 4:00 to 10:10
29
+ An oak tree with autumn leaves [edit] transition leaves to spring green
30
+ A skateboarder grinding rail [edit] add spark particles at contact point
31
+ A crystal chandelier in foyer [edit] dim ambient light for stronger contrast
32
+ A swan gliding on mirror lake [edit] create concentric ripples behind swan
33
+ A row of wind turbines on ridge [edit] turn blades into transparent glass
34
+ A latte with leaf-shaped foam art [edit] change art to a rosette pattern
35
+ A gourmet burger on slate board [edit] remove cheese slice entirely
36
+ A paper airplane against blue sky [edit] add sketched contrail behind plane
37
+ A flamenco guitarist on street corner [edit] replace guitar with a violin
38
+ A snow-covered pine forest [edit] add footprints leading into distance
39
+ A stack of vinyl records [edit] change record labels to bright cyan
40
+ An artist mixing oil paints [edit] replace palette knife with fine brush
41
+ A classic red telephone booth [edit] repaint booth matte black
42
+ A glass skyscraper at sunrise [edit] add low-lying rain clouds wrapping base
43
+ A puppy chasing butterflies [edit] swap butterflies for floating bubbles
44
+ A clay pot on pottery wheel [edit] apply crackle glaze texture
45
+ A bridge enveloped in autumn fog [edit] lift fog to reveal river below
46
+ A salad bowl of mixed greens [edit] add sliced strawberries on top
47
+ A desert road with heat haze [edit] insert distant thunderstorm on horizon
48
+ A chess set mid-game [edit] replace black queen with 3-D printed model
49
+ A coral reef teeming with fish [edit] shift palette to infrared false-color
50
+ A fountain in public square [edit] stop water flow to frozen sculptural state
51
+ A maple leaf on wet asphalt [edit] intensify reflections around leaf
52
+ A violinist practicing scales [edit] add metronome on nearby stand
53
+ A bowl of ramen with narutomaki [edit] remove narutomaki slices
54
+ A modern smartwatch on wrist [edit] change wristband to woven fabric
55
+ A candlelit dinner table [edit] extinguish leftmost candle only
56
+ A forest path carpeted with moss [edit] scatter pink cherry petals along trail
57
+ A drone view over rice terraces [edit] turn season to early harvest yellow
58
+ A plate of assorted cheeses [edit] replace blue cheese with goat cheese
59
+ A fox sitting in snowy field [edit] add softly falling snowflakes
60
+ A graffiti mural on brick wall [edit] desaturate bricks, keep mural color
61
+ A leather-bound journal and quill [edit] age journal cover with scratches
62
+ A small cabin under starry sky [edit] add faint aurora on horizon
63
+ A cup of espresso with crema [edit] lighten crema for flat white look
64
+ A bouquet of roses in vase [edit] change roses from red to white
65
+ A lighthouse on rocky coastline [edit] add crashing wave spray
66
+ A billiard table break shot [edit] freeze cue ball before impact
67
+ A hot-air balloon over vineyards [edit] change balloon envelope to stripes
68
+ A snow leopard on rocky ledge [edit] brighten ambient light for detail
69
+ A sushi chef slicing tuna [edit] replace tuna with salmon
70
+ A violin bow resting on strings [edit] slightly raise bow above strings
71
+ A stack of pancakes with butter [edit] drizzle maple syrup generously
72
+ A waterfall in tropical jungle [edit] adjust shutter to silky water effect
73
+ A painter’s easel by window [edit] add soft morning light beam
74
+ A cherry pie cooling on sill [edit] remove lattice crust for open top
75
+ A monorail passing futuristic city [edit] shift city lighting to neon pink-blue
76
+ A rustic windmill beside field [edit] stop windmill blades mid-motion
77
+ A latte art heart [edit] invert colors for photographic negative
78
+ A scuba diver photographing coral [edit] switch diver’s fins to bright yellow
79
+ A snowy owl in flight [edit] extend wingspan slightly wider
80
+ A violin case with sheet music [edit] add handwritten annotations
81
+ A garden gnome among tulips [edit] repaint gnome hat to polka dots
82
+ A stone archway leading to courtyard [edit] add creeping ivy on stone
83
+ A bowl of blueberries [edit] remove two berries for asymmetry
84
+ A photographer in desert dunes [edit] add blowing sand trail behind
85
+ A city street with puddles [edit] enhance neon reflections in puddles
86
+ A close-up of eye with makeup [edit] change iris color to amber
87
+ A vintage car parked roadside [edit] convert entire scene to sepia
88
+ A farmer holding basket of apples [edit] change apples to peaches
89
+ A plate of spaghetti carbonara [edit] sprinkle extra parmesan on top
90
+ A kayaker on serene lake [edit] add low morning mist on water
91
+ A bee on sunflower [edit] isolate bee in spotlight, darken background
92
+ A commuter bike leaning on wall [edit] switch bike frame to bamboo
93
+ A violin soloist on stage [edit] dim background to silhouette orchestra
94
+ A chess clock beside board [edit] set clock to 00:05 time left
95
+ A coffee grinder with beans [edit] reduce bean level by half
96
+ A cliffside pagoda at dawn [edit] add low-lying clouds below pagoda
97
+ A pitcher of lemonade [edit] replace ice cubes with frozen lemon slices
98
+ A forest waterfall in winter [edit] transform to autumn foliage
99
+ A crystal ball on pedestal [edit] show swirling galaxy inside
100
+ A plate of dim sum baskets [edit] stack one extra basket on left
101
+ A gondola on Venetian canal [edit] add lanterns hanging above canal
102
+ A foxglove plant in bloom [edit] desaturate background for subject pop
103
+ A pianist’s hands mid-chord [edit] blur hands for motion effect
104
+ A stack of handmade soaps [edit] emboss top soap with floral logo
105
+ A campfire with marshmallows [edit] increase flame height 30%
106
+ A glass of red wine swirling [edit] switch wine color to rosé
107
+ A street dog drinking from a puddle [edit] remove puddle, replace with food bowl
108
+ A marble statue under skylight [edit] add tiny crack at base
109
+ A subway entrance with stairs [edit] turn scene into rainy evening
110
+ A painter holding color wheel [edit] rotate wheel 90 degrees clockwise
111
+ A plate of tacos with salsa [edit] change salsa from red to green
112
+ A violin decked in flowers [edit] remove all flowers except one rose
113
+ A desert highway at dusk [edit] add old neon motel sign
114
+ A kayak on river rapids [edit] calm water to slow flow
115
+ A snow globe with cabin [edit] add swirling glitter instead of snow
116
+ A chef flipping wok vegetables [edit] increase flame size dramatically
117
+ A castle on cliff at sunset [edit] alter sky to twilight stars
118
+ A latte with leaf art [edit] add cinnamon dusting on froth
119
+ A bonsai tree in tray [edit] add miniature stone lantern near trunk
120
+ A neon-lit alley with puddles [edit] reduce neon signs by half
121
+ A violin bridge macro [edit] add faint dust for realism
122
+ A picnic table with sandwiches [edit] remove one sandwich for negative space
123
+ A city skyline sunrise [edit] shift color palette to dusk purples
124
+ A bowl of ramen [edit] replace noodles with udon
125
+ A journal on wooden desk [edit] add fountain pen diagonal across page
126
+ A barista steaming milk [edit] increase visible steam
127
+ A surfer catching wave [edit] add sun flare in corner
128
+ A vintage film camera [edit] open lens cap slightly
129
+ A plate of donuts [edit] coat top donut with sprinkles
130
+ A horse grazing meadow [edit] extend mane length slightly
131
+ A lighthouse silhouette [edit] brighten beacon light halo
132
+ A violin on chair [edit] place bow across strings
133
+ A chef torching crème brûlée [edit] darken caramelization spots
134
+ A koi pond reflection [edit] add falling cherry petals
135
+ A mountain trail hiker [edit] insert trail signpost
136
+ A glass orb on sand [edit] show inverted reflection sharper
137
+ An espresso shot [edit] double crema thickness
138
+ A windmill at dusk [edit] turn blades into slow blur motion
139
+ A chessboard aerial view [edit] highlight white king with glow
140
+ A bakery croissant stack [edit] dust powdered sugar on top two croissants
141
+ A sunset beach scene [edit] add silhouetted palm in foreground
142
+ A violin scroll close-up [edit] tint varnish warmer amber
143
+ A dripping paintbrush [edit] change paint color to teal
144
+ A bowl of ramen eggs [edit] cut egg halves for yolk view
145
+ A street cafe table [edit] remove one chair for asymmetry
146
+ A forest creek [edit] add stepping stones across water
147
+ A cityscape night sky [edit] add comet trail overhead
148
+ A cup of cocoa with whipped cream [edit] sprinkle cocoa powder on top
149
+ A classical guitar against wall [edit] desaturate background only
150
+ A crystal decanter [edit] fill decanter halfway with amber liquid
151
+ A parchment map [edit] burn map edges slightly
152
+ A snowflake macro [edit] invert to dark field illumination
153
+ A violin tailpiece macro [edit] add subtle wood grain texture
154
+ A racing bike [edit] swap wheels to deep-rim carbon
155
+ A farmer in wheat field [edit] change wheat to barley heads
156
+ A hummingbird feeder [edit] add second bird approaching
157
+ A street market fruit stand [edit] rearrange apples to pyramid
158
+ A calligraphy brush stroke [edit] thicken stroke width 15%
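Note: the entries above follow a "base caption [edit] instruction" convention, one prompt per line. A minimal parsing sketch (illustration only, not part of this commit; the helper names and example path are hypothetical) for splitting the base caption from its edit instruction:

# Minimal sketch, not part of the repository; parse_edit_prompt and the
# example file path below are hypothetical.
from pathlib import Path
from typing import List, Optional, Tuple


def parse_edit_prompt(line: str) -> Tuple[str, Optional[str]]:
    """Split '<base caption> [edit] <instruction>' into its two parts."""
    if "[edit]" in line:
        caption, instruction = line.split("[edit]", 1)
        return caption.strip(), instruction.strip()
    return line.strip(), None


def load_edit_prompts(path: str) -> List[Tuple[str, Optional[str]]]:
    """Read a caption file, one prompt per non-empty line."""
    lines = Path(path).read_text(encoding="utf-8").splitlines()
    return [parse_edit_prompt(line) for line in lines if line.strip()]


if __name__ == "__main__":
    # Hypothetical filename; substitute the caption file you want to inspect.
    for caption, edit in load_edit_prompts("configs/captions/example_edit.txt"):
        print(f"{caption!r} -> edit: {edit!r}")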
configs/captions/example_human.txt ADDED
@@ -0,0 +1,100 @@
1
+ Concept‑art illustration of a wind‑turbine technician perched atop a wind turbine at sunset, elongated frame guiding eye upward, glittering dust in sunbeams, ultrawide cinematic lens
2
+ Dynamic action shot of a percussionist on a cliffside lighthouse balcony, vertical layout with leading lines, volumetric mist, Baroque oil texture
3
+ Dynamic action shot of a sound foley artist by a bioluminescent tide pool, vertical layout with leading lines, golden‑hour backlight, sepia film grain
4
+ Documentary snapshot of a watchmaker by a bioluminescent tide pool, top‑to‑bottom visual flow, dramatic cloud backdrop, Baroque oil texture
5
+ Hyperreal CGI render of a comic colorist inside a bustling food market, towering vertical composition, glittering dust in sunbeams, isometric voxel aesthetic
6
+ Graphic novel panel showing a northern‑lights tour guide amid glowing jellyfish tanks, portrait orientation emphasizing height, tilt‑shift miniaturization effect, lo‑fi pixel art
7
+ Soft pastel painting of a watchmaker amid glowing jellyfish tanks, portrait orientation emphasizing height, glittering dust in sunbeams, infrared false color
8
+ Concept‑art illustration of a forest firefighter on a storm‑battered sea wall, towering vertical composition, falling snowflakes, isometric voxel aesthetic
9
+ Retro film photograph of a VR game designer at a festival of paper lanterns, towering vertical composition, rim lighting on subject, photoreal 8K detail
10
+ Candid photo of a train signal operator on a cliffside lighthouse balcony, elongated frame guiding eye upward, soft foreground bokeh, ultrawide cinematic lens
11
+ Concept‑art illustration of a data journalist perched atop a wind turbine at sunset, vertical layout with leading lines, golden‑hour backlight, lo‑fi pixel art
12
+ Hyperreal CGI render of a calligraphy artist amid autumn maple leaves, vertical layout with leading lines, golden‑hour backlight, infrared false color
13
+ Dynamic action shot of a roller‑coaster mechanic inside a library tower filled with skylight, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
14
+ High‑fashion editorial of a bike messenger on a cliffside lighthouse balcony, vertical layout with leading lines, dramatic cloud backdrop, pastel watercolor wash
15
+ Documentary snapshot of a train signal operator inside a shipyard dry dock, vertical layout with leading lines, wet pavement reflections, photoreal 8K detail
16
+ Documentary snapshot of an ice sculptor on a floating river dock at dawn, portrait orientation emphasizing height, rim lighting on subject, vivid gouache strokes
17
+ Graphic novel panel showing a paramedic inside a bustling food market, towering vertical composition, glittering dust in sunbeams, lo‑fi pixel art
18
+ High‑fashion editorial of a robotics engineer on a rain‑slick neon street, towering vertical composition, glittering dust in sunbeams, photoreal 8K detail
19
+ Cinematic still of a heritage conservator inside a glass‑roofed greenhouse, top‑to‑bottom visual flow, tilt‑shift miniaturization effect, vivid gouache strokes
20
+ Dynamic action shot of a subway conductor on an urban rooftop garden, top‑to‑bottom visual flow, dramatic cloud backdrop, ultrawide cinematic lens
21
+ Dynamic action shot of a tea ceremony master on a wooden pier in fog, elongated frame guiding eye upward, wet pavement reflections, pastel watercolor wash
22
+ Concept‑art illustration of a bonsai cultivator on a rain‑slick neon street, top‑to‑bottom visual flow, glittering dust in sunbeams, ultrawide cinematic lens
23
+ Dynamic action shot of a robotics engineer on a storm‑battered sea wall, vertical layout with leading lines, soft foreground bokeh, vivid gouache strokes
24
+ Retro film photograph of a bee swarm researcher in a misty bamboo grove, towering vertical composition, falling snowflakes, Baroque oil texture
25
+ Cinematic still of a robotics engineer beside a mirror‑still alpine lake, portrait orientation emphasizing height, volumetric mist, ultrawide cinematic lens
26
+ Concept‑art illustration of a calligraphy artist at a coastal cliff wind farm, towering vertical composition, tilt‑shift miniaturization effect, pastel watercolor wash
27
+ Documentary snapshot of a deep‑sea diver on a floating river dock at dawn, top‑to‑bottom visual flow, falling snowflakes, vivid gouache strokes
28
+ Documentary snapshot of an urban beekeeper inside a subterranean crystal cavern, elongated frame guiding eye upward, dramatic cloud backdrop, ultrawide cinematic lens
29
+ Cinematic still of a shipwright inside a shipyard dry dock, portrait orientation emphasizing height, wet pavement reflections, Baroque oil texture
30
+ Retro film photograph of a bonsai cultivator beneath aurora‑lit sky, vertical layout with leading lines, golden‑hour backlight, sepia film grain
31
+ Cinematic still of a watchmaker at a snow‑covered mountain pass, vertical layout with leading lines, dramatic cloud backdrop, Baroque oil texture
32
+ Dynamic action shot of an astronomer by a bioluminescent tide pool, top‑to‑bottom visual flow, falling snowflakes, sepia film grain
33
+ Dynamic action shot of an urban beekeeper inside a bustling food market, portrait orientation emphasizing height, falling snowflakes, isometric voxel aesthetic
34
+ Concept‑art illustration of a shipwright inside a vintage train carriage, portrait orientation emphasizing height, soft foreground bokeh, pastel watercolor wash
35
+ Dynamic action shot of a subway conductor inside a library tower filled with skylight, vertical layout with leading lines, volumetric mist, pastel watercolor wash
36
+ High‑fashion editorial of a mountain guide at a festival of paper lanterns, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
37
+ Dynamic action shot of a surfboard shaper inside a glass‑roofed greenhouse, vertical layout with leading lines, soft foreground bokeh, Baroque oil texture
38
+ Documentary snapshot of a percussionist beside a roaring waterfall, vertical layout with leading lines, rim lighting on subject, vivid gouache strokes
39
+ Candid photo of a train signal operator at a snow‑covered mountain pass, top‑to‑bottom visual flow, rim lighting on subject, sepia film grain
40
+ Concept‑art illustration of a VR game designer inside a vintage train carriage, vertical layout with leading lines, soft foreground bokeh, Baroque oil texture
41
+ Retro film photograph of an ice‑hotel architect inside a shipyard dry dock, elongated frame guiding eye upward, tilt‑shift miniaturization effect, lo‑fi pixel art
42
+ Concept‑art illustration of an urban beekeeper on a cliffside lighthouse balcony, top‑to‑bottom visual flow, falling snowflakes, infrared false color
43
+ Dynamic action shot of a bridge painter amid autumn maple leaves, elongated frame guiding eye upward, falling snowflakes, isometric voxel aesthetic
44
+ Concept‑art illustration of a calligraphy artist inside a glass‑roofed greenhouse, vertical layout with leading lines, soft foreground bokeh, photoreal 8K detail
45
+ Soft pastel painting of a street muralist inside a library tower filled with skylight, top‑to‑bottom visual flow, glittering dust in sunbeams, photoreal 8K detail
46
+ Hyperreal CGI render of a paramedic on a cliffside lighthouse balcony, vertical layout with leading lines, long‑exposure light trails, neon noir palette
47
+ Soft pastel painting of a wildlife rehabilitator in a misty bamboo grove, towering vertical composition, glittering dust in sunbeams, Baroque oil texture
48
+ Retro film photograph of a lighthouse keeper amid autumn maple leaves, portrait orientation emphasizing height, glittering dust in sunbeams, vivid gouache strokes
49
+ Dynamic action shot of a wildlife rehabilitator inside a watchmaker’s workshop, portrait orientation emphasizing height, rim lighting on subject, pastel watercolor wash
50
+ Cinematic still of a forensic analyst inside a vintage train carriage, towering vertical composition, long‑exposure light trails, lo‑fi pixel art
51
+ Hyperreal CGI render of a watchmaker on a rain‑drenched ferry deck, vertical layout with leading lines, glittering dust in sunbeams, lo‑fi pixel art
52
+ Concept‑art illustration of an urban farmer inside a bustling food market, vertical layout with leading lines, soft foreground bokeh, pastel watercolor wash
53
+ Soft pastel painting of a percussionist beside a roaring waterfall, towering vertical composition, falling snowflakes, isometric voxel aesthetic
54
+ Soft pastel painting of a robotics engineer by a bioluminescent tide pool, top‑to‑bottom visual flow, tilt‑shift miniaturization effect, photoreal 8K detail
55
+ Candid photo of a solar‑sail pilot on a rain‑slick neon street, elongated frame guiding eye upward, falling snowflakes, neon noir palette
56
+ Soft pastel painting of a watchmaker inside a watchmaker’s workshop, top‑to‑bottom visual flow, dramatic cloud backdrop, isometric voxel aesthetic
57
+ Retro film photograph of a data journalist by a bioluminescent tide pool, portrait orientation emphasizing height, soft foreground bokeh, sepia film grain
58
+ Candid photo of a forensic analyst perched atop a wind turbine at sunset, portrait orientation emphasizing height, volumetric mist, isometric voxel aesthetic
59
+ Documentary snapshot of an urban farmer inside a subterranean crystal cavern, towering vertical composition, soft foreground bokeh, pastel watercolor wash
60
+ High‑fashion editorial of a drone cinematographer inside a glass‑roofed greenhouse, towering vertical composition, soft foreground bokeh, pastel watercolor wash
61
+ High‑fashion editorial of a dune ecologist at a lunar research outpost, portrait orientation emphasizing height, falling snowflakes, photoreal 8K detail
62
+ Candid photo of a comic colorist by a bioluminescent tide pool, elongated frame guiding eye upward, falling snowflakes, photoreal 8K detail
63
+ Candid photo of a dune ecologist beneath aurora‑lit sky, top‑to‑bottom visual flow, volumetric mist, ultrawide cinematic lens
64
+ Dynamic action shot of a ballet dancer in a misty bamboo grove, portrait orientation emphasizing height, soft foreground bokeh, lo‑fi pixel art
65
+ Documentary snapshot of an urban farmer under blooming cherry trees, top‑to‑bottom visual flow, rim lighting on subject, vivid gouache strokes
66
+ Documentary snapshot of a storm chaser amid glowing jellyfish tanks, top‑to‑bottom visual flow, soft foreground bokeh, ultrawide cinematic lens
67
+ Documentary snapshot of a wind‑turbine technician at a lunar research outpost, towering vertical composition, long‑exposure light trails, sepia film grain
68
+ Hyperreal CGI render of a robotics engineer inside an abandoned observatory, towering vertical composition, wet pavement reflections, neon noir palette
69
+ Graphic novel panel showing a dune ecologist beneath aurora‑lit sky, towering vertical composition, rim lighting on subject, pastel watercolor wash
70
+ Soft pastel painting of a bridge painter inside a watchmaker’s workshop, elongated frame guiding eye upward, rim lighting on subject, photoreal 8K detail
71
+ Documentary snapshot of a drone cinematographer perched atop a wind turbine at sunset, towering vertical composition, soft foreground bokeh, Baroque oil texture
72
+ Dynamic action shot of a roller‑coaster mechanic inside a watchmaker’s workshop, elongated frame guiding eye upward, long‑exposure light trails, photoreal 8K detail
73
+ Dynamic action shot of a shipwright on a wooden pier in fog, top‑to‑bottom visual flow, long‑exposure light trails, pastel watercolor wash
74
+ Concept‑art illustration of a ballet dancer inside a watchmaker’s workshop, elongated frame guiding eye upward, dramatic cloud backdrop, Baroque oil texture
75
+ Dynamic action shot of a paramedic on a storm‑battered sea wall, top‑to‑bottom visual flow, rim lighting on subject, neon noir palette
76
+ Graphic novel panel showing a deep‑sea diver inside an abandoned observatory, elongated frame guiding eye upward, dramatic cloud backdrop, photoreal 8K detail
77
+ Concept‑art illustration of a forest firefighter perched atop a wind turbine at sunset, top‑to‑bottom visual flow, wet pavement reflections, Baroque oil texture
78
+ Graphic novel panel showing a VR game designer beside a roaring waterfall, portrait orientation emphasizing height, rim lighting on subject, ultrawide cinematic lens
79
+ Hyperreal CGI render of a bike messenger in a misty bamboo grove, elongated frame guiding eye upward, tilt‑shift miniaturization effect, neon noir palette
80
+ Dynamic action shot of a food‑truck chef on a rain‑slick neon street, elongated frame guiding eye upward, dramatic cloud backdrop, neon noir palette
81
+ Cinematic still of a deep‑sea diver beside a roaring waterfall, towering vertical composition, tilt‑shift miniaturization effect, ultrawide cinematic lens
82
+ Retro film photograph of a ice sculptor beneath aurora‑lit sky, top‑to‑bottom visual flow, soft foreground bokeh, photoreal 8K detail
83
+ Graphic novel panel showing a parkour athlete at a festival of paper lanterns, portrait orientation emphasizing height, volumetric mist, Baroque oil texture
84
+ Graphic novel panel showing a street muralist inside a glass‑roofed greenhouse, portrait orientation emphasizing height, soft foreground bokeh, neon noir palette
85
+ Hyperreal CGI render of a wildlife rehabilitator amid glowing jellyfish tanks, towering vertical composition, dramatic cloud backdrop, Baroque oil texture
86
+ Graphic novel panel showing a bike messenger at a lunar research outpost, top‑to‑bottom visual flow, dramatic cloud backdrop, sepia film grain
87
+ Candid photo of a kite maker within a desert dust storm, vertical layout with leading lines, long‑exposure light trails, vivid gouache strokes
88
+ Candid photo of an astronomer on a wooden pier in fog, towering vertical composition, falling snowflakes, ultrawide cinematic lens
89
+ Documentary snapshot of a comic colorist at a lunar research outpost, top‑to‑bottom visual flow, soft foreground bokeh, isometric voxel aesthetic
90
+ High‑fashion editorial of a portrait photographer in a misty bamboo grove, vertical layout with leading lines, golden‑hour backlight, lo‑fi pixel art
91
+ Dynamic action shot of an urban farmer in a misty bamboo grove, elongated frame guiding eye upward, glittering dust in sunbeams, infrared false color
92
+ Concept‑art illustration of a drone cinematographer beneath aurora‑lit sky, elongated frame guiding eye upward, golden‑hour backlight, ultrawide cinematic lens
93
+ Retro film photograph of a kite maker under blooming cherry trees, portrait orientation emphasizing height, tilt‑shift miniaturization effect, vivid gouache strokes
94
+ Soft pastel painting of a wildlife rehabilitator inside a watchmaker’s workshop, towering vertical composition, wet pavement reflections, sepia film grain
95
+ Hyperreal CGI render of an urban farmer inside an abandoned observatory, elongated frame guiding eye upward, long‑exposure light trails, lo‑fi pixel art
96
+ Dynamic action shot of a northern‑lights tour guide within a desert dust storm, elongated frame guiding eye upward, tilt‑shift miniaturization effect, Baroque oil texture
97
+ High‑fashion editorial of a northern‑lights tour guide inside an abandoned observatory, elongated frame guiding eye upward, falling snowflakes, lo‑fi pixel art
98
+ Retro film photograph of a subway conductor in a misty bamboo grove, towering vertical composition, volumetric mist, photoreal 8K detail
99
+ Retro film photograph of a storm chaser inside a watchmaker’s workshop, towering vertical composition, long‑exposure light trails, photoreal 8K detail
100
+ Hyperreal CGI render of an urban farmer on a wooden pier in fog, top‑to‑bottom visual flow, golden‑hour backlight, vivid gouache strokes
configs/captions/example_prompts2.txt ADDED
@@ -0,0 +1,100 @@
1
+ A celestial lantern village flickering between dimensions, anamorphic lens flare
2
+ A fractal sonar reef resonating with harmonic waves, sepia ink etching
3
+ An amber compass balanced atop an obsidian chalice beneath aurora-lit horizon, infrared false‑color
4
+ 'Vortex' displayed as a solar‑panel mosaic framed by erupting geysers, hyperdetailed microphotography
5
+ An amber compass encased within a meteorite slab atop cloud-piercing tower, minimalist negative space
6
+ A kinetic railway bridge aligned with lunar phases, chalk pastel texture
7
+ A cerulean origami crane hovering beside a quartz pedestal in moonlit desert ruin, dreamlike soft focus
8
+ A levitating clock tower flooded by molten stardust, brutalist geometry emphasis
9
+ 'Vortex' displayed as a floating floral typography framed by erupting geysers, lo‑fi film grain
10
+ A labyrinthine mirror lake composed of levitating stones, anamorphic lens flare
11
+ A retro‑futuristic tea pavilion guarded by luminous koi, isometric voxel render
12
+ 'Halcyon' displayed as an interactive LED canopy projected onto ancient ziggurat, Art‑Nouveau outlines
13
+ A titanium feather spiraling around a mahogany music box beneath aurora-lit horizon, neon noir palette
14
+ An infrared railway bridge composed of levitating stones, vector flat design
15
+ A volcanic nocturnal carnival etched with forgotten runes, ultrawide panoramic frame
16
+ 'Catalyst' displayed as a crystal fiber optic sign reflected on tranquil rice paddies, dreamlike soft focus
17
+ 'Radiance' displayed as a drone‑light hologram projected onto ancient ziggurat, tilt‑shift miniature look
18
+ A fractal sky monastery scattered across hovering islands, photoreal volumetric lighting
19
+ A vaporous coral metropolis embedded in frozen time, vivid watercolor bloom
20
+ A submerged sonar reef flooded by molten stardust, brutalist geometry emphasis
21
+ A surreal voltage garden singing in ultrasonic tones, hyperdetailed microphotography
22
+ A vaporous coral metropolis drifting through twilight mist, tilt‑shift miniature look
23
+ An emerald hourglass encased within an ancient parchment scroll in moonlit desert ruin, anamorphic lens flare
24
+ A frozen voltage garden guarded by luminous koi, anamorphic lens flare
25
+ 'Radiance' displayed as a geothermal steam stencil above polar ice floes at dusk, brutalist geometry emphasis
26
+ An infrared subterranean amphitheater framed by aurora curtains, chalk pastel texture
27
+ A labyrinthine railway bridge woven from crystalline filaments, chalk pastel texture
28
+ A porcelain mask spiraling around a meteorite slab beneath aurora-lit horizon, CGI ray‑traced caustics
29
+ 'Obsidian' displayed as a drone‑light hologram amid drifting balloon lanterns, tilt‑shift miniature look
30
+ A phosphorescent desert oasis anchored in swirling vortexes, hyperdetailed microphotography
31
+ An opalescent sky monastery resonating with harmonic waves, hyperdetailed microphotography
32
+ A volcanic bamboo labyrinth anchored in swirling vortexes, brutalist geometry emphasis
33
+ A volcanic lighthouse isle anchored in swirling vortexes, 8‑bit pixel aesthetic
34
+ An amber compass spiraling around a hollow crystal sphere beside luminescent tide pool, dreamlike soft focus
35
+ 'Cipher' displayed as a wave‑pattern sand relief hovering above midnight ocean, Baroque oil technique
36
+ An echoing coral metropolis flickering between dimensions, brutalist geometry emphasis
37
+ A mesmerizing mirror lake powered by clockwork tides, isometric voxel render
38
+ 'Euphoria' displayed as a solar‑panel mosaic hovering above midnight ocean, brutalist geometry emphasis
39
+ A translucent railway bridge carved into a meteor fragment, anamorphic lens flare
40
+ A tessellated nocturnal carnival guarded by luminous koi, hyperdetailed microphotography
41
+ A retro‑futuristic subterranean amphitheater carved into a meteor fragment, minimalist negative space
42
+ A verdant ice palace etched with forgotten runes, Baroque oil technique
43
+ A sonic glacier cave spiraling into infinite recursion, Baroque oil technique
44
+ A vaporous arboretum anchored in swirling vortexes, isometric voxel render
45
+ A polychromatic quantum workshop traversed by whispering drones, anamorphic lens flare
46
+ A submerged tidal library wrapped in fractal snowflakes, vivid watercolor bloom
47
+ A luminous rainbow waterfall grown from magnetic vines, Art‑Nouveau outlines
48
+ A holographic sonar reef flickering between dimensions, Art‑Nouveau outlines
49
+ A levitating tidal library traversed by whispering drones, sepia ink etching
50
+ A surreal lighthouse isle traversed by whispering drones, sepia ink etching
51
+ An ascending sandstone fortress embedded in frozen time, dreamlike soft focus
52
+ A levitating observatory flooded by molten stardust, sepia ink etching
53
+ A polychromatic storm laboratory guarded by luminous koi, tilt‑shift miniature look
54
+ A levitating arboretum etched with forgotten runes, infrared false‑color
55
+ 'Luminescence' displayed as an interactive LED canopy amid drifting balloon lanterns, sepia ink etching
56
+ An infrared lantern village orbiting a miniature sun, Art‑Nouveau outlines
57
+ An echoing nocturnal carnival grown from magnetic vines, Art‑Nouveau outlines
58
+ An origami tea pavilion framed by aurora curtains, chalk pastel texture
59
+ 'Vortex' displayed as a geothermal steam stencil beneath swirling cyclone clouds, chalk pastel texture
60
+ A mechanized observatory resonating with harmonic waves, dreamlike soft focus
61
+ A polychromatic nocturnal carnival framed by aurora curtains, ultrawide panoramic frame
62
+ A mesmerizing gravity well plaza framed by aurora curtains, chalk pastel texture
63
+ A cryptic glacier cave resonating with harmonic waves, hyperdetailed microphotography
64
+ 'Entropy' displayed as an interactive LED canopy suspended between coral spires, vivid watercolor bloom
65
+ A celestial astral caravan carved into a meteor fragment, anamorphic lens flare
66
+ An infrared nocturnal carnival submerged beneath glassy waves, sepia ink etching
67
+ A whimsical coral metropolis scattered across hovering islands, vivid watercolor bloom
68
+ A silver key spiraling around a quartz pedestal beside luminescent tide pool, vivid watercolor bloom
69
+ An emerald hourglass illuminating a woven silk tapestry in moonlit desert ruin, CGI ray‑traced caustics
70
+ A levitating ice palace carved into a meteor fragment, ultrawide panoramic frame
71
+ 'Cipher' displayed as a wave‑pattern sand relief hovering above midnight ocean, lo‑fi film grain
72
+ A tessellated asteroid mine suspended above rolling thunderheads, isometric voxel render
73
+ A zero‑gravity tidal library anchored in swirling vortexes, neon noir palette
74
+ A submerged data cathedral grown from magnetic vines, dreamlike soft focus
75
+ A retro‑futuristic coral metropolis illuminated by bioluminescent spores, vivid watercolor bloom
76
+ A mechanized tea pavilion composed of levitating stones, photoreal volumetric lighting
77
+ A levitating tea pavilion spiraling into infinite recursion, vector flat design
78
+ A polychromatic voltage garden grown from magnetic vines, infrared false‑color
79
+ A tessellated sandstone fortress composed of levitating stones, Baroque oil technique
80
+ 'Cascade' displayed as a molten metal casting suspended between coral spires, minimalist negative space
81
+ A vaporous nocturnal carnival embedded in frozen time, vector flat design
82
+ 'Luminescence' displayed as a molten metal casting above polar ice floes at dusk, sepia ink etching
83
+ An origami moss temple drifting through twilight mist, Baroque oil technique
84
+ 'Cascade' displayed as a wave‑pattern sand relief suspended between coral spires, 8‑bit pixel aesthetic
85
+ A phosphorescent gravity well plaza echoing with distant chimes, chalk pastel texture
86
+ A tessellated data cathedral guarded by luminous koi, vector flat design
87
+ A cryptic subterranean amphitheater aligned with lunar phases, vector flat design
88
+ A submerged mirror lake singing in ultrasonic tones, chalk pastel texture
89
+ A levitating arboretum embedded in frozen time, photoreal volumetric lighting
90
+ A polychromatic bamboo labyrinth drifting through twilight mist, tilt‑shift miniature look
91
+ 'Elysium' displayed as a crystal fiber optic sign beneath swirling cyclone clouds, ultrawide panoramic frame
92
+ 'Nebulous' displayed as a drone‑light hologram hovering above midnight ocean, brutalist geometry emphasis
93
+ A whimsical data cathedral framed by aurora curtains, photoreal volumetric lighting
94
+ A holographic bamboo labyrinth composed of levitating stones, neon noir palette
95
+ A cryptic tea pavilion singing in ultrasonic tones, minimalist negative space
96
+ A verdant glacier cave suspended above rolling thunderheads, CGI ray‑traced caustics
97
+ A levitating sonar reef submerged beneath glassy waves, brutalist geometry emphasis
98
+ 'Entropy' displayed as a wave‑pattern sand relief reflected on tranquil rice paddies, CGI ray‑traced caustics
99
+ 'Catalyst' displayed as a crystal fiber optic sign beneath swirling cyclone clouds, infrared false‑color
100
+ A silver key encased within a glacial mirror shard within silent subterranean hall, Art‑Nouveau outlines
configs/captions/example_prompts3.txt ADDED
@@ -0,0 +1,100 @@
1
+ A subway station covered in bioluminescent moss, watercolor illustration, with reflective wet pavement
2
+ A lone cherry blossom tree growing on a floating island, ultrawide cinematic frame, bathed in golden hour light
3
+ A paper sailboat navigating waves of handwritten letters, matte‑painting concept art, captured in long‑exposure motion blur
4
+ A desert caravan beneath a sky filled with iridescent balloons, watercolor illustration, in Art‑Nouveau line work
5
+ A lighthouse emitting rainbow beams into coastal fog, watercolor illustration, bathed in golden hour light
6
+ A futuristic monorail gliding above verdant rice terraces, neon synthwave palette, in Art‑Nouveau line work
7
+ A futuristic monorail gliding above verdant rice terraces, matte‑painting concept art, bathed in golden hour light
8
+ A spiraling staircase made of glowing origami cranes, photoreal 8K render, framed by towering archways
9
+ A cascade of cosmic paint pouring from an open window, watercolor illustration, under a crescent moon
10
+ A pair of vintage typewriters merging into a butterfly, photoreal 8K render, under a crescent moon
11
+ A lighthouse emitting rainbow beams into coastal fog, chalk pastel drawing, shot with tilt‑shift focus
12
+ A cascade of cosmic paint pouring from an open window, photoreal 8K render, captured in long‑exposure motion blur
13
+ A futuristic monorail gliding above verdant rice terraces, ultrawide cinematic frame, under a crescent moon
14
+ A pair of vintage typewriters merging into a butterfly, watercolor illustration, under a crescent moon
15
+ A spiraling staircase made of glowing origami cranes, photoreal 8K render, with volumetric fog
16
+ An owl composed entirely of clock gears and cogs, ultrawide cinematic frame, shot with tilt‑shift focus
17
+ A glass teapot swirling with blooming jasmine tea, hyperreal CGI composite, with reflective wet pavement
18
+ A transparent cube containing a miniature thunderstorm, chalk pastel drawing, captured in long‑exposure motion blur
19
+ A lone cherry blossom tree growing on a floating island, isometric pixel art, framed by towering archways
20
+ A violin carved from shimmering ice, ultrawide cinematic frame, with reflective wet pavement
21
+ A lighthouse emitting rainbow beams into coastal fog, watercolor illustration, captured in long‑exposure motion blur
22
+ A bonsai tree shaped like a twisting dragon, hyperreal CGI composite, bathed in golden hour light
23
+ A crystal castle perched on a cliff of amethyst, hyperreal CGI composite, under a crescent moon
24
+ A paper sailboat navigating waves of handwritten letters, neon synthwave palette, framed by towering archways
25
+ A cascade of cosmic paint pouring from an open window, hyperreal CGI composite, using dramatic rim lighting
26
+ A lone cherry blossom tree growing on a floating island, sepia ink sketch, shot with tilt‑shift focus
27
+ A starlit library built inside a hollowed redwood, isometric pixel art, with volumetric fog
28
+ A subway station covered in bioluminescent moss, low‑poly 3‑D art, using dramatic rim lighting
29
+ A subway station covered in bioluminescent moss, sepia ink sketch, captured in long‑exposure motion blur
30
+ An owl composed entirely of clock gears and cogs, photoreal 8K render, shot with tilt‑shift focus
31
+ A spiraling staircase made of glowing origami cranes, ultrawide cinematic frame, captured in long‑exposure motion blur
32
+ A subway station covered in bioluminescent moss, ultrawide cinematic frame, captured in long‑exposure motion blur
33
+ An owl composed entirely of clock gears and cogs, ultrawide cinematic frame, in Art‑Nouveau line work
34
+ A crystal castle perched on a cliff of amethyst, ultrawide cinematic frame, under a crescent moon
35
+ A starlit library built inside a hollowed redwood, hyperreal CGI composite, bathed in golden hour light
36
+ A cascade of cosmic paint pouring from an open window, hyperreal CGI composite, in Art‑Nouveau line work
37
+ An antique compass resting on weathered parchment, neon synthwave palette, under a crescent moon
38
+ A pair of vintage typewriters merging into a butterfly, neon synthwave palette, surrounded by soft bokeh
39
+ A violin carved from shimmering ice, hyperreal CGI composite, surrounded by soft bokeh
40
+ A transparent cube containing a miniature thunderstorm, ultrawide cinematic frame, bathed in golden hour light
41
+ A futuristic monorail gliding above verdant rice terraces, hyperreal CGI composite, in Art‑Nouveau line work
42
+ A pair of vintage typewriters merging into a butterfly, low‑poly 3‑D art, surrounded by soft bokeh
43
+ A desert caravan beneath a sky filled with iridescent balloons, isometric pixel art, using dramatic rim lighting
44
+ An owl composed entirely of clock gears and cogs, isometric pixel art, shot with tilt‑shift focus
45
+ An owl composed entirely of clock gears and cogs, matte‑painting concept art, under a crescent moon
46
+ A spiraling staircase made of glowing origami cranes, chalk pastel drawing, surrounded by soft bokeh
47
+ A glass teapot swirling with blooming jasmine tea, ultrawide cinematic frame, surrounded by soft bokeh
48
+ A desert caravan beneath a sky filled with iridescent balloons, matte‑painting concept art, shot with tilt‑shift focus
49
+ A pocket watch melting over the edge of a marble pedestal, ultrawide cinematic frame, under a crescent moon
50
+ A serene koi pond reflecting autumn maple leaves, isometric pixel art, framed by towering archways
51
+ A paper sailboat navigating waves of handwritten letters, isometric pixel art, framed by towering archways
52
+ A fox wearing a patchwork cloak of autumn leaves, isometric pixel art, using dramatic rim lighting
53
+ A glass teapot swirling with blooming jasmine tea, low‑poly 3‑D art, using dramatic rim lighting
54
+ A cascade of cosmic paint pouring from an open window, chalk pastel drawing, captured in long‑exposure motion blur
55
+ A bonsai tree shaped like a twisting dragon, hyperreal CGI composite, captured in long‑exposure motion blur
56
+ A bonsai tree shaped like a twisting dragon, isometric pixel art, shot with tilt‑shift focus
57
+ A futuristic monorail gliding above verdant rice terraces, neon synthwave palette, captured in long‑exposure motion blur
58
+ An owl composed entirely of clock gears and cogs, sepia ink sketch, with volumetric fog
59
+ A crystal castle perched on a cliff of amethyst, neon synthwave palette, surrounded by soft bokeh
60
+ A fox wearing a patchwork cloak of autumn leaves, low‑poly 3‑D art, framed by towering archways
61
+ A glass teapot swirling with blooming jasmine tea, watercolor illustration, surrounded by soft bokeh
62
+ A transparent cube containing a miniature thunderstorm, isometric pixel art, captured in long‑exposure motion blur
63
+ An antique compass resting on weathered parchment, photoreal 8K render, captured in long‑exposure motion blur
64
+ A futuristic monorail gliding above verdant rice terraces, hyperreal CGI composite, captured in long‑exposure motion blur
65
+ A crystal castle perched on a cliff of amethyst, sepia ink sketch, under a crescent moon
66
+ A transparent cube containing a miniature thunderstorm, hyperreal CGI composite, using dramatic rim lighting
67
+ A pair of vintage typewriters merging into a butterfly, photoreal 8K render, in Art‑Nouveau line work
68
+ A subway station covered in bioluminescent moss, hyperreal CGI composite, in Art‑Nouveau line work
69
+ A starlit library built inside a hollowed redwood, ultrawide cinematic frame, using dramatic rim lighting
70
+ A crystal castle perched on a cliff of amethyst, neon synthwave palette, using dramatic rim lighting
71
+ 'Solstice' rendered as a neon sign suspended between skyscrapers, golden glow
72
+ 'Eclipse' rendered as a drone light show emerging from city fog, golden glow
73
+ 'Solstice' rendered as a drone light show hovering above rolling clouds, golden glow
74
+ 'Harmony' rendered as a chalkboard typography projected onto canyon walls, golden glow
75
+ 'Quantum' rendered as a chalkboard typography over a misty harbor, golden glow
76
+ 'Harmony' rendered as an LED billboard against a star‑filled night sky, golden glow
77
+ 'Harmony' rendered as a drone light show projected onto canyon walls, golden glow
78
+ 'Solstice' rendered as a drone light show suspended between skyscrapers, golden glow
79
+ 'Harmony' rendered as an LED billboard against a star‑filled night sky, golden glow
80
+ 'Quantum' rendered as an ice sculpture headline framed by snow‑capped peaks, golden glow
81
+ 'Eclipse' rendered as a sand‑dune calligraphy framed by snow‑capped peaks, golden glow
82
+ 'Solstice' rendered as an LED billboard against a star‑filled night sky, golden glow
83
+ 'Quantum' rendered as an LED billboard reflected on a tranquil lake, golden glow
84
+ 'Harmony' rendered as a floral arrangement over a misty harbor, golden glow
85
+ 'Zenith' rendered as an ice sculpture headline reflected on a tranquil lake, golden glow
86
+ 'Serenity' rendered as a neon sign over a misty harbor, golden glow
87
+ 'Eclipse' rendered as an LED billboard reflected on a tranquil lake, golden glow
88
+ 'Momentum' rendered as a neon sign hovering above rolling clouds, golden glow
89
+ 'Quantum' rendered as a drone light show suspended between skyscrapers, golden glow
90
+ 'Zenith' rendered as an ice sculpture headline hovering above rolling clouds, golden glow
91
+ A red apple on a blue table next to a glass of water, low‑poly 3‑D art
92
+ Two identical robots playing chess under a lantern, low‑poly 3‑D art
93
+ A green hummingbird hovering above a purple tulip inside a snow globe, hyperreal CGI composite
94
+ A stack of three books with a lit candle on top beside an hourglass, neon synthwave palette
95
+ A silver teapot pouring tea into a floating porcelain cup, sepia ink sketch
96
+ A tiny astronaut standing on a giant sunflower facing the sunrise, photoreal 8K render
97
+ A cat sleeping under a transparent umbrella in gentle rain, chalk pastel drawing
98
+ A bicycle leaning against a graffiti‑covered wall under string lights, chalk pastel drawing
99
+ An origami crane flying over a steaming cup of coffee on a saucer, chalk pastel drawing
100
+ A vintage camera resting on a map with scattered film negatives, low‑poly 3‑D art
configs/captions/example_prompts4.txt ADDED
@@ -0,0 +1,800 @@
1
+ A bioluminescent rainforest at night, viewed from a canopy walkway, hyper-real, crisp moonlight filtering through mist
2
+ Cross-section of an imaginary geode revealing swirling nebula-like mineral layers, macro photography style
3
+ Futuristic library carved into a glacier, warm interior lighting contrasting icy blue walls, isometric view
4
+ Surreal desert with floating sandstone monoliths casting long shadows at golden hour, ultra-wide lens
5
+ Vintage watercolor map of an archipelago shaped like musical notes, illustrated cartography
6
+ Cyberpunk alley drenched in neon rain, reflective puddles, no characters, cinematic atmosphere
7
+ Close-up of a hummingbird made of fractal glass shards hovering near a sapphire flower, 8K detail
8
+ Orbiting observatory above a gas-giant planet, rings stretching across star-filled sky, photoreal
9
+ Abstract kinetic sculpture of twisting ribbons suspended in a white cube gallery, studio lighting
10
+ Fog-covered pine forest with a single crimson tree in the center, muted color palette
11
+ Time-lapse style composite of a tidal pool from dawn to dusk, stitched into one frame
12
+ Isometric diagram of an autonomous greenhouse on Mars, annotated schematics aesthetic
13
+ Paper-cut illustration of a city inside a whale, layered depth, soft muted tones
14
+ Steampunk airship port at sunrise, brass machinery glinting, painterly brushwork
15
+ Minimalist ink wash painting of a solitary mountain peak emerging from clouds
16
+ Ultraviolet microscope image of an invented pollen grain with crystalline spikes
17
+ Retro 8-bit pixel art scene of a cozy lakeside cabin under meteor shower
18
+ Low-poly 3-D render of a coral reef teeming with geometric fish shapes
19
+ Aerial view of a terraced rice field arranged in a perfect Fibonacci spiral
20
+ Schematic cutaway of a clockwork heart pumping luminous liquid, technical drawing style
21
+ Long-exposure night photograph of fireflies tracing mathematical Lissajous curves
22
+ Gothic cathedral interior built entirely from translucent ice, soft subsurface scattering
23
+ Top-down macro of latte foam forming a fractal coastline pattern
24
+ Astronomical illustration of a triple-sunset over an ocean on an exoplanet
25
+ Ink-on-parchment concept art of a floating pagoda tethered by chains to mountain peaks
26
+ Cubist still life of fruit and musical instruments, vivid complementary colors
27
+ Moody black-and-white film photograph of rain on a lonely train platform, 1950s era
28
+ Hyperreal chrome koi fish swimming through clouds, sky as water
29
+ Floral mandala assembled from autumn leaves, top-down symmetric composition
30
+ Concept art of an underground crystal cavern illuminated by bioluminescent fungi
31
+ Sci-fi control room with holographic interfaces projected into fog, teal-orange palette
32
+ Minimal claymation style landscape with rolling pastel hills and giant daisies
33
+ Polaroid aesthetic photo of a roadside diner at twilight, neon sign flickering
34
+ Vector infographic showing the life cycle of a fictional winged seed, flat design
35
+ Dream-like seascape where waves morph into galloping horses, double-exposure effect
36
+ Art-deco poster of an interstellar passenger train speeding past moons
37
+ Cross-section illustration of a layered cake that resembles planetary strata
38
+ Infrared photograph of a mangrove swamp, foliage appearing white, water inky black
39
+ Whimsical pencil sketch of a tea party with levitating porcelain, soft shading
40
+ Architectural render of a zero-gravity museum with exhibits floating mid-air
41
+ Oil painting of a stormy sky splitting into vortices shaped like musical clefs
42
+ Isometric cutaway of an underground dwarf forge with molten rivers, game concept art
43
+ Frosted glass terrarium containing a miniature thunderstorm, studio backdrop
44
+ Minimalist cyanotype print of fern leaves arranged in a golden ratio spiral
45
+ Fantasy moonlit waterfall cascading upward into the sky, long-exposure feel
46
+ Retro-futuristic poster of a solar-powered desert rover kicking up red dust
47
+ Double helix made of blooming flowers against a white background, high-key macro
48
+ Top-down shot of a labyrinth garden trimmed into Escher-like impossible geometry
49
+ Sci-fi vending machine selling bottled starlight, hologram price tags
50
+ Watercolor portrait of an abstract humanoid with translucent skin revealing galaxies
51
+ Silhouette of a lone tree on an island reflected perfectly in still water, dusk gradient
52
+ Close-up macro of snowflakes arranged to form a Mandelbrot set
53
+ Ink drawing of a koi pond where fish tails morph into swirling calligraphy strokes
54
+ Hyperreal food photography of a floating stack of pancakes with gravity-defying syrup
55
+ Electroluminescent circuit board cityscape at night, streets as glowing traces
56
+ Surreal scene of books sprouting wings and migrating across a sunset sky
57
+ Low-angle view of a colossal sandstone arch framing a star-filled Milky Way
58
+ Cross-section of a mechanical sunflower tracking a miniature artificial sun
59
+ Art-nouveau travel poster for an imaginary cloud kingdom, flowing line art
60
+ Graph-paper style blueprint of a perpetual-motion water wheel, annotated
61
+ Futuristic zen garden with levitating raked sand and floating bonsai stones
62
+ Photoreal underwater city with glass domes linked by glowing tunnels
63
+ Tilt-shift photo of a festival lantern parade through narrow cobblestone streets
64
+ Neon wireframe landscape reminiscent of 1980s synthwave, grid fading to horizon
65
+ Paper-quilling style illustration of a comet bursting into colorful spirals
66
+ Panorama of a crimson aurora over icy mountains, ultra-wide 16:9 aspect
67
+ Transparent holographic chess set floating in zero-gravity, pieces mid-game
68
+ Pointillist painting of a bustling open-air market under summer sun
69
+ Infrared thermal view of a volcanic eruption, palette mapped to rainbow hues
70
+ Detail shot of clock gears where each tooth is a tiny stairway with lanterns
71
+ Minimal line-art poster depicting the evolution of flight from feathers to starships
72
+ Glowing jellyfish drifting through a misty pine forest at dawn, photoreal composite
73
+ Art-studio workbench cluttered with vintage robotics schematics and metal parts
74
+ Monochrome charcoal drawing of a lighthouse beam piercing heavy fog
75
+ Isometric voxel art of a floating garden island with waterfalls spilling into void
76
+ Surreal split-scene: left half winter forest, right half summer meadow, seamless blend
77
+ Retro postage stamp design celebrating a fictional eclipse festival
78
+ Hyperdetailed ceramic mosaic of a phoenix rising, mediterranean style
79
+ Sci-fi medical lab growing crystalline plants in suspended nutrient orbs
80
+ High-speed photo of colored ink clouds colliding underwater, symmetrical composition
81
+ Anamorphic street art illusion of a chasm opening in a city square
82
+ Timber-frame hobbit-style cottage under giant sunflowers, golden afternoon
83
+ Futuristic monorail weaving through skyscrapers wrapped in vertical gardens
84
+ Scientific render of a transparent hypercube containing swirling plasma
85
+ Sepia photograph of an abandoned observatory overtaken by vines
86
+ Concept piece: biomechanical dragon skeleton displayed in a museum hall
87
+ Minimal gradient poster of a single droplet rippling concentric neon rings
88
+ Chalkboard schematic showing stages of a do-it-yourself constellation projector
89
+ Digital glitch art of a city skyline melting into cascading pixels
90
+ Aerial drone shot of rice paddies shaped like circuitry pathways
91
+ Macro of soap film displaying shifting rainbow interference patterns
92
+ Oil-on-canvas seascape where waves are brush strokes of pure geometry
93
+ Tilted perspective of a spiral staircase made entirely of stained glass
94
+ Hyperreal 3-D render of a desert mirage city shimmering above dunes
95
+ Vectorized infographic of wind turbine anatomy with exploded components
96
+ Snow-covered bamboo forest under lantern light, gentle falling flakes
97
+ Abstract generative art of golden particles forming a torus knot in black void
98
+ Stop-motion clay diorama of a miniature volcano erupting sprinkles
99
+ Ultrawide cinematic shot of two converging thunderstorms over open ocean
100
+ Graphite sketch of intertwined river deltas resembling tree roots, top-down view
101
+ Design an hourglass where sand forms miniature mountains in vector flat design style
102
+ Design an hourglass where sand forms miniature mountains in low-poly 3‑D model style
103
+ Depict a mountain range shaped like sleeping giants in vibrant watercolor style
104
+ Observe a futuristic city built on towering waterfalls in steampunk-inspired blueprint style
105
+ Create a desert of shattered stained glass dunes in neon-lit synthwave illustration style
106
+ Envision a forest whose trees emit soft neon pulses in minimalist ink sketch style
107
+ Depict a desert of shattered stained glass dunes in baroque-style oil painting style
108
+ Render a spiral staircase carved from moonlight in baroque-style oil painting style
109
+ Depict a surreal corridor of mirrors reflecting infinite galaxies in digital matte painting style
110
+ Render an hourglass where sand forms miniature mountains in digital matte painting style
111
+ Design a violin constructed of flowing water in steampunk-inspired blueprint style
112
+ Compose a futuristic city built on towering waterfalls in minimalist ink sketch style
113
+ Depict a desert of shattered stained glass dunes in digital matte painting style
114
+ Create a violin constructed of flowing water in minimalist ink sketch style
115
+ Compose a forest whose trees emit soft neon pulses in neon-lit synthwave illustration style
116
+ Observe a violin constructed of flowing water in steampunk-inspired blueprint style
117
+ Observe an antique compass floating above stormy seas in minimalist ink sketch style
118
+ Create a desert of shattered stained glass dunes in photorealistic concept art style
119
+ Design a forest whose trees emit soft neon pulses in neon-lit synthwave illustration style
120
+ Envision an antique compass floating above stormy seas in low-poly 3‑D model style
121
+ Picture a surreal corridor of mirrors reflecting infinite galaxies in low-poly 3‑D model style
122
+ Depict a mountain range shaped like sleeping giants in digital matte painting style
123
+ Visualize a violin constructed of flowing water in photorealistic concept art style
124
+ Visualize a violin constructed of flowing water in vibrant watercolor style
125
+ Create an hourglass where sand forms miniature mountains in vector flat design style
126
+ Render a mountain range shaped like sleeping giants in minimalist ink sketch style
127
+ Render a mountain range shaped like sleeping giants in vector flat design style
128
+ Envision a library whose shelves orbit a glowing star in photorealistic concept art style
129
+ Picture a futuristic city built on towering waterfalls in vibrant watercolor style
130
+ Create a library whose shelves orbit a glowing star in minimalist ink sketch style
131
+ Depict a forest whose trees emit soft neon pulses in minimalist ink sketch style
132
+ Envision a futuristic city built on towering waterfalls in digital matte painting style
133
+ Envision a violin constructed of flowing water in steampunk-inspired blueprint style
134
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in hyperreal CGI render style
135
+ Imagine a forest whose trees emit soft neon pulses in vector flat design style
136
+ Depict a spiral staircase carved from moonlight in neon-lit synthwave illustration style
137
+ Picture a library whose shelves orbit a glowing star in minimalist ink sketch style
138
+ Compose an hourglass where sand forms miniature mountains in minimalist ink sketch style
139
+ Visualize a forest whose trees emit soft neon pulses in vibrant watercolor style
140
+ Picture a violin constructed of flowing water in minimalist ink sketch style
141
+ Depict a spiral staircase carved from moonlight in hyperreal CGI render style
142
+ Visualize a spiral staircase carved from moonlight in low-poly 3‑D model style
143
+ Picture a futuristic city built on towering waterfalls in vector flat design style
144
+ Picture a spiral staircase carved from moonlight in low-poly 3‑D model style
145
+ Picture a spiral staircase carved from moonlight in hyperreal CGI render style
146
+ Visualize a spiral staircase carved from moonlight in steampunk-inspired blueprint style
147
+ Observe a surreal corridor of mirrors reflecting infinite galaxies in low-poly 3‑D model style
148
+ Envision a surreal corridor of mirrors reflecting infinite galaxies in hyperreal CGI render style
149
+ Depict a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
150
+ Imagine a desert of shattered stained glass dunes in low-poly 3‑D model style
151
+ Envision a library whose shelves orbit a glowing star in vibrant watercolor style
152
+ Compose an antique compass floating above stormy seas in low-poly 3‑D model style
153
+ Design an antique compass floating above stormy seas in neon-lit synthwave illustration style
154
+ Design a forest whose trees emit soft neon pulses in baroque-style oil painting style
155
+ Design a surreal corridor of mirrors reflecting infinite galaxies in photorealistic concept art style
156
+ Envision a futuristic city built on towering waterfalls in hyperreal CGI render style
157
+ Render a forest whose trees emit soft neon pulses in digital matte painting style
158
+ Design an antique compass floating above stormy seas in vector flat design style
159
+ Compose a desert of shattered stained glass dunes in vibrant watercolor style
160
+ Design an hourglass where sand forms miniature mountains in minimalist ink sketch style
161
+ Imagine a library whose shelves orbit a glowing star in baroque-style oil painting style
162
+ Compose a spiral staircase carved from moonlight in vibrant watercolor style
163
+ Compose a desert of shattered stained glass dunes in minimalist ink sketch style
164
+ Compose a library whose shelves orbit a glowing star in digital matte painting style
165
+ Render a library whose shelves orbit a glowing star in vibrant watercolor style
166
+ Envision a violin constructed of flowing water in steampunk-inspired blueprint style
167
+ Create an antique compass floating above stormy seas in digital matte painting style
168
+ Create a surreal corridor of mirrors reflecting infinite galaxies in vector flat design style
169
+ Observe a mountain range shaped like sleeping giants in vector flat design style
170
+ Depict a library whose shelves orbit a glowing star in hyperreal CGI render style
171
+ Compose a violin constructed of flowing water in photorealistic concept art style
172
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in neon-lit synthwave illustration style
173
+ Visualize a violin constructed of flowing water in baroque-style oil painting style
174
+ Picture a futuristic city built on towering waterfalls in hyperreal CGI render style
175
+ Design a desert of shattered stained glass dunes in hyperreal CGI render style
176
+ Imagine an hourglass where sand forms miniature mountains in vibrant watercolor style
177
+ Visualize a futuristic city built on towering waterfalls in digital matte painting style
178
+ Visualize a violin constructed of flowing water in photorealistic concept art style
179
+ Observe a futuristic city built on towering waterfalls in hyperreal CGI render style
180
+ Create a mountain range shaped like sleeping giants in vibrant watercolor style
181
+ Visualize a violin constructed of flowing water in digital matte painting style
182
+ Design a futuristic city built on towering waterfalls in digital matte painting style
183
+ Depict a forest whose trees emit soft neon pulses in digital matte painting style
184
+ Design an hourglass where sand forms miniature mountains in photorealistic concept art style
185
+ Visualize a surreal corridor of mirrors reflecting infinite galaxies in photorealistic concept art style
186
+ Picture a mountain range shaped like sleeping giants in vibrant watercolor style
187
+ Compose a futuristic city built on towering waterfalls in vibrant watercolor style
188
+ Depict a mountain range shaped like sleeping giants in hyperreal CGI render style
189
+ Envision a violin constructed of flowing water in photorealistic concept art style
190
+ Imagine a desert of shattered stained glass dunes in steampunk-inspired blueprint style
191
+ Compose a library whose shelves orbit a glowing star in neon-lit synthwave illustration style
192
+ Render a desert of shattered stained glass dunes in low-poly 3‑D model style
193
+ Imagine a futuristic city built on towering waterfalls in steampunk-inspired blueprint style
194
+ Picture a futuristic city built on towering waterfalls in minimalist ink sketch style
195
+ Imagine a violin constructed of flowing water in steampunk-inspired blueprint style
196
+ Render a mountain range shaped like sleeping giants in baroque-style oil painting style
197
+ Envision a futuristic city built on towering waterfalls in photorealistic concept art style
198
+ Observe a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
199
+ Depict a mountain range shaped like sleeping giants in vibrant watercolor style
200
+ Observe a forest whose trees emit soft neon pulses in steampunk-inspired blueprint style
201
+ Golden hour photograph of a red fox stepping cautiously through dewy meadow grass
202
+ Macro shot of morning frost crystallizing on fern fronds in temperate rainforest
203
+ Underwater wide‑angle photo of a sea turtle gliding above a coral reef teeming with anthias
204
+ High‑altitude drone capture of alpine lake mirrors jagged snow‑capped peaks and drifting clouds
205
+ Slow‑shutter waterfall scene where silky water cascades over mossy basalt boulders in Icelandic gorge
206
+ Cinematic backlit portrait of a barn owl perched on ancient oak branch amid floating dust motes
207
+ Time‑lapse composite of Milky Way arcing above blooming lavender fields in Provence
208
+ Split‑level photo of mangrove roots below waterline and sunset‑lit shoreline above
209
+ Close‑up of honeybee collecting pollen from vibrant sunflower, pollen grains visible on legs
210
+ Foggy morning panorama of rolling hills covered in tea plantations with workers in colorful attire
211
+ Crystal‑clear lakebed photograph revealing patterned stones beneath undisturbed surface reflections
212
+ High‑speed capture of kingfisher diving, water droplets frozen mid‑air around electric‑blue feathers
213
+ Infrared landscape where leafy canopy glows white against deep charcoal sky over calm river
214
+ Portrait of snow leopard resting on sun‑warmed granite ledge, whiskers sharply detailed
215
+ Aerial view of winding river carving emerald wetlands into fractal branching shapes
216
+ Cave photograph of bioluminescent glowworms illuminating stalactites like starry constellations
217
+ Golden backlight silhouette of wild horses galloping across dust‑filled prairie at dusk
218
+ Macro image of dragonfly wings showing iridescent lattice structure against soft bokeh background
219
+ Long‑exposure night photo of fireflies painting light trails over forest clearing
220
+ Humpback whale breaching beside sailboat under dramatic storm clouds, telephoto perspective
221
+ Minimalist snowy landscape with single crimson maple tree breaking endless white expanse
222
+ Low‑angle shot of raindrops impacting still pond, concentric ripples overlapping gracefully
223
+ Juvenile emperor penguins huddling together on Antarctic ice shelf, gentle snowfall
224
+ Monsoon lightning fork illuminating terraced rice paddies in tropical valley
225
+ Photograph of desert sand dune crest with sharp wind‑carved ridges and subtle color gradients
226
+ Crisp autumn scene with mirror‑still lake reflecting birch trees in full yellow foliage
227
+ Grizzly bear catching salmon mid‑leap at waterfall, droplets sparkling in sun
228
+ Ultra‑wide rainforest canopy shot looking straight up at towering kapok trees and lianas
229
+ Frosty spiderweb adorned with dew pearls against muted sunrise pastel sky
230
+ Split‑tone black‑and‑white portrait of African elephant with textured, weathered skin
231
+ Close‑focus shot of chameleon eyes moving independently while clinging to branch
232
+ Macro of raindrop on leaf acting as natural lens magnifying vein structure
233
+ Evening photo of bioluminescent plankton igniting gentle waves on secluded beach
234
+ Snowy owl in silent flight, wings fully extended, low sunlight catching feathers
235
+ Panoramic vista of Grand Canyon with storm rolling in, sunbreak spotlighting sandstone layers
236
+ Reflection of northern lights on frozen lake with cracked ice foreground patterns
237
+ Monarch butterflies clustering densely on eucalyptus branches during migration season
238
+ Fogbow arcing over coastal cliffs during sunrise, soft pastel halo effect
239
+ Underwater shot beneath breaking wave showing turbulent bubbles and sandy seabed
240
+ Starlit desert night with silhouetted Joshua trees and meteor streaks overhead
241
+ Close‑up of mossy forest floor with tiny mushrooms resembling fairy rooftops
242
+ Gorilla family interaction, young playing under watchful silverback in mountain forest
243
+ Infrared portrait of flamingo colony, feathers rendered in surreal icy tones
244
+ Sunlit macro of water droplet refracting inverted mountain landscape
245
+ Silk‑smooth long‑exposure image of tide swirling around jagged sea stacks
246
+ Beetle with metallic iridescence crawling over textured bark, focus stacked
247
+ Golden eagle soaring against dramatic cumulonimbus background, telephoto sharpness
248
+ Low‑key studio shot of concentric nautilus shell cross‑section revealing logarithmic spiral
249
+ High‑shutter capture of hummingbird beating wings beside red hibiscus bloom
250
+ Pastel sunrise over mist‑covered bamboo forest, layered depth fading into distance
251
+ Underwater cave photo with diver silhouette illuminated by cyan light shaft
252
+ Macro of snowflake on dark wool mitten showing intricate hexagonal symmetry
253
+ Valley of wildflowers beneath towering granite cliffs during alpine spring bloom
254
+ Reflection of autumn forest distorted in moving river captured abstractly
255
+ Overhead view of stingrays casting shadows on shallow sandy seafloor in turquoise water
256
+ Red fox curled into ball sleeping amidst fallen maple leaves, soft light
257
+ Thunderstorm shelf cloud sweeping across prairie wheat field, cinematic contrast
258
+ Underlit jellyfish drifting gracefully in inky black aquarium space, tentacles trailing
259
+ Cliffside puffin returning to burrow with beak full of fish, ocean backdrop
260
+ High‑frame burst of snow being shaken off evergreen branch, frozen crystals glimmering
261
+ Evening silhouette of baobab trees reflected in seasonal floodplain under violet sky
262
+ Ultra‑wide sunflower field facing rising sun, radial pattern leading toward horizon
263
+ Close‑up of wolf tracks imprinted in fresh snow, subtle shadows defining ridges
264
+ Macro texture study of elephant skin crack patterns, monochrome emphasis
265
+ Gentle cascade flowing over terraced travertine pools, mineral‑rich aqua water
266
+ Colony of bats exiting cave entrance at dusk, blurred motion streaks against sky
267
+ Raindrops clinging to spider lily petals, selective focus yields painterly background
268
+ Drone shot of turquoise river braided through black volcanic sands creating abstract art
269
+ Slow‑motion splash of crimson pomegranate seeds into clear water, crown burst captured
270
+ Glasswing butterfly perched on leaf, transparent wings revealing background blossoms
271
+ Camel caravan crossing vast erg dunes under scorching midday sun, heat shimmer visible
272
+ Sunset silhouette of giraffes browsing acacia trees on savanna ridge
273
+ Macro portrait of praying mantis head showing compound eye facets with rainbow sheen
274
+ Sunlit iceberg arch framing distant mountain range, polar wilderness
275
+ Shallow‑depth‑of‑field photo of blooming cherry blossoms with soft pastel bokeh
276
+ Sodium vapor night image of urban fox exploring quiet alley, eyes gleaming
277
+ Cresting ocean wave backlit to reveal turquoise translucence and spray diamonds
278
+ Water droplet crown captured on reflective surface using high‑speed flash
279
+ Panorama of volcanic eruption under starry sky, lava rivers glowing intensely
280
+ Quokka standing on hind legs engaging camera with curious expression, beach background
281
+ Macro of tomato frog skin showing porous texture and vivid coloration
282
+ Stormy sky double‑rainbow over field of lupine flowers in twilight
283
+ Leafcutter ants marching along branch carrying leaf fragments, shallow focus foreground blur
284
+ Silhouette of migrating cranes flying in V‑formation across fiery sunrise
285
+ Submerged forest trunks in crystal lake water creating surreal vertical reflections
286
+ Close‑up of gecko adhesion pads under microscope revealing microscopic setae
287
+ Golden waterfall of ginkgo leaves falling in city park during gentle breeze
288
+ Moody long‑exposure shot of lighthouse battered by crashing Atlantic waves
289
+ Moorland early‑morning heather fields shrouded in low‑lying mist, soft pink tones
290
+ Axolotl photographed head‑on in clear water tank, external gills fanned out
291
+ Panoramic ridge walk above sea of clouds with hikers silhouetted, late afternoon
292
+ Infrared aerial of agricultural patchwork revealing hidden irrigation patterns
293
+ Backlit translucent maple leaf showing branching vein network in vivid detail
294
+ Whale shark alongside snorkeler for scale, sun rays piercing surface
295
+ Fire salamander crawling across wet moss with saturated black‑yellow contrast
296
+ Ice cave interior glowing sapphire as sunlight filters through thick glacier ice
297
+ Desert bloom macro of cactus flower opening at dawn, dew droplets sparkling
298
+ Slow‑shutter capture of star trails spinning around Polaris above stone ruins
299
+ High‑speed frame of geyser eruption against cobalt sky in geothermal field
300
+ Portrait of peacock displaying fully fanned tail feathers, iridescent eyespots centered
301
+ ancient cave painting of a hedgehog floating in cosmic void
302
+ Art‑Deco travel poster for a zorilla projected onto rainy cityscape
303
+ Japanese ink wash of an urchin against black velvet backdrop
304
+ stained‑glass mosaic of a tiger amid swirling galaxies
305
+ Art‑Deco travel poster for a jellyfish under moonlit sky
306
+ neon synthwave poster featuring a deer amid swirling galaxies
307
+ cyberpunk hologram of a cheetah amid swirling galaxies
308
+ Celtic knotwork engraving of a capybara over vibrant gradient background
309
+ Celtic knotwork engraving of a red panda over vibrant gradient background
310
+ embroidered textile artwork of a panda inside glass terrarium
311
+ low‑poly 3‑D render of a dolphin projected onto rainy cityscape
312
+ baroque oil painting featuring a jellyfish under moonlit sky
313
+ embroidered textile artwork of a penguin inside glass terrarium
314
+ baroque oil painting featuring a quokka against black velvet backdrop
315
+ low‑poly 3‑D render of a koala against black velvet backdrop
316
+ Japanese ink wash of an octopus surrounded by geometric patterns
317
+ cyberpunk hologram of an egret against black velvet backdrop
318
+ steampunk clockwork version of a macaw against black velvet backdrop
319
+ cubist painting depicting an ibis over vibrant gradient background
320
+ cyberpunk hologram of a salamander amid swirling galaxies
321
+ ancient cave painting of a giraffe over vibrant gradient background
322
+ steampunk clockwork version of an ibis surrounded by geometric patterns
323
+ minimalist line‑art study of a hedgehog over vibrant gradient background
324
+ chalk pastel sidewalk mural of a red panda on vintage parchment
325
+ baroque oil painting featuring a salamander amid swirling galaxies
326
+ Art‑Deco travel poster for a hippopotamus floating in cosmic void
327
+ low‑poly 3‑D render of an alpaca on vintage parchment
328
+ chalk pastel sidewalk mural of a newt under moonlit sky
329
+ minimalist line‑art study of an armadillo surrounded by geometric patterns
330
+ futuristic chrome statue of a dolphin inside glass terrarium
331
+ Art‑Deco travel poster for a macaw under moonlit sky
332
+ cubist painting depicting an urchin in ornate golden frame
333
+ chalk pastel sidewalk mural of a quokka amid swirling galaxies
334
+ steampunk clockwork version of a peacock on vintage parchment
335
+ psychedelic tie‑dye depiction of a toucan surrounded by geometric patterns
336
+ cyberpunk hologram of a tiger projected onto rainy cityscape
337
+ ancient cave painting of a badger floating in cosmic void
338
+ stained‑glass mosaic of a caracal over vibrant gradient background
339
+ embroidered textile artwork of a dragonfly on vintage parchment
340
+ pixel‑art sprite sheet for an armadillo over vibrant gradient background
341
+ futuristic chrome statue of a salamander inside glass terrarium
342
+ origami paper sculpture of an elephant surrounded by geometric patterns
343
+ minimalist line‑art study of an ibis amid swirling galaxies
344
+ Art‑Deco travel poster for a giraffe under moonlit sky
345
+ cubist painting depicting a giraffe over vibrant gradient background
346
+ embroidered textile artwork of a dragonfly surrounded by geometric patterns
347
+ ceramic glazed statue of a newt over vibrant gradient background
348
+ Art‑Deco travel poster for a bison under moonlit sky
349
+ chalk pastel sidewalk mural of a caracal on vintage parchment
350
+ steampunk clockwork version of a jellyfish under moonlit sky
351
+ chalk pastel sidewalk mural of a penguin projected onto rainy cityscape
352
+ cyberpunk hologram of an alpaca projected onto rainy cityscape
353
+ stained‑glass mosaic of a cheetah amid swirling galaxies
354
+ cyberpunk hologram of a hedgehog projected onto rainy cityscape
355
+ watercolor splash illustration of a caracal amid swirling galaxies
356
+ neon synthwave poster featuring a goat on vintage parchment
357
+ ancient cave painting of a hedgehog projected onto rainy cityscape
358
+ psychedelic tie‑dye depiction of a whale against black velvet backdrop
359
+ Celtic knotwork engraving of a kiwi bird inside glass terrarium
360
+ ancient cave painting of a bison inside glass terrarium
361
+ origami paper sculpture of a whale inside glass terrarium
362
+ origami paper sculpture of an armadillo under moonlit sky
363
+ stained‑glass mosaic of a seahorse on vintage parchment
364
+ steampunk clockwork version of a kangaroo on vintage parchment
365
+ ceramic glazed statue of a koala against black velvet backdrop
366
+ chalk pastel sidewalk mural of a whale against black velvet backdrop
367
+ minimalist line‑art study of a lemur on vintage parchment
368
+ Art‑Deco travel poster for a tiger projected onto rainy cityscape
369
+ watercolor splash illustration of a ferret amid swirling galaxies
370
+ minimalist line‑art study of a newt on vintage parchment
371
+ psychedelic tie‑dye depiction of an ibis over vibrant gradient background
372
+ Art‑Deco travel poster for a bison over vibrant gradient background
373
+ futuristic chrome statue of a cheetah under moonlit sky
374
+ baroque oil painting featuring a vulture projected onto rainy cityscape
375
+ Japanese ink wash of a cheetah projected onto rainy cityscape
376
+ minimalist line‑art study of a tapir inside glass terrarium
377
+ stained‑glass mosaic of a raven on vintage parchment
378
+ Japanese ink wash of a macaw under moonlit sky
379
+ pixel‑art sprite sheet for a tapir projected onto rainy cityscape
380
+ stained‑glass mosaic of a tapir under moonlit sky
381
+ Art‑Deco travel poster for a kangaroo against black velvet backdrop
382
+ Japanese ink wash of a panda amid swirling galaxies
383
+ watercolor splash illustration of a giraffe inside glass terrarium
384
+ minimalist line‑art study of a bison against black velvet backdrop
385
+ chalk pastel sidewalk mural of an egret over vibrant gradient background
386
+ origami paper sculpture of a polar bear amid swirling galaxies
387
+ minimalist line‑art study of a walrus under moonlit sky
388
+ Celtic knotwork engraving of a salamander over vibrant gradient background
389
+ cyberpunk hologram of an ibis amid swirling galaxies
390
+ minimalist line‑art study of a toucan under moonlit sky
391
+ pixel‑art sprite sheet for a quokka floating in cosmic void
392
+ chalk pastel sidewalk mural of a bison in ornate golden frame
393
+ low‑poly 3‑D render of a capybara under moonlit sky
394
+ Celtic knotwork engraving of a yak inside glass terrarium
395
+ steampunk clockwork version of an armadillo amid swirling galaxies
396
+ baroque oil painting featuring a tiger against black velvet backdrop
397
+ steampunk clockwork version of a newt inside glass terrarium
398
+ ancient cave painting of an owl over vibrant gradient background
399
+ ancient cave painting of a raven in ornate golden frame
400
+ origami paper sculpture of an egret against black velvet backdrop
401
+ Photorealistic close‑up of a jackal basking in sun in the misty valley
402
+ Photorealistic close‑up of a caracal calling loudly in the sunlit heathland
403
+ Photorealistic close‑up of a walrus calling loudly in the snowy tundra
404
+ Photorealistic close‑up of a toucan gliding effortlessly in the sunlit heathland
405
+ Photorealistic close‑up of a newt leaping gracefully in the crystal clear alpine lake
406
+ Photorealistic close‑up of a macaw calling loudly in the rocky coastal cliffs
407
+ Photorealistic close‑up of a raven gliding effortlessly in the starlit ocean surface
408
+ Photorealistic close‑up of a kiwi bird bathing playfully in the tropical rainforest canopy
409
+ Photorealistic close‑up of a peacock nursing its young in the savanna at dawn
410
+ Photorealistic close‑up of a penguin resting peacefully in the tropical rainforest canopy
411
+ Photorealistic close‑up of an octopus gliding effortlessly in the crystal clear alpine lake
412
+ Photorealistic close‑up of a vulture gliding effortlessly in the rushing waterfall spray
413
+ Photorealistic close‑up of a butterfly resting peacefully in the tropical rainforest canopy
414
+ Photorealistic close‑up of a porcupine leaping gracefully in the coral reef
415
+ Photorealistic close‑up of an elephant leaping gracefully in the dense mangrove swamp
416
+ Photorealistic close‑up of a butterfly hunting silently in the rushing waterfall spray
417
+ Photorealistic close‑up of a whale prowling cautiously in the crystal clear alpine lake
418
+ Photorealistic close‑up of an egret hunting silently in the snowy tundra
419
+ Photorealistic close‑up of an urchin prowling cautiously in the mountain meadow
420
+ Photorealistic close‑up of an ibis gliding effortlessly in the golden desert dunes
421
+ Photorealistic close‑up of a cheetah nursing its young in the open grassland under stormy sky
422
+ Photorealistic close‑up of a koala leaping gracefully in the snowy tundra
423
+ Photorealistic close‑up of a vulture resting peacefully in the sunlit heathland
424
+ Photorealistic close‑up of a dragonfly gliding effortlessly in the icy Antarctic shelf
425
+ Photorealistic close‑up of a ferret basking in sun in the dense mangrove swamp
426
+ Photorealistic close‑up of a panda nursing its young in the starlit ocean surface
427
+ Photorealistic close‑up of a zorilla leaping gracefully in the submerged kelp forest
428
+ Photorealistic close‑up of a koala basking in sun in the misty valley
429
+ Photorealistic close‑up of a lemming foraging curiously in the savanna at dawn
430
+ Photorealistic close‑up of a macaw hunting silently in the tropical rainforest canopy
431
+ Photorealistic close‑up of a dragonfly basking in sun in the submerged kelp forest
432
+ Photorealistic close‑up of an urchin prowling cautiously in the tropical rainforest canopy
433
+ Photorealistic close‑up of a tiger nursing its young in the starlit ocean surface
434
+ Photorealistic close‑up of an egret gliding effortlessly in the foggy pine forest
435
+ Photorealistic close‑up of a goat resting peacefully in the tropical rainforest canopy
436
+ Photorealistic close‑up of a walrus nursing its young in the submerged kelp forest
437
+ Photorealistic close‑up of an elephant bathing playfully in the lush river delta
438
+ Photorealistic close‑up of an urchin calling loudly in the twilight prairie
439
+ Photorealistic close‑up of a giraffe gliding effortlessly in the dense mangrove swamp
440
+ Photorealistic close‑up of a dolphin nursing its young in the foggy pine forest
441
+ Photorealistic close‑up of an egret basking in sun in the mountain meadow
442
+ Photorealistic close‑up of a lion foraging curiously in the golden desert dunes
443
+ Photorealistic close‑up of a wolf leaping gracefully in the coral reef
444
+ Photorealistic close‑up of a seahorse bathing playfully in the mountain meadow
445
+ Photorealistic close‑up of an urchin basking in sun in the savanna at dawn
446
+ Photorealistic close‑up of a flamingo basking in sun in the icy Antarctic shelf
447
+ Photorealistic close‑up of a caterpillar basking in sun in the starlit ocean surface
448
+ Photorealistic close‑up of a polar bear bathing playfully in the coral reef
449
+ Photorealistic close‑up of a kiwi bird leaping gracefully in the starlit ocean surface
450
+ Photorealistic close‑up of a flamingo prowling cautiously in the steep bamboo grove
451
+ Photorealistic close‑up of a dragonfly hunting silently in the rushing waterfall spray
452
+ Photorealistic close‑up of a salamander bathing playfully in the mountain meadow
453
+ Photorealistic close‑up of a hippopotamus hunting silently in the mountain meadow
454
+ Photorealistic close‑up of a capybara hunting silently in the savanna at dawn
455
+ Photorealistic close‑up of an octopus nursing its young in the savanna at dawn
456
+ Photorealistic close‑up of a macaw leaping gracefully in the foggy pine forest
457
+ Photorealistic close‑up of an oriole resting peacefully in the coral reef
458
+ Photorealistic close‑up of a panda foraging curiously in the steep bamboo grove
459
+ Photorealistic close‑up of a tiger leaping gracefully in the snowy tundra
460
+ Photorealistic close‑up of a caracal gliding effortlessly in the icy Antarctic shelf
461
+ Photorealistic close‑up of a panda calling loudly in the snowy tundra
462
+ Photorealistic close‑up of a jackal nursing its young in the open grassland under stormy sky
463
+ Photorealistic close‑up of a kangaroo nursing its young in the misty valley
464
+ Photorealistic close‑up of a polar bear basking in sun in the steep bamboo grove
465
+ Photorealistic close‑up of a toucan resting peacefully in the golden desert dunes
466
+ Photorealistic close‑up of a kiwi bird bathing playfully in the submerged kelp forest
467
+ Photorealistic close‑up of a deer resting peacefully in the twilight prairie
468
+ Photorealistic close‑up of a fox prowling cautiously in the twilight prairie
469
+ Photorealistic close‑up of a bison bathing playfully in the crystal clear alpine lake
470
+ Photorealistic close‑up of a walrus prowling cautiously in the misty valley
471
+ Photorealistic close‑up of a chameleon foraging curiously in the dense mangrove swamp
472
+ Photorealistic close‑up of a raven hunting silently in the crystal clear alpine lake
473
+ Photorealistic close‑up of a peacock basking in sun in the savanna at dawn
474
+ Photorealistic close‑up of a seahorse foraging curiously in the crystal clear alpine lake
475
+ Photorealistic close‑up of a tapir gliding effortlessly in the crystal clear alpine lake
476
+ Photorealistic close‑up of a polar bear resting peacefully in the foggy pine forest
477
+ Photorealistic close‑up of an urchin resting peacefully in the rushing waterfall spray
478
+ Photorealistic close‑up of an armadillo leaping gracefully in the foggy pine forest
479
+ Photorealistic close‑up of a hippopotamus gliding effortlessly in the foggy pine forest
480
+ Photorealistic close‑up of a hippopotamus bathing playfully in the starlit ocean surface
481
+ Photorealistic close‑up of an alpaca nursing its young in the foggy pine forest
482
+ Photorealistic close‑up of an armadillo foraging curiously in the foggy pine forest
483
+ Photorealistic close‑up of a fox resting peacefully in the snowy tundra
484
+ Photorealistic close‑up of a kangaroo resting peacefully in the dense mangrove swamp
485
+ Photorealistic close‑up of an egret foraging curiously in the steep bamboo grove
486
+ Photorealistic close‑up of a toucan hunting silently in the sunlit heathland
487
+ Photorealistic close‑up of a chameleon nursing its young in the tropical rainforest canopy
488
+ Photorealistic close‑up of a raccoon leaping gracefully in the rushing waterfall spray
489
+ Photorealistic close‑up of a chameleon calling loudly in the coral reef
490
+ Photorealistic close‑up of a polar bear foraging curiously in the rushing waterfall spray
491
+ Photorealistic close‑up of a caracal prowling cautiously in the coral reef
492
+ Photorealistic close‑up of a tapir bathing playfully in the starlit ocean surface
493
+ Photorealistic close‑up of a caracal calling loudly in the rocky coastal cliffs
494
+ Photorealistic close‑up of a goat leaping gracefully in the misty valley
495
+ Photorealistic close‑up of a yak calling loudly in the misty valley
496
+ Photorealistic close‑up of a flamingo calling loudly in the golden desert dunes
497
+ Photorealistic close‑up of a cheetah prowling cautiously in the savanna at dawn
498
+ Photorealistic close‑up of a macaw resting peacefully in the snowy tundra
499
+ Photorealistic close‑up of a hedgehog basking in sun in the lush river delta
500
+ Photorealistic close‑up of a bison hunting silently in the snowy tundra
501
+ Portrait of a data visualizer demonstrating calligraphy strokes on rice paper, candid, natural lighting
502
+ Portrait of a bridge acoustics analyst practicing ballet leaps on an empty theater stage, candid, natural lighting
503
+ Portrait of a classical guitarist harvesting greens on a rooftop farm at sunrise, candid, natural lighting
504
+ Portrait of a harbor master crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
505
+ Portrait of an aquaculture farmer glazing pottery beside a crackling kiln, candid, natural lighting
506
+ Portrait of a blacksmith contemplating star charts under a dim observatory dome, candid, natural lighting
507
+ Portrait of a glass mosaic artist practicing ballet leaps on an empty theater stage, candid, natural lighting
508
+ Portrait of an avian ecologist demonstrating calligraphy strokes on rice paper, candid, natural lighting
509
+ Portrait of a landscape painter harvesting greens on a rooftop farm at sunrise, candid, natural lighting
510
+ Portrait of a river rafting guide analyzing rock samples in a windswept canyon, candid, natural lighting
511
+ Portrait of a habitat designer assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
512
+ Portrait of a bridge inspector demonstrating calligraphy strokes on rice paper, candid, natural lighting
513
+ Portrait of a geologist recording ambient sounds inside an old forest, candid, natural lighting
514
+ Portrait of a woodworker shaping surfboards in a powdery shaping bay, candid, natural lighting
515
+ Portrait of a data visualizer testing experimental robots in a sleek lab, candid, natural lighting
516
+ Portrait of a kite maker blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
517
+ Portrait of a bridge acoustics analyst contemplating star charts under a dim observatory dome, candid, natural lighting
518
+ Portrait of a wildland firefighter packing honeycombs at a rustic apiary, candid, natural lighting
519
+ Portrait of a mechanical watchmaker assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
520
+ Portrait of an astronomer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
521
+ Portrait of a labyrinth gardener harvesting greens on a rooftop farm at sunrise, candid, natural lighting
522
+ Portrait of a violinist performing on a cobblestone street at dusk, candid, natural lighting
523
+ Portrait of a cheese monger glazing pottery beside a crackling kiln, candid, natural lighting
524
+ Portrait of a heritage conservator shaping surfboards in a powdery shaping bay, candid, natural lighting
525
+ Portrait of a rowboat builder performing on a cobblestone street at dusk, candid, natural lighting
526
+ Portrait of a bonsai cultivator cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
527
+ Portrait of a wildlife rehabilitator practicing ballet leaps on an empty theater stage, candid, natural lighting
528
+ Portrait of a street photographer demonstrating calligraphy strokes on rice paper, candid, natural lighting
529
+ Portrait of a librarian performing on a cobblestone street at dusk, candid, natural lighting
530
+ Portrait of a surfboard shaper guiding hikers along a misty ridge, candid, natural lighting
531
+ Portrait of an archaeologist assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
532
+ Portrait of a mechanical watchmaker assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
533
+ Portrait of a hydrologist glazing pottery beside a crackling kiln, candid, natural lighting
534
+ Portrait of a classical guitarist practicing ballet leaps on an empty theater stage, candid, natural lighting
535
+ Portrait of a stained‑glass restorer testing experimental robots in a sleek lab, candid, natural lighting
536
+ Portrait of a ballet dancer working in a sunlit studio cluttered with tools, candid, natural lighting
537
+ Portrait of a mechanical watchmaker restoring stained glass under soft cathedral light, candid, natural lighting
538
+ Portrait of a data visualizer packing honeycombs at a rustic apiary, candid, natural lighting
539
+ Portrait of a habitat designer working in a sunlit studio cluttered with tools, candid, natural lighting
540
+ Portrait of a bonsai cultivator cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
541
+ Portrait of a sustainable architect analyzing rock samples in a windswept canyon, candid, natural lighting
542
+ Portrait of an antique restorer analyzing rock samples in a windswept canyon, candid, natural lighting
543
+ Portrait of a habitat designer contemplating star charts under a dim observatory dome, candid, natural lighting
544
+ Portrait of an urban sketcher crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
545
+ Portrait of a bike messenger recording ambient sounds inside an old forest, candid, natural lighting
546
+ Portrait of a calligrapher restoring stained glass under soft cathedral light, candid, natural lighting
547
+ Portrait of a beekeeper recording ambient sounds inside an old forest, candid, natural lighting
548
+ Portrait of a sound Foley artist glazing pottery beside a crackling kiln, candid, natural lighting
549
+ Portrait of a renewable energy lobbyist glazing pottery beside a crackling kiln, candid, natural lighting
550
+ Portrait of a tattoo artist blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
551
+ Portrait of a marionette puppeteer recording ambient sounds inside an old forest, candid, natural lighting
552
+ Portrait of a hand‑pan musician harvesting greens on a rooftop farm at sunrise, candid, natural lighting
553
+ Portrait of a mountain guide painting a landscape en plein air beside a river, candid, natural lighting
554
+ Portrait of a materials engineer cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
555
+ Portrait of a stained‑glass restorer painting a landscape en plein air beside a river, candid, natural lighting
556
+ Portrait of a materials engineer assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
557
+ Portrait of a labyrinth gardener demonstrating calligraphy strokes on rice paper, candid, natural lighting
558
+ Portrait of a potter blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
559
+ Portrait of a prosthetics designer harvesting greens on a rooftop farm at sunrise, candid, natural lighting
560
+ Portrait of a wind surfer crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
561
+ Portrait of a hydrologist guiding hikers along a misty ridge, candid, natural lighting
562
+ Portrait of a hand‑pan musician assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
563
+ Portrait of a mountain guide harvesting greens on a rooftop farm at sunrise, candid, natural lighting
564
+ Portrait of a ceramic artist practicing ballet leaps on an empty theater stage, candid, natural lighting
565
+ Portrait of a ship pilot harvesting greens on a rooftop farm at sunrise, candid, natural lighting
566
+ Portrait of a rowboat builder analyzing rock samples in a windswept canyon, candid, natural lighting
567
+ Portrait of a restoration carpenter painting a landscape en plein air beside a river, candid, natural lighting
568
+ Portrait of a bridge acoustics analyst crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
569
+ Portrait of a shoemaker glazing pottery beside a crackling kiln, candid, natural lighting
570
+ Portrait of a hydrologist testing experimental robots in a sleek lab, candid, natural lighting
571
+ Portrait of a heritage conservator working in a sunlit studio cluttered with tools, candid, natural lighting
572
+ Portrait of a harbor master guiding hikers along a misty ridge, candid, natural lighting
573
+ Portrait of a field linguist guiding hikers along a misty ridge, candid, natural lighting
574
+ Portrait of a ceramic artist painting a landscape en plein air beside a river, candid, natural lighting
575
+ Portrait of a silversmith performing on a cobblestone street at dusk, candid, natural lighting
576
+ Portrait of a glass mosaic artist recording ambient sounds inside an old forest, candid, natural lighting
577
+ Portrait of a potter assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
578
+ Portrait of an ice climber analyzing rock samples in a windswept canyon, candid, natural lighting
579
+ Portrait of a bookbinder crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
580
+ Portrait of a marionette puppeteer restoring stained glass under soft cathedral light, candid, natural lighting
581
+ Portrait of a chef practicing ballet leaps on an empty theater stage, candid, natural lighting
582
+ Portrait of a marionette puppeteer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
583
+ Portrait of a search‑and‑rescue dog handler shaping surfboards in a powdery shaping bay, candid, natural lighting
584
+ Portrait of a field linguist blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
585
+ Portrait of a kite maker contemplating star charts under a dim observatory dome, candid, natural lighting
586
+ Portrait of a toy designer blowing molten glass into swirling shapes by furnace glow, candid, natural lighting
587
+ Portrait of a calligrapher shaping surfboards in a powdery shaping bay, candid, natural lighting
588
+ Portrait of a botanist glazing pottery beside a crackling kiln, candid, natural lighting
589
+ Portrait of a botanist contemplating star charts under a dim observatory dome, candid, natural lighting
590
+ Portrait of a gardener assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
591
+ Portrait of a wildland firefighter harvesting greens on a rooftop farm at sunrise, candid, natural lighting
592
+ Portrait of a gardener cataloging rare manuscripts in a quiet library alcove, candid, natural lighting
593
+ Portrait of a hydrologist assembling a mechanical clockwork device under magnifying lamp, candid, natural lighting
594
+ Portrait of a community baker restoring stained glass under soft cathedral light, candid, natural lighting
595
+ Portrait of a rowboat builder analyzing rock samples in a windswept canyon, candid, natural lighting
596
+ Portrait of a choral conductor glazing pottery beside a crackling kiln, candid, natural lighting
597
+ Portrait of a geologist glazing pottery beside a crackling kiln, candid, natural lighting
598
+ Portrait of a mechanical watchmaker crafting leather shoes in a workshop filled with wood shavings, candid, natural lighting
599
+ Portrait of a tattoo artist packing honeycombs at a rustic apiary, candid, natural lighting
600
+ Portrait of a botanist glazing pottery beside a crackling kiln, candid, natural lighting
601
+ High‑resolution vintage travel poster spelling 'Velocity' carved into towering glacier face, dramatic lighting
602
+ High‑resolution ice sculpture headline spelling 'Eclipse' carved into towering glacier face, dramatic lighting
603
+ High‑resolution glowing moss graffiti spelling 'Velocity' floating inside zero‑gravity space station, dramatic lighting
604
+ High‑resolution LED hologram billboard spelling 'Orbit' amid cherry‑blossom snowfall, dramatic lighting
605
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' amid cherry‑blossom snowfall, dramatic lighting
606
+ High‑resolution ice sculpture headline spelling 'Nebula' hovering above stormy ocean waves, dramatic lighting
607
+ High‑resolution LED hologram billboard spelling 'Odyssey' emerging from rolling morning fog, dramatic lighting
608
+ High‑resolution neon street sign spelling 'Voyage' over a bustling retro‑futuristic metropolis, dramatic lighting
609
+ High‑resolution sand dune calligraphy spelling 'Ethereal' carved into towering glacier face, dramatic lighting
610
+ High‑resolution neon street sign spelling 'Eclipse' suspended between skyscrapers at twilight, dramatic lighting
611
+ High‑resolution glowing moss graffiti spelling 'Quantum' projected onto ancient ruins at dusk, dramatic lighting
612
+ High‑resolution chalkboard typography sketch spelling 'Harmony' against star‑filled desert night sky, dramatic lighting
613
+ High‑resolution aerial crop‑art installation spelling 'Orbit' floating inside zero‑gravity space station, dramatic lighting
614
+ High‑resolution steampunk brass engraving spelling 'Nebula' emerging from rolling morning fog, dramatic lighting
615
+ High‑resolution glowing moss graffiti spelling 'Zenith' projected onto ancient ruins at dusk, dramatic lighting
616
+ High‑resolution glowing moss graffiti spelling 'Velocity' over a bustling retro‑futuristic metropolis, dramatic lighting
617
+ High‑resolution glowing moss graffiti spelling 'Odyssey' projected onto ancient ruins at dusk, dramatic lighting
618
+ High‑resolution neon street sign spelling 'Orbit' hovering above stormy ocean waves, dramatic lighting
619
+ High‑resolution neon street sign spelling 'Nebula' against star‑filled desert night sky, dramatic lighting
620
+ High‑resolution glowing moss graffiti spelling 'Synthesis' hovering above stormy ocean waves, dramatic lighting
621
+ High‑resolution LED hologram billboard spelling 'Nebula' carved into towering glacier face, dramatic lighting
622
+ High‑resolution chalkboard typography sketch spelling 'Quantum' amid cherry‑blossom snowfall, dramatic lighting
623
+ High‑resolution LED hologram billboard spelling 'Odyssey' suspended between skyscrapers at twilight, dramatic lighting
624
+ High‑resolution LED hologram billboard spelling 'Ethereal' over a bustling retro‑futuristic metropolis, dramatic lighting
625
+ High‑resolution ice sculpture headline spelling 'Eclipse' projected onto ancient ruins at dusk, dramatic lighting
626
+ High‑resolution vintage travel poster spelling 'Serendipity' floating inside zero‑gravity space station, dramatic lighting
627
+ High‑resolution steampunk brass engraving spelling 'Cascade' projected onto ancient ruins at dusk, dramatic lighting
628
+ High‑resolution skywritten message spelling 'Harmony' suspended between skyscrapers at twilight, dramatic lighting
629
+ High‑resolution ice sculpture headline spelling 'Synthesis' suspended between skyscrapers at twilight, dramatic lighting
630
+ High‑resolution neon street sign spelling 'Serendipity' against star‑filled desert night sky, dramatic lighting
631
+ High‑resolution steampunk brass engraving spelling 'Orbit' suspended between skyscrapers at twilight, dramatic lighting
632
+ High‑resolution chalkboard typography sketch spelling 'Serendipity' carved into towering glacier face, dramatic lighting
633
+ High‑resolution steampunk brass engraving spelling 'Equinox' reflected in rain‑soaked cobblestones, dramatic lighting
634
+ High‑resolution vintage travel poster spelling 'Harmony' projected onto ancient ruins at dusk, dramatic lighting
635
+ High‑resolution vintage travel poster spelling 'Voyage' floating inside zero‑gravity space station, dramatic lighting
636
+ High‑resolution aerial crop‑art installation spelling 'Quantum' against star‑filled desert night sky, dramatic lighting
637
+ High‑resolution glowing moss graffiti spelling 'Voyage' reflected in rain‑soaked cobblestones, dramatic lighting
638
+ High‑resolution steampunk brass engraving spelling 'Velocity' amid cherry‑blossom snowfall, dramatic lighting
639
+ High‑resolution steampunk brass engraving spelling 'Zenith' hovering above stormy ocean waves, dramatic lighting
640
+ High‑resolution ice sculpture headline spelling 'Orbit' over a bustling retro‑futuristic metropolis, dramatic lighting
641
+ High‑resolution chalkboard typography sketch spelling 'Harmony' reflected in rain‑soaked cobblestones, dramatic lighting
642
+ High‑resolution aerial crop‑art installation spelling 'Cascade' projected onto ancient ruins at dusk, dramatic lighting
643
+ High‑resolution neon street sign spelling 'Harmony' reflected in rain‑soaked cobblestones, dramatic lighting
644
+ High‑resolution vintage travel poster spelling 'Zenith' emerging from rolling morning fog, dramatic lighting
645
+ High‑resolution aerial crop‑art installation spelling 'Quantum' emerging from rolling morning fog, dramatic lighting
646
+ High‑resolution LED hologram billboard spelling 'Voyage' amid cherry‑blossom snowfall, dramatic lighting
647
+ High‑resolution vintage travel poster spelling 'Quantum' amid cherry‑blossom snowfall, dramatic lighting
648
+ High‑resolution chalkboard typography sketch spelling 'Momentum' projected onto ancient ruins at dusk, dramatic lighting
649
+ High‑resolution neon street sign spelling 'Zenith' projected onto ancient ruins at dusk, dramatic lighting
650
+ High‑resolution glowing moss graffiti spelling 'Voyage' reflected in rain‑soaked cobblestones, dramatic lighting
651
+ High‑resolution glowing moss graffiti spelling 'Odyssey' floating inside zero‑gravity space station, dramatic lighting
652
+ High‑resolution neon street sign spelling 'Serendipity' projected onto ancient ruins at dusk, dramatic lighting
653
+ High‑resolution sand dune calligraphy spelling 'Harmony' projected onto ancient ruins at dusk, dramatic lighting
654
+ High‑resolution neon street sign spelling 'Serendipity' carved into towering glacier face, dramatic lighting
655
+ High‑resolution chalkboard typography sketch spelling 'Momentum' floating inside zero‑gravity space station, dramatic lighting
656
+ High‑resolution sand dune calligraphy spelling 'Momentum' suspended between skyscrapers at twilight, dramatic lighting
657
+ High‑resolution ice sculpture headline spelling 'Eclipse' carved into towering glacier face, dramatic lighting
658
+ High‑resolution LED hologram billboard spelling 'Quantum' over a bustling retro‑futuristic metropolis, dramatic lighting
659
+ High‑resolution ice sculpture headline spelling 'Synthesis' over a bustling retro‑futuristic metropolis, dramatic lighting
660
+ High‑resolution glowing moss graffiti spelling 'Equinox' amid cherry‑blossom snowfall, dramatic lighting
661
+ High‑resolution ice sculpture headline spelling 'Voyage' against star‑filled desert night sky, dramatic lighting
662
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' carved into towering glacier face, dramatic lighting
663
+ High‑resolution neon street sign spelling 'Voyage' projected onto ancient ruins at dusk, dramatic lighting
664
+ High‑resolution LED hologram billboard spelling 'Equinox' reflected in rain‑soaked cobblestones, dramatic lighting
665
+ High‑resolution ice sculpture headline spelling 'Voyage' projected onto ancient ruins at dusk, dramatic lighting
666
+ High‑resolution steampunk brass engraving spelling 'Equinox' amid cherry‑blossom snowfall, dramatic lighting
667
+ High‑resolution ice sculpture headline spelling 'Momentum' hovering above stormy ocean waves, dramatic lighting
668
+ High‑resolution sand dune calligraphy spelling 'Velocity' projected onto ancient ruins at dusk, dramatic lighting
669
+ High‑resolution glowing moss graffiti spelling 'Momentum' over a bustling retro‑futuristic metropolis, dramatic lighting
670
+ High‑resolution LED hologram billboard spelling 'Nebula' amid cherry‑blossom snowfall, dramatic lighting
671
+ High‑resolution sand dune calligraphy spelling 'Momentum' carved into towering glacier face, dramatic lighting
672
+ High‑resolution chalkboard typography sketch spelling 'Odyssey' emerging from rolling morning fog, dramatic lighting
673
+ High‑resolution skywritten message spelling 'Ethereal' amid cherry‑blossom snowfall, dramatic lighting
674
+ High‑resolution skywritten message spelling 'Equinox' hovering above stormy ocean waves, dramatic lighting
675
+ High‑resolution neon street sign spelling 'Odyssey' projected onto ancient ruins at dusk, dramatic lighting
676
+ High‑resolution sand dune calligraphy spelling 'Equinox' floating inside zero‑gravity space station, dramatic lighting
677
+ High‑resolution aerial crop‑art installation spelling 'Eclipse' projected onto ancient ruins at dusk, dramatic lighting
678
+ High‑resolution aerial crop‑art installation spelling 'Quantum' hovering above stormy ocean waves, dramatic lighting
679
+ High‑resolution vintage travel poster spelling 'Serendipity' amid cherry‑blossom snowfall, dramatic lighting
680
+ High‑resolution sand dune calligraphy spelling 'Ethereal' against star‑filled desert night sky, dramatic lighting
681
+ High‑resolution ice sculpture headline spelling 'Ethereal' carved into towering glacier face, dramatic lighting
682
+ High‑resolution ice sculpture headline spelling 'Quantum' emerging from rolling morning fog, dramatic lighting
683
+ High‑resolution aerial crop‑art installation spelling 'Momentum' hovering above stormy ocean waves, dramatic lighting
684
+ High‑resolution vintage travel poster spelling 'Odyssey' against star‑filled desert night sky, dramatic lighting
685
+ High‑resolution ice sculpture headline spelling 'Cascade' amid cherry‑blossom snowfall, dramatic lighting
686
+ High‑resolution glowing moss graffiti spelling 'Harmony' emerging from rolling morning fog, dramatic lighting
687
+ High‑resolution glowing moss graffiti spelling 'Serendipity' amid cherry‑blossom snowfall, dramatic lighting
688
+ High‑resolution neon street sign spelling 'Orbit' emerging from rolling morning fog, dramatic lighting
689
+ High‑resolution skywritten message spelling 'Harmony' suspended between skyscrapers at twilight, dramatic lighting
690
+ High‑resolution sand dune calligraphy spelling 'Equinox' floating inside zero‑gravity space station, dramatic lighting
691
+ High‑resolution skywritten message spelling 'Cascade' carved into towering glacier face, dramatic lighting
692
+ High‑resolution glowing moss graffiti spelling 'Zenith' reflected in rain‑soaked cobblestones, dramatic lighting
693
+ High‑resolution glowing moss graffiti spelling 'Zenith' emerging from rolling morning fog, dramatic lighting
694
+ High‑resolution steampunk brass engraving spelling 'Orbit' emerging from rolling morning fog, dramatic lighting
695
+ High‑resolution LED hologram billboard spelling 'Ethereal' amid cherry‑blossom snowfall, dramatic lighting
696
+ High‑resolution LED hologram billboard spelling 'Zenith' suspended between skyscrapers at twilight, dramatic lighting
697
+ High‑resolution aerial crop‑art installation spelling 'Ethereal' projected onto ancient ruins at dusk, dramatic lighting
698
+ High‑resolution neon street sign spelling 'Ethereal' projected onto ancient ruins at dusk, dramatic lighting
699
+ High‑resolution LED hologram billboard spelling 'Quantum' floating inside zero‑gravity space station, dramatic lighting
700
+ High‑resolution glowing moss graffiti spelling 'Voyage' against star‑filled desert night sky, dramatic lighting
701
+ cinematic A cyberpunk cityscape at night, rain‑soaked streets and neon signs
702
+ vibrant A bowl of soup that looks like a monster knitted out of wool
703
+ dreamlike A cyberpunk cityscape at night, rain‑soaked streets and neon signs
704
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
705
+ vibrant A steampunk airship sailing above Victorian London at sunrise
706
+ high‑contrast A futuristic sports car parked in an ancient Roman forum, 8K render
707
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
708
+ cinematic A photorealistic image of an astronaut riding a horse on Mars
709
+ vibrant A steampunk airship sailing above Victorian London at sunrise
710
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
711
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
712
+ vibrant A steampunk airship sailing above Victorian London at sunrise
713
+ hyperdetailed A photorealistic image of an astronaut riding a horse on Mars
714
+ vibrant A majestic lion wearing a royal crown, oil on canvas
715
+ high‑contrast A steampunk airship sailing above Victorian London at sunrise
716
+ high‑contrast A cat made of galaxies, digital art
717
+ vibrant A bowl of soup that looks like a monster knitted out of wool
718
+ hyperdetailed A bowl of soup that looks like a monster knitted out of wool
719
+ vibrant A steampunk airship sailing above Victorian London at sunrise
720
+ vibrant A photorealistic image of an astronaut riding a horse on Mars
721
+ dreamlike A futuristic sports car parked in an ancient Roman forum, 8K render
722
+ dreamlike A steampunk airship sailing above Victorian London at sunrise
723
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
724
+ high‑contrast A cyberpunk cityscape at night, rain‑soaked streets and neon signs
725
+ ultrarealistic A cat made of galaxies, digital art
726
+ high‑contrast A steampunk airship sailing above Victorian London at sunrise
727
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
728
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
729
+ cinematic A steampunk airship sailing above Victorian London at sunrise
730
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
731
+ hyperdetailed A photorealistic image of an astronaut riding a horse on Mars
732
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
733
+ vibrant A bowl of soup that looks like a monster knitted out of wool
734
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
735
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
736
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
737
+ vibrant A painting of a fox in the style of Van Gogh
738
+ dreamlike A photorealistic image of an astronaut riding a horse on Mars
739
+ ultrarealistic A cat made of galaxies, digital art
740
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
741
+ dreamlike A painting of a fox in the style of Van Gogh
742
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
743
+ cinematic A bowl of soup that looks like a monster knitted out of wool
744
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
745
+ cinematic A cyberpunk cityscape at night, rain‑soaked streets and neon signs
746
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
747
+ high‑contrast A painting of a fox in the style of Van Gogh
748
+ vibrant A bowl of soup that looks like a monster knitted out of wool
749
+ hyperdetailed A majestic lion wearing a royal crown, oil on canvas
750
+ high‑contrast A cyberpunk cityscape at night, rain‑soaked streets and neon signs
751
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
752
+ cinematic A painting of a fox in the style of Van Gogh
753
+ ultrarealistic A majestic lion wearing a royal crown, oil on canvas
754
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
755
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
756
+ vibrant A steampunk airship sailing above Victorian London at sunrise
757
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
758
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
759
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
760
+ dreamlike A majestic lion wearing a royal crown, oil on canvas
761
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
762
+ ultrarealistic An ice cream cone melting into a desert landscape, surrealism
763
+ high‑contrast A futuristic sports car parked in an ancient Roman forum, 8K render
764
+ dreamlike A steampunk airship sailing above Victorian London at sunrise
765
+ ultrarealistic A futuristic sports car parked in an ancient Roman forum, 8K render
766
+ dreamlike A cat made of galaxies, digital art
767
+ ultrarealistic A futuristic sports car parked in an ancient Roman forum, 8K render
768
+ vibrant A steampunk airship sailing above Victorian London at sunrise
769
+ vibrant A painting of a fox in the style of Van Gogh
770
+ dreamlike A futuristic sports car parked in an ancient Roman forum, 8K render
771
+ dreamlike A microscopic close‑up of a snowflake shaped like a cathedral
772
+ vibrant A cyberpunk cityscape at night, rain‑soaked streets and neon signs
773
+ dreamlike A cyberpunk cityscape at night, rain‑soaked streets and neon signs
774
+ dreamlike A cat made of galaxies, digital art
775
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
776
+ vibrant A cat made of galaxies, digital art
777
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
778
+ hyperdetailed An ice cream cone melting into a desert landscape, surrealism
779
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
780
+ hyperdetailed A painting of a fox in the style of Van Gogh
781
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
782
+ high‑contrast A microscopic close‑up of a snowflake shaped like a cathedral
783
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
784
+ hyperdetailed A steampunk airship sailing above Victorian London at sunrise
785
+ vibrant A photorealistic image of an astronaut riding a horse on Mars
786
+ cinematic A futuristic sports car parked in an ancient Roman forum, 8K render
787
+ cinematic A microscopic close‑up of a snowflake shaped like a cathedral
788
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
789
+ hyperdetailed A futuristic sports car parked in an ancient Roman forum, 8K render
790
+ dreamlike An ice cream cone melting into a desert landscape, surrealism
791
+ high‑contrast A majestic lion wearing a royal crown, oil on canvas
792
+ high‑contrast A photorealistic image of an astronaut riding a horse on Mars
793
+ ultrarealistic A bowl of soup that looks like a monster knitted out of wool
794
+ dreamlike A bowl of soup that looks like a monster knitted out of wool
795
+ vibrant A steampunk airship sailing above Victorian London at sunrise
796
+ cinematic A cat made of galaxies, digital art
797
+ dreamlike A microscopic close‑up of a snowflake shaped like a cathedral
798
+ vibrant A microscopic close‑up of a snowflake shaped like a cathedral
799
+ ultrarealistic A photorealistic image of an astronaut riding a horse on Mars
800
+ cinematic A majestic lion wearing a royal crown, oil on canvas
configs/captions/example_prompts5.txt ADDED
@@ -0,0 +1,100 @@
1
+ bonsai dragon coiled cup, dramatic top‑down flow, rim lighting, pastel watercolor
2
+ robot barista alley, portrait orientation emphasizing height, long‑exposure light trails, lo‑fi pixel art
3
+ time portal ancient oak, negative space upper third, long‑exposure light trails, sepia ink
4
+ glass terrarium storm, stacked narrative layers, volumetric god‑rays, hyperdetailed CGI
5
+ rain‑soaked neon street, dramatic top‑down flow, rim lighting, photoreal 8K
6
+ lantern festival bamboo, elongated frame guiding eye upward, mist drifting mid‑scene, sepia ink
7
+ library tower floating ladders, portrait orientation emphasizing height, soft bokeh foreground, photoreal 8K
8
+ towering waterfall carving rainbow mist, portrait orientation emphasizing height, mist drifting mid‑scene, gouache storybook
9
+ towering waterfall carving rainbow mist, stacked narrative layers, mist drifting mid‑scene, sepia ink
10
+ rain‑soaked neon street, negative space upper third, volumetric god‑rays, gouache storybook
11
+ phoenix of origami flame, stacked narrative layers, long‑exposure light trails, gouache storybook
12
+ phoenix of origami flame, elongated frame guiding eye upward, rain‑kissed reflections, sepia ink
13
+ lantern festival bamboo, elongated frame guiding eye upward, volumetric god‑rays, hyperdetailed CGI
14
+ sakura tree train car, negative space upper third, volumetric god‑rays, neon vaporwave
15
+ mirror lake inverted castle, portrait orientation emphasizing height, mist drifting mid‑scene, Baroque oil
16
+ frozen wave unveiling city, stacked narrative layers, dynamic lens flare, sepia ink
17
+ mirror lake inverted castle, elongated frame guiding eye upward, dynamic lens flare, lo‑fi pixel art
18
+ frozen wave unveiling city, stacked narrative layers, dynamic lens flare, photoreal 8K
19
+ time portal ancient oak, towering vertical composition, mist drifting mid‑scene, hyperdetailed CGI
20
+ rain‑soaked neon street, negative space upper third, rain‑kissed reflections, Baroque oil
21
+ sakura tree train car, negative space upper third, rim lighting, Baroque oil
22
+ steampunk aviator on skybridge, stacked narrative layers, mist drifting mid‑scene, gouache storybook
23
+ glass terrarium storm, towering vertical composition, dynamic lens flare, hyperdetailed CGI
24
+ celestial whale in clouds, stacked narrative layers, mist drifting mid‑scene, photoreal 8K
25
+ time portal ancient oak, portrait orientation emphasizing height, soft bokeh foreground, ultrawide cinema
26
+ library tower floating ladders, dramatic top‑down flow, long‑exposure light trails, neon vaporwave
27
+ lighthouse in aurora night, dramatic top‑down flow, rain‑kissed reflections, photoreal 8K
28
+ spiral redwood staircase, dramatic top‑down flow, rim lighting, gouache storybook
29
+ phoenix of origami flame, stacked narrative layers, dynamic lens flare, Baroque oil
30
+ library tower floating ladders, elongated frame guiding eye upward, long‑exposure light trails, voxel isometric
31
+ sakura tree train car, dramatic top‑down flow, mist drifting mid‑scene, photoreal 8K
32
+ subway car drifting space, negative space upper third, dynamic lens flare, neon vaporwave
33
+ frozen wave unveiling city, dramatic top‑down flow, rain‑kissed reflections, hyperdetailed CGI
34
+ lone samurai beneath eclipse, stacked narrative layers, rim lighting, Baroque oil
35
+ library tower floating ladders, dramatic top‑down flow, rain‑kissed reflections, gouache storybook
36
+ rain‑soaked neon street, stacked narrative layers, rim lighting, Baroque oil
37
+ rain‑soaked neon street, elongated frame guiding eye upward, volumetric god‑rays, gouache storybook
38
+ sakura tree train car, dramatic top‑down flow, rim lighting, ultrawide cinema
39
+ clock tower glowing vines, stacked narrative layers, dynamic lens flare, photoreal 8K
40
+ phoenix of origami flame, negative space upper third, rain‑kissed reflections, gouache storybook
41
+ library tower floating ladders, towering vertical composition, rim lighting, pastel watercolor
42
+ sakura tree train car, elongated frame guiding eye upward, volumetric god‑rays, pastel watercolor
43
+ steampunk aviator on skybridge, portrait orientation emphasizing height, long‑exposure light trails, photoreal 8K
44
+ robot barista alley, towering vertical composition, soft bokeh foreground, pastel watercolor
45
+ paper sailboat cloud sea, towering vertical composition, soft bokeh foreground, voxel isometric
46
+ steampunk aviator on skybridge, towering vertical composition, rain‑kissed reflections, voxel isometric
47
+ frozen wave unveiling city, towering vertical composition, mist drifting mid‑scene, voxel isometric
48
+ mirror lake inverted castle, stacked narrative layers, mist drifting mid‑scene, Baroque oil
49
+ time portal ancient oak, stacked narrative layers, volumetric god‑rays, photoreal 8K
50
+ mirror lake inverted castle, stacked narrative layers, soft bokeh foreground, hyperdetailed CGI
51
+ spiral redwood staircase, elongated frame guiding eye upward, volumetric god‑rays, voxel isometric
52
+ frozen wave unveiling city, towering vertical composition, dynamic lens flare, hyperdetailed CGI
53
+ towering waterfall carving rainbow mist, towering vertical composition, volumetric god‑rays, hyperdetailed CGI
54
+ bonsai dragon coiled cup, portrait orientation emphasizing height, mist drifting mid‑scene, neon vaporwave
55
+ lantern festival bamboo, negative space upper third, dynamic lens flare, hyperdetailed CGI
56
+ sakura tree train car, towering vertical composition, mist drifting mid‑scene, gouache storybook
57
+ lantern festival bamboo, towering vertical composition, dynamic lens flare, photoreal 8K
58
+ subway car drifting space, negative space upper third, dynamic lens flare, Baroque oil
59
+ rain‑soaked neon street, portrait orientation emphasizing height, mist drifting mid‑scene, voxel isometric
60
+ bonsai dragon coiled cup, towering vertical composition, volumetric god‑rays, voxel isometric
61
+ steampunk aviator on skybridge, portrait orientation emphasizing height, soft bokeh foreground, voxel isometric
62
+ lighthouse in aurora night, stacked narrative layers, dynamic lens flare, voxel isometric
63
+ bonsai dragon coiled cup, portrait orientation emphasizing height, dynamic lens flare, photoreal 8K
64
+ library tower floating ladders, towering vertical composition, dynamic lens flare, lo‑fi pixel art
65
+ lighthouse in aurora night, negative space upper third, dynamic lens flare, gouache storybook
66
+ lone samurai beneath eclipse, stacked narrative layers, rain‑kissed reflections, Baroque oil
67
+ glass terrarium storm, towering vertical composition, dynamic lens flare, hyperdetailed CGI
68
+ library tower floating ladders, towering vertical composition, long‑exposure light trails, pastel watercolor
69
+ subway car drifting space, elongated frame guiding eye upward, dynamic lens flare, lo‑fi pixel art
70
+ towering waterfall carving rainbow mist, elongated frame guiding eye upward, dynamic lens flare, hyperdetailed CGI
71
+ mirror lake inverted castle, portrait orientation emphasizing height, rain‑kissed reflections, sepia ink
72
+ clock tower glowing vines, dramatic top‑down flow, rim lighting, neon vaporwave
73
+ clock tower glowing vines, stacked narrative layers, rain‑kissed reflections, voxel isometric
74
+ glass terrarium storm, stacked narrative layers, dynamic lens flare, Baroque oil
75
+ sakura tree train car, stacked narrative layers, rim lighting, gouache storybook
76
+ phoenix of origami flame, portrait orientation emphasizing height, mist drifting mid‑scene, photoreal 8K
77
+ steampunk aviator on skybridge, towering vertical composition, rain‑kissed reflections, neon vaporwave
78
+ sakura tree train car, towering vertical composition, soft bokeh foreground, Baroque oil
79
+ paper sailboat cloud sea, dramatic top‑down flow, rim lighting, voxel isometric
80
+ celestial whale in clouds, elongated frame guiding eye upward, soft bokeh foreground, pastel watercolor
81
+ mirror lake inverted castle, towering vertical composition, rim lighting, ultrawide cinema
82
+ time portal ancient oak, towering vertical composition, dynamic lens flare, gouache storybook
83
+ glass terrarium storm, elongated frame guiding eye upward, volumetric god‑rays, hyperdetailed CGI
84
+ steampunk aviator on skybridge, stacked narrative layers, rim lighting, neon vaporwave
85
+ bonsai dragon coiled cup, elongated frame guiding eye upward, volumetric god‑rays, ultrawide cinema
86
+ library tower floating ladders, stacked narrative layers, volumetric god‑rays, ultrawide cinema
87
+ sakura tree train car, towering vertical composition, dynamic lens flare, pastel watercolor
88
+ robot barista alley, elongated frame guiding eye upward, soft bokeh foreground, ultrawide cinema
89
+ sakura tree train car, portrait orientation emphasizing height, long‑exposure light trails, voxel isometric
90
+ glass terrarium storm, portrait orientation emphasizing height, rim lighting, voxel isometric
91
+ towering waterfall carving rainbow mist, dramatic top‑down flow, mist drifting mid‑scene, photoreal 8K
92
+ steampunk aviator on skybridge, stacked narrative layers, volumetric god‑rays, photoreal 8K
93
+ spiral redwood staircase, towering vertical composition, rim lighting, ultrawide cinema
94
+ rain‑soaked neon street, towering vertical composition, volumetric god‑rays, lo‑fi pixel art
95
+ lighthouse in aurora night, negative space upper third, rain‑kissed reflections, neon vaporwave
96
+ glass terrarium storm, stacked narrative layers, long‑exposure light trails, gouache storybook
97
+ mirror lake inverted castle, dramatic top‑down flow, soft bokeh foreground, lo‑fi pixel art
98
+ clock tower glowing vines, portrait orientation emphasizing height, rain‑kissed reflections, Baroque oil
99
+ lone samurai beneath eclipse, portrait orientation emphasizing height, volumetric god‑rays, voxel isometric
100
+ paper sailboat cloud sea, elongated frame guiding eye upward, long‑exposure light trails, hyperdetailed CGI
configs/captions/example_prompts6.txt ADDED
@@ -0,0 +1,250 @@
1
+ "digital art of a beautiful tiger pokemon under an apple tree, cartoon style,Matte Painting,Magic Realism,Bright colors,hyper quality,high detail,high resolution, --video --s 750 --v 6.0 --ar 1:2"
2
+ In the image, a corgi dog is wearing a straw hat and is laying on a fluffy rug. The dog's tongue is sticking out and it appears to be happy. There are two pumpkins and a basket of leaves nearby, indicating that the scene takes place during the fall season. The background features a Christmas tree, further suggesting the holiday atmosphere. The image has a warm and cozy feel to it, with the dog looking adorable in its hat and the pumpkins adding a festive touch.
3
+ An Arctic scene featuring a polar bear and her cubs walking across ice floes under the northern lights, the sky illuminated with vibrant colors of green and purple, stars twinkling above, reflections on the icy water, hyper-realistic, high resolution.
4
+ A daisy flower made entirely of origami paper, placed against a minimalist background, showcasing the folds and craftsmanship, high-resolution, studio lighting.
5
+ A rustic kitchen table set with freshly baked bread, an assortment of cheeses, a bowl of ripe fruit, and a bouquet of lavender, sunlight streaming through a nearby window casting soft shadows, detailed still life painting with warm tones and textures.
6
+ A panda eating bamboo in a lush green forest, with soft sunlight filtering through the leaves, realistic painting.
7
+ A close-up of a dewdrop-covered spider web glistening in the morning light, intricate patterns, macro photography.
8
+ An assortment of colorful gemstones scattered on a reflective surface, each facet catching the light, macro photography.
9
+ A red fox curled up asleep in a snowy woodland clearing, with delicate snowflakes falling gently around it, watercolor style.
10
+ A futuristic robotic dragon made of metallic scales and glowing blue eyes, perched on a rocky cliff, digital art, high resolution.
11
+ A traditional Japanese pagoda nestled among cherry blossom trees in full bloom, petals gently falling, a serene pond reflecting the structure, mountains in the background, watercolor style, high resolution.
12
+ A detailed macro photograph of a honeybee collecting pollen from a sunflower, with the texture of the petals and the bee's wings clearly visible, tiny particles of pollen floating in the air, background softly blurred to emphasize the subject, high-definition, natural lighting.
13
+ A close-up of a beautifully crafted violin resting on a sheet of classical music, with light reflecting off its polished wooden surface, musical notes seeming to float off the page, and a single red rose lying beside it, artistic illustration with warm tones.
14
+ A glass terrarium containing a miniature rainforest ecosystem, complete with tiny waterfalls, exotic plants, small animals like frogs and butterflies, the glass reflecting light from a nearby window, droplets of condensation visible on the inside, photorealistic rendering.
15
+ A pair of dolphins leaping out of the ocean at sunset, with splashes of water frozen in mid-air, vibrant colors, hyper-realistic.
16
+ A majestic eagle soaring above snow-capped mountains, with wings spread wide against a clear blue sky, realistic painting.
17
+ An elegant glass vase filled with blooming cherry blossoms, placed on a minimalist wooden table, soft natural lighting, photorealistic.
18
+ An ancient, majestic tree in the heart of an enchanted forest, its luminescent leaves glowing in shades of blue and purple under a starry night sky, surrounded by floating wisps of light, digital art, high resolution.
19
+ A tranquil alpine lake surrounded by snow-capped mountains, with the aurora borealis dancing across the night sky and its vibrant colors reflected in the still waters below, ultra-high-definition.
20
+ A giant tortoise slowly making its way across a misty meadow at dawn, with dew-covered grass and wildflowers in soft pastel colors, mountains in the background shrouded in fog, a few butterflies fluttering nearby, photorealistic, high resolution.
21
+ A fantastical airship sailing through the clouds above a steampunk city, with gears and propellers visible, dirigibles floating nearby, intricate architectural details on the buildings below, sunset sky with shades of pink and orange, digital art, high resolution.
22
+ An African savannah scene featuring a herd of elephants walking towards a watering hole under a vibrant sunset sky, acacia trees silhouetted against the horizon, and distant silhouettes of giraffes grazing, cinematic lighting, ultra-wide shot.
23
+ An antique compass lying on an old map, with a magnifying glass revealing detailed cartography, warm sepia tones, vintage style.
24
+ A close-up of a vintage pocket watch with intricate gears visible through a transparent face, steampunk style, highly detailed illustration.
25
+ A majestic Bengal tiger walking through a dense jungle with sunlight filtering through the canopy, its orange and black stripes contrasting vividly against the lush green foliage, birds perched on nearby branches, and exotic flowers blooming around, highly detailed digital painting, ultra-high-definition.
26
+ A serene Zen garden with carefully raked sand patterns, smooth stones arranged thoughtfully, a small bonsai tree at the center, surrounded by bamboo fencing, soft lantern light illuminating the scene at dusk, a gentle stream flowing nearby, minimalist style, photorealistic.
27
+ A fantasy-themed portrait of a female elf with golden hair and violet eyes, her attire shimmering with iridescent colors, set in an enchanted forest. 8K, best quality, fine details.
28
+ pumpkins, autumn sunset in the old village, cobblestone houses, streets, plants, flowers, entrance, realistic, stunningly beautiful
29
+ "Highly detailed mysterious egyptian (sphynx cat), skindentation:1.2, bright eyes, ancient egypt pyramid background, photorealistic, (hyper-realistic:1.2), cinematic, masterpiece:1.1, cinematic lighting"
30
+ "vw bus, canvas art, abstract art printing, in the style of brian mashburn, light red and light brown, theo prins, charming character illustrations, pierre pellegrini, vintage cut-and-paste, rusty debris --ar 73:92 --stylize 750 --v 6"
31
+ painterly style, seductive female League of legends Jinx character fighting at war, raging, crazy smile, crazy eyes, rocket lancher, guns, crazy face expression, character design, body is adorned with glowing golden runes, intense green aura around her, body dynamic epic action pose, intricate, highly detailed, epic and dynamic composition, dynamic angle, intricate details, multicolor explosion, blur effect, sharp focus, uhd, hdr, colorful shot, stormy weather, tons of flying debris around her, dark city background, modifier=CarnageStyle, color=blood_red, intensity=1.6
32
+ A charismatic chef in a bustling kitchen, his apron dusted with flour, smiling as he presents a beautifully prepared dish. 8K, hyper-realistic, cinematic, post-production.
33
+ A young adventurer with tousled hair and bright eyes, wearing a leather jacket and a backpack, ready to explore distant lands. 8K, hyper-realistic, cinematic, post-production.
34
+ "A watercolor painting of a vibrant flower field in spring, with a rainbow of blossoms under a bright blue sky. 8K, best quality, fine details.",
35
+ "digital art of a beautiful tiger pokemon under an apple tree, cartoon style,Matte Painting,Magic Realism,Bright colors,hyper quality,high detail,high resolution, --video --s 750 --v 6.0 --ar 1:2"
36
+ "painterly style, Goku fighting at war, raging, blue hair, character design, body is adorned with glowing golden runes, yellow aura around him, body dynamic epic action pose, intricate, highly detailed, epic and dynamic composition, dynamic angle, intricate details, multicolor explosion, blur effect, sharp focus, uhd, hdr, colorful shot, stormy weather, tons of flying debris around him, dark city background, modifier=CarnageStyle, color=blood_red, intensity=1.6"
37
+ A stunning steampunk city with towering skyscrapers and intricate clockwork mechanisms, gears and pistons move in a complex symphony, steam billows from chimneys, airships navigate the bustling skylanes, a vibrant metropolis
38
+ "Samurai looks at the enemy, stands after the battle, fear and horror on his face, tired and beaten, sand on his face mixed with sweat, an atmosphere of darkness and horror, hyper realistic photo, In post - production, enhance the details, sharpness, and contrast to achieve the hyper - realistic effect"
39
+ A portrait of an elemental entity with strong rim lighting and intricate details, painted digitally by Alvaro Castagnet, Peter Mohrbacher, and Dan Mumford
40
+ "A regal female portrait with an ornate headdress decorated with colorful gemstones and feathers, her robes rich with intricate designs and bright hues. 8K, best quality, fine details.",
41
+ "A detailed painting of Atlantis by multiple artists, featuring intricate detailing and vibrant colors.",
42
+ "A landscape featuring mountains, a valley, sunset light, wildlife and a gorilla, reminiscent of Bob Ross's artwork."
43
+ a space elevator, cinematic scifi art
44
+ a hole in the floor of my bathroom with small gremlins living in it
45
+ an origami pig on fire in the middle of a dark room with a pentagram on the floor
46
+ a small office made out of car parts
47
+ heat death of the universe, line art
48
+ A car made out of vegetables.
49
+ A cheeseburger surfing the vibe wave at night
50
+ An entire universe inside a bottle
51
+ A bioluminescent rainforest at night, viewed from a canopy walkway, hyper-real, crisp moonlight filtering through mist
52
+ Cross-section of an imaginary geode revealing swirling nebula-like mineral layers, macro photography style
53
+ Futuristic library carved into a glacier, warm interior lighting contrasting icy blue walls, isometric view
54
+ Surreal desert with floating sandstone monoliths casting long shadows at golden hour, ultra-wide lens
55
+ Vintage watercolor map of an archipelago shaped like musical notes, illustrated cartography
56
+ Cyberpunk alley drenched in neon rain, reflective puddles, no characters, cinematic atmosphere
57
+ Close-up of a hummingbird made of fractal glass shards hovering near a sapphire flower, 8K detail
58
+ Orbiting observatory above a gas-giant planet, rings stretching across star-filled sky, photoreal
59
+ Abstract kinetic sculpture of twisting ribbons suspended in a white cube gallery, studio lighting
60
+ Fog-covered pine forest with a single crimson tree in the center, muted color palette
61
+ Time-lapse style composite of a tidal pool from dawn to dusk, stitched into one frame
62
+ Isometric diagram of an autonomous greenhouse on Mars, annotated schematics aesthetic
63
+ Paper-cut illustration of a city inside a whale, layered depth, soft muted tones
64
+ Steampunk airship port at sunrise, brass machinery glinting, painterly brushwork
65
+ Minimalist ink wash painting of a solitary mountain peak emerging from clouds
66
+ Ultraviolet microscope image of an invented pollen grain with crystalline spikes
67
+ Retro 8-bit pixel art scene of a cozy lakeside cabin under meteor shower
68
+ Low-poly 3-D render of a coral reef teeming with geometric fish shapes
69
+ Aerial view of a terraced rice field arranged in a perfect Fibonacci spiral
70
+ Schematic cutaway of a clockwork heart pumping luminous liquid, technical drawing style
71
+ Long-exposure night photograph of fireflies tracing mathematical Lissajous curves
72
+ Gothic cathedral interior built entirely from translucent ice, soft subsurface scattering
73
+ Top-down macro of latte foam forming a fractal coastline pattern
74
+ Astronomical illustration of a triple-sunset over an ocean on an exoplanet
75
+ Ink-on-parchment concept art of a floating pagoda tethered by chains to mountain peaks
76
+ Cubist still life of fruit and musical instruments, vivid complementary colors
77
+ Moody black-and-white film photograph of rain on a lonely train platform, 1950s era
78
+ Hyperreal chrome koi fish swimming through clouds, sky as water
79
+ Floral mandala assembled from autumn leaves, top-down symmetric composition
80
+ Concept art of an underground crystal cavern illuminated by bioluminescent fungi
81
+ Sci-fi control room with holographic interfaces projected into fog, teal-orange palette
82
+ Minimal claymation style landscape with rolling pastel hills and giant daisies
83
+ Polaroid aesthetic photo of a roadside diner at twilight, neon sign flickering
84
+ Vector infographic showing the life cycle of a fictional winged seed, flat design
85
+ Dream-like seascape where waves morph into galloping horses, double-exposure effect
86
+ Art-deco poster of an interstellar passenger train speeding past moons
87
+ Cross-section illustration of a layered cake that resembles planetary strata
88
+ Infrared photograph of a mangrove swamp, foliage appearing white, water inky black
89
+ Whimsical pencil sketch of a tea party with levitating porcelain, soft shading
90
+ Architectural render of a zero-gravity museum with exhibits floating mid-air
91
+ Oil painting of a stormy sky splitting into vortices shaped like musical clefs
92
+ Isometric cutaway of an underground dwarf forge with molten rivers, game concept art
93
+ Frosted glass terrarium containing a miniature thunderstorm, studio backdrop
94
+ Minimalist cyanotype print of fern leaves arranged in a golden ratio spiral
95
+ Fantasy moonlit waterfall cascading upward into the sky, long-exposure feel
96
+ Retro-futuristic poster of a solar-powered desert rover kicking up red dust
97
+ Double helix made of blooming flowers against a white background, high-key macro
98
+ Top-down shot of a labyrinth garden trimmed into Escher-like impossible geometry
99
+ Sci-fi vending machine selling bottled starlight, hologram price tags
100
+ Watercolor portrait of an abstract humanoid with translucent skin revealing galaxies
101
+ Silhouette of a lone tree on an island reflected perfectly in still water, dusk gradient
102
+ Close-up macro of snowflakes arranged to form a Mandelbrot set
103
+ Ink drawing of a koi pond where fish tails morph into swirling calligraphy strokes
104
+ Hyperreal food photography of a floating stack of pancakes with gravity-defying syrup
105
+ Electroluminescent circuit board cityscape at night, streets as glowing traces
106
+ Surreal scene of books sprouting wings and migrating across a sunset sky
107
+ Low-angle view of a colossal sandstone arch framing a star-filled Milky Way
108
+ Cross-section of a mechanical sunflower tracking a miniature artificial sun
109
+ Art-nouveau travel poster for an imaginary cloud kingdom, flowing line art
110
+ Graph-paper style blueprint of a perpetual-motion water wheel, annotated
111
+ Futuristic zen garden with levitating raked sand and floating bonsai stones
112
+ Photoreal underwater city with glass domes linked by glowing tunnels
113
+ Tilt-shift photo of a festival lantern parade through narrow cobblestone streets
114
+ Neon wireframe landscape reminiscent of 1980s synthwave, grid fading to horizon
115
+ Paper-quilling style illustration of a comet bursting into colorful spirals
116
+ Panorama of a crimson aurora over icy mountains, ultra-wide 16:9 aspect
117
+ Transparent holographic chess set floating in zero-gravity, pieces mid-game
118
+ Pointillist painting of a bustling open-air market under summer sun
119
+ Infrared thermal view of a volcanic eruption, palette mapped to rainbow hues
120
+ Detail shot of clock gears where each tooth is a tiny stairway with lanterns
121
+ Minimal line-art poster depicting the evolution of flight from feathers to starships
122
+ Glowing jellyfish drifting through a misty pine forest at dawn, photoreal composite
123
+ Art-studio workbench cluttered with vintage robotics schematics and metal parts
124
+ Monochrome charcoal drawing of a lighthouse beam piercing heavy fog
125
+ Isometric voxel art of a floating garden island with waterfalls spilling into void
126
+ Surreal split-scene: left half winter forest, right half summer meadow, seamless blend
127
+ Retro postage stamp design celebrating a fictional eclipse festival
128
+ Hyperdetailed ceramic mosaic of a phoenix rising, mediterranean style
129
+ Sci-fi medical lab growing crystalline plants in suspended nutrient orbs
130
+ High-speed photo of colored ink clouds colliding underwater, symmetrical composition
131
+ Anamorphic street art illusion of a chasm opening in a city square
132
+ Timber-frame hobbit-style cottage under giant sunflowers, golden afternoon
133
+ Futuristic monorail weaving through skyscrapers wrapped in vertical gardens
134
+ Scientific render of a transparent hypercube containing swirling plasma
135
+ Sepia photograph of an abandoned observatory overtaken by vines
136
+ Concept piece: biomechanical dragon skeleton displayed in a museum hall
137
+ Minimal gradient poster of a single droplet rippling concentric neon rings
138
+ Chalkboard schematic showing stages of a do-it-yourself constellation projector
139
+ Digital glitch art of a city skyline melting into cascading pixels
140
+ Aerial drone shot of rice paddies shaped like circuitry pathways
141
+ Macro of soap film displaying shifting rainbow interference patterns
142
+ Oil-on-canvas seascape where waves are brush strokes of pure geometry
143
+ Tilted perspective of a spiral staircase made entirely of stained glass
144
+ Hyperreal 3-D render of a desert mirage city shimmering above dunes
145
+ Vectorized infographic of wind turbine anatomy with exploded components
146
+ Snow-covered bamboo forest under lantern light, gentle falling flakes
147
+ Abstract generative art of golden particles forming a torus knot in black void
148
+ Stop-motion clay diorama of a miniature volcano erupting sprinkles
149
+ Ultrawide cinematic shot of two converging thunderstorms over open ocean
150
+ Graphite sketch of intertwined river deltas resembling tree roots, top-down view
151
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
152
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, modeled via procedural geometry emphasising topological elegance and material translucency. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
153
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, with meticulously ray‑traced reflections and subsurface scattering. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
154
+ Inspect a surreal mise‑en‑scène portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
155
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
156
+ Witness a hypnotic tableau displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
157
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, with meticulously ray‑traced reflections and subsurface scattering. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
158
+ In this speculative panorama depicting desert caravans navigating rivers of liquid glass during twin sunsets, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
159
+ Encounter a novel conceptual artwork portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
160
+ In this speculative panorama displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
161
+ Witness a hypnotic tableau portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
162
+ In this speculative panorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
163
+ Inspect a surreal mise‑en‑scène showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
164
+ Explore a visionary illustration capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
165
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
166
+ Inspect a surreal mise‑en‑scène displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
167
+ Survey an expansive environment portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
168
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
169
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
170
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, painted with thousand‑stroke impasto textures evoking tactile motion. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
171
+ Observe an intricate composition showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, modeled via procedural geometry emphasising topological elegance and material translucency. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
172
+ In this speculative panorama capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, modeled via procedural geometry emphasising topological elegance and material translucency. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
173
+ Behold a cinematic vignette displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
174
+ Observe an intricate composition displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
175
+ Consider an evocative dreamscape portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, in photorealistic 32‑bit colour depth. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
176
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, captured by a drone‑level perspective employing long‑exposure star trails. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
177
+ Observe an intricate composition portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
178
+ Witness a hypnotic tableau showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
179
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
180
+ Behold a cinematic vignette featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Spatial recursion visually articulates mathematical infinity within finite representational space.
181
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Spatial recursion visually articulates mathematical infinity within finite representational space.
182
+ Observe an intricate composition portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Spatial recursion visually articulates mathematical infinity within finite representational space.
183
+ Contemplate a hyper‑detailed diorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, modeled via procedural geometry emphasising topological elegance and material translucency. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
184
+ Survey an expansive environment depicting desert caravans navigating rivers of liquid glass during twin sunsets, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
185
+ Behold a cinematic vignette rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
186
+ Consider an evocative dreamscape displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
187
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
188
+ Witness a hypnotic tableau where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
189
+ Encounter a novel conceptual artwork capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
190
+ Contemplate a hyper‑detailed diorama displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, captured by a drone‑level perspective employing long‑exposure star trails. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
191
+ Behold a cinematic vignette where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, captured by a drone‑level perspective employing long‑exposure star trails. Spatial recursion visually articulates mathematical infinity within finite representational space.
192
+ Behold a cinematic vignette where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
193
+ Consider an evocative dreamscape showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, in photorealistic 32‑bit colour depth. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
194
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
195
+ Behold a cinematic vignette detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, modeled via procedural geometry emphasising topological elegance and material translucency. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
196
+ In this speculative panorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
197
+ Explore a visionary illustration where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
198
+ Encounter a novel conceptual artwork detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, in photorealistic 32‑bit colour depth. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
199
+ Explore a visionary illustration displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
200
+ In this speculative panorama illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
201
+ Inspect a surreal mise‑en‑scène portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, painted with thousand‑stroke impasto textures evoking tactile motion. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
202
+ Encounter a novel conceptual artwork illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
203
+ Contemplate a hyper‑detailed diorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
204
+ Observe an intricate composition capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
205
+ Contemplate a hyper‑detailed diorama portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
206
+ Contemplate a hyper‑detailed diorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
207
+ Witness a hypnotic tableau featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, with meticulously ray‑traced reflections and subsurface scattering. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
208
+ Explore a visionary illustration portraying nomadic sky‑gardens drifting around thunderous cloud cathedrals in perpetual twilight, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
209
+ Explore a visionary illustration rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, painted with thousand‑stroke impasto textures evoking tactile motion. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
210
+ Explore a visionary illustration featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
211
+ Survey an expansive environment rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
212
+ Observe an intricate composition illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
213
+ Observe an intricate composition depicting desert caravans navigating rivers of liquid glass during twin sunsets, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
214
+ Contemplate a hyper‑detailed diorama showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
215
+ Encounter a novel conceptual artwork featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
216
+ Consider an evocative dreamscape depicting desert caravans navigating rivers of liquid glass during twin sunsets, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
217
+ Consider an evocative dreamscape rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, captured by a drone‑level perspective employing long‑exposure star trails. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
218
+ Explore a visionary illustration capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
219
+ Contemplate a hyper‑detailed diorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
220
+ Behold a cinematic vignette rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
221
+ Encounter a novel conceptual artwork displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Spatial recursion visually articulates mathematical infinity within finite representational space.
222
+ In this speculative panorama where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
223
+ Encounter a novel conceptual artwork where bioluminescent vines entangle crystalline pyramids beneath shifting auroras, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
224
+ Observe an intricate composition detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, painted with thousand‑stroke impasto textures evoking tactile motion. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
225
+ Witness a hypnotic tableau illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, in photorealistic 32‑bit colour depth. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
226
+ Explore a visionary illustration depicting desert caravans navigating rivers of liquid glass during twin sunsets, modeled via procedural geometry emphasising topological elegance and material translucency. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
227
+ Contemplate a hyper‑detailed diorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
228
+ Behold a cinematic vignette featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
229
+ Inspect a surreal mise‑en‑scène capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
230
+ Contemplate a hyper‑detailed diorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
231
+ Consider an evocative dreamscape rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. Spatial recursion visually articulates mathematical infinity within finite representational space.
232
+ In this speculative panorama detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
233
+ Witness a hypnotic tableau depicting desert caravans navigating rivers of liquid glass during twin sunsets, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
234
+ Consider an evocative dreamscape illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
235
+ Consider an evocative dreamscape featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
236
+ Survey an expansive environment illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, accentuated by selective focus bokeh revealing micron‑scale glitter particulates. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
237
+ Explore a visionary illustration featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, with meticulously ray‑traced reflections and subsurface scattering. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
238
+ In this speculative panorama featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, expressed through Art‑Nouveau line work fused with minimal vaporwave gradients. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
239
+ Observe an intricate composition featuring clockwork leviathans swimming through stratified ocean trenches of inverted gravity, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. The narrative remains open‑ended, leaving viewers adrift in deliberate interpretive ambiguity.
240
+ Inspect a surreal mise‑en‑scène displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, framed by an ultra‑wide 16:9 holographic canvas supporting volumetric parallax. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
241
+ Survey an expansive environment illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, painted with thousand‑stroke impasto textures evoking tactile motion. Subtle visual metaphors hint at the cyclic dialogue between technology, ecology, and memory.
242
+ Behold a cinematic vignette displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Textural juxtapositions evoke synesthetic sensations that challenge conventional perceptual hierarchies.
243
+ Observe an intricate composition detailing recursive fractal cities folding into higher‑dimensional corridors of prismatic mist, captured by a drone‑level perspective employing long‑exposure star trails. The scene silently questions whether exploration births meaning or merely mirrors ourselves.
244
+ Witness a hypnotic tableau displaying vaulted librarian drones cataloging holographic memories within an endless glacier archive, with meticulously ray‑traced reflections and subsurface scattering. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
245
+ Witness a hypnotic tableau capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, modeled via procedural geometry emphasising topological elegance and material translucency. Spatial recursion visually articulates mathematical infinity within finite representational space.
246
+ In this speculative panorama rendering harmony between algorithmic sand dunes and floating ceramic monoliths radiating spectral rain, with meticulously ray‑traced reflections and subsurface scattering. Chromatic rhythms pulse like distant quasars, suggesting harmony between chaos and order.
247
+ Witness a hypnotic tableau showing ancient megastructures orbiting a pulsating ruby planet inside a Dyson‑shell observatory, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. Every detail encourages reflection on humanity’s place within expansive, unknowable frontiers.
248
+ Encounter a novel conceptual artwork illustrating translucent mineral forests echoing with harmonic resonances of quantum fireflies, rendered at 8K resolution ensuring sub‑pixel precision on reflective anisotropic surfaces. Layered symbolism underscores paradoxes of permanence versus ephemerality in manufactured eternity.
249
+ Encounter a novel conceptual artwork capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, with meticulously ray‑traced reflections and subsurface scattering. Ultimately, the image celebrates imaginative elasticity as a frontier of scientific discovery.
250
+ In this speculative panorama capturing synchronised solar sails blossoming like origami across a kaleidoscopic nebula backdrop, using chiaroscuro lighting reminiscent of Baroque masters embracing neon chroma. The composition invites contemplation of entropy, renewal, and cosmic interconnectedness.
configs/captions/starflow_v.txt ADDED
@@ -0,0 +1 @@
1
+ Smooth aerial tracking shot orbiting an erupting volcano; turquoise crater lake, glowing lava vents, thick ash column rising; slow 180° orbit then forward glide, high detail
configs/starflow-v_7B_t2v_caus_480p.yaml ADDED
@@ -0,0 +1,51 @@
1
+ arguments:
2
+ - no_flip: 1
3
+ - fsdp: 1
4
+ - fsdp_text_encoder: 1
5
+ - img_size: 640
6
+ - secondary_img_size: 512
7
+ - txt_size: 256
8
+ - vid_size: '81:16'
9
+ - fps_cond: 1
10
+ - channel_size: 48
11
+ - patch_size: 1
12
+ - channels: 3072
13
+ - top_block_channels: 4096
14
+ - blocks: 6
15
+ - layers_per_block: 2 2 2 2 2 24
16
+ - noise_std: 0.5
17
+ - batch_size: 192
18
+ - secondary_batch_size: 1536
19
+ - secondary_ratio: 0.2
20
+ - lr: 5e-5
21
+ - min_lr: 1e-6
22
+ - nvp: 1
23
+ - rope: 1
24
+ - adaln: 0
25
+ - sos: 1
26
+ - seq_order: L2R
27
+ - pt_seq_len: 32
28
+ - wds: 1
29
+ - mix_aspect: 1
30
+ - use_softplus: 1
31
+ - cond_top_only: 1
32
+ - use_final_norm: 1
33
+ - learnable_self_denoiser: 1
34
+ - conditional_denoiser: 1
35
+ - denoiser_window: 10
36
+ - cond_noise_level: 1
37
+ - temporal_causal: 2
38
+ - shallow_block_local: 1
39
+ - gradient_checkpoint: 1
40
+ - gradient_checkpoint_mlp: 1
41
+ - vae: Wan-AI/Wan2.2-TI2V-5B-Diffusers:0.6
42
+ - finetuned_vae: none
43
+ - text: google/flan-t5-xl
44
+ - cfg: 2.5
45
+ - drop_label: 0.1
46
+ - drop_image: 0.25
47
+ - loss_scaling: 1
48
+ - grad_clip: 1
49
+ - grad_skip: 1
50
+ - sample_freq: 100000
51
+ - soft_clip: 4
configs/starflow_3B_t2i_256x256.yaml ADDED
@@ -0,0 +1,32 @@
1
+ arguments:
2
+ - no_flip: 1
3
+ - fsdp: 1
4
+ - fsdp_text_encoder: 1
5
+ - img_size: 256
6
+ - txt_size: 128
7
+ - channel_size: 4
8
+ - patch_size: 1
9
+ - channels: 3072
10
+ - blocks: 6
11
+ - layers_per_block: 2 2 2 2 2 24
12
+ - noise_std: 0.3
13
+ - batch_size: 1024
14
+ - lr: 6.4e-05
15
+ - min_lr: 1e-6
16
+ - nvp: 1
17
+ - rope: 1
18
+ - adaln: 0
19
+ - sos: 1
20
+ - seq_order: L2R
21
+ - wds: 1
22
+ - use_softplus: 1
23
+ - cond_top_only: 1
24
+ - use_final_norm: 1
25
+ - gradient_checkpoint: 0
26
+ - vae: stabilityai/sd-vae-ft-ema
27
+ - text: google/flan-t5-xl
28
+ - cfg: 2.5
29
+ - drop_label: 0.1
30
+ - sample_freq: 10
31
+ - soft_clip: 4
32
+ - latent_norm_regularization: 1e-4
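The two configs above store hyper-parameters as a list of single-key mappings under `arguments:`. Below is a minimal sketch of flattening such a file into a plain dict; it assumes PyYAML is available, and the `load_arguments` helper is hypothetical (the actual training entry point may parse these differently).

import yaml

def load_arguments(path):
    # Hypothetical helper: merge the list of single-key mappings into one dict.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    flat = {}
    for item in cfg["arguments"]:
        flat.update(item)
    return flat

args = load_arguments("configs/starflow_3B_t2i_256x256.yaml")
print(args["img_size"], args["vae"])  # 256 stabilityai/sd-vae-ft-ema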
misc/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import os
6
+
7
+
8
+ def get_local_rank():
9
+ if os.environ.get('IRISCTL_ROLE'):
10
+ import irisctl.api as irisctl
11
+ return irisctl.local_rank()
12
+ elif os.environ.get('MASTER_PORT'):
13
+ return int(os.environ['LOCAL_RANK'])
14
+ else:
15
+ return 0
16
+
17
+
18
+ def print(*args, **kwargs):
19
+ if get_local_rank() == 0:
20
+ import builtins
21
+ builtins.print(*args, **kwargs)
22
+
23
+
24
+ def xprint(string):
25
+ import builtins
26
+ local_rank = get_local_rank()
27
+ builtins.print(f'[Local Rank {local_rank}] {string}')
28
+
29
+
30
+ def dividable(x):
31
+ for i in range(int(x ** 0.5), 0, -1):
32
+ if x % i == 0:
33
+ return x // i
34
+ return x
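A brief, illustrative usage of the helpers above (the rank-0 `print` override, `xprint`, and `dividable`); not part of this commit:

from misc import print, xprint, dividable

print("loading checkpoint ...")   # emitted only on local rank 0
xprint("per-rank debug message")  # always emitted, prefixed with the local rank
print(dividable(12))              # 4: co-factor of the largest divisor of 12 not exceeding sqrt(12)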
misc/ae_losses.py ADDED
@@ -0,0 +1,330 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ from typing import Mapping, Text, Tuple
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torchvision import models
11
+ from einops import rearrange
12
+ from torch.cuda.amp import autocast
13
+ from .lpips import LPIPS
14
+ from .discriminator import NLayerDiscriminator, NLayer3DDiscriminator
15
+
16
+
17
+ _IMAGENET_MEAN = [0.485, 0.456, 0.406]
18
+ _IMAGENET_STD = [0.229, 0.224, 0.225]
19
+
20
+
21
+ def hinge_d_loss(logits_real: torch.Tensor, logits_fake: torch.Tensor) -> torch.Tensor:
22
+ """Hinge loss for discrminator.
23
+
24
+ This function is borrowed from
25
+ https://github.com/CompVis/taming-transformers/blob/master/taming/modules/losses/vqperceptual.py#L20
26
+ """
27
+ loss_real = torch.mean(F.relu(1.0 - logits_real))
28
+ loss_fake = torch.mean(F.relu(1.0 + logits_fake))
29
+ d_loss = 0.5 * (loss_real + loss_fake)
30
+ return d_loss
31
+
32
+
33
+ def compute_lecam_loss(
34
+ logits_real_mean: torch.Tensor,
35
+ logits_fake_mean: torch.Tensor,
36
+ ema_logits_real_mean: torch.Tensor,
37
+ ema_logits_fake_mean: torch.Tensor
38
+ ) -> torch.Tensor:
39
+ """Computes the LeCam loss for the given average real and fake logits.
40
+
41
+ Args:
42
+ logits_real_mean -> torch.Tensor: The average real logits.
43
+ logits_fake_mean -> torch.Tensor: The average fake logits.
44
+ ema_logits_real_mean -> torch.Tensor: The EMA of the average real logits.
45
+ ema_logits_fake_mean -> torch.Tensor: The EMA of the average fake logits.
46
+
47
+ Returns:
48
+ lecam_loss -> torch.Tensor: The LeCam loss.
49
+ """
50
+ lecam_loss = torch.mean(torch.pow(F.relu(logits_real_mean - ema_logits_fake_mean), 2))
51
+ lecam_loss += torch.mean(torch.pow(F.relu(ema_logits_real_mean - logits_fake_mean), 2))
52
+ return lecam_loss
53
+
54
+
55
+ class PerceptualLoss(torch.nn.Module):
56
+ def __init__(self, dist, model_name: str = "convnext_s"):
57
+ """Initializes the PerceptualLoss class.
58
+
59
+ Args:
60
+ model_name: A string, the name of the perceptual loss model to use.
61
+
62
+ Raise:
63
+ ValueError: If the model_name does not contain "lpips" or "convnext_s".
64
+ """
65
+ super().__init__()
66
+ if ("lpips" not in model_name) and (
67
+ "convnext_s" not in model_name):
68
+ raise ValueError(f"Unsupported Perceptual Loss model name {model_name}")
69
+ self.dist = dist
70
+ self.lpips = None
71
+ self.convnext = None
72
+ self.loss_weight_lpips = None
73
+ self.loss_weight_convnext = None
74
+
75
+ # Parsing the model name. We support names formatted as
76
+ # "lpips-convnext_s-{float_number}-{float_number}", where each
77
+ # {float_number} is the loss weight for the corresponding component.
78
+ # E.g., lpips-convnext_s-1.0-2.0 computes the perceptual loss
79
+ # using both lpips and convnext_s, and averages the final loss as
80
+ # (1.0 * loss(lpips) + 2.0 * loss(convnext_s)) / (1.0 + 2.0).
81
+ if "lpips" in model_name:
82
+ self.lpips = LPIPS(dist).eval()
83
+
84
+ if "convnext_s" in model_name:
85
+ self.convnext = models.convnext_small(weights=models.ConvNeXt_Small_Weights.IMAGENET1K_V1).eval()
86
+
87
+ if "lpips" in model_name and "convnext_s" in model_name:
88
+ loss_config = model_name.split('-')[-2:]
89
+ self.loss_weight_lpips, self.loss_weight_convnext = float(loss_config[0]), float(loss_config[1])
90
+ print(f"self.loss_weight_lpips, self.loss_weight_convnext: {self.loss_weight_lpips}, {self.loss_weight_convnext}")
91
+
92
+ self.register_buffer("imagenet_mean", torch.Tensor(_IMAGENET_MEAN)[None, :, None, None])
93
+ self.register_buffer("imagenet_std", torch.Tensor(_IMAGENET_STD)[None, :, None, None])
94
+
95
+ for param in self.parameters():
96
+ param.requires_grad = False
97
+
98
+ def forward(self, input: torch.Tensor, target: torch.Tensor):
99
+ """Computes the perceptual loss.
100
+
101
+ Args:
102
+ input: A tensor of shape (B, C, H, W), the input image. Normalized to [0, 1].
103
+ target: A tensor of shape (B, C, H, W), the target image. Normalized to [0, 1].
104
+
105
+ Returns:
106
+ A scalar tensor, the perceptual loss.
107
+ """
108
+ if input.dim() == 5:
109
+ # If the input is 5D, we assume it is a batch of videos.
110
+ # We will average the loss over the temporal dimension.
111
+ input = rearrange(input, "b t c h w -> (b t) c h w")
112
+ target = rearrange(target, "b t c h w -> (b t) c h w")
113
+
114
+ # Always in eval mode.
115
+ self.eval()
116
+ loss = 0.
117
+ num_losses = 0.
118
+ lpips_loss = 0.
119
+ convnext_loss = 0.
120
+ # Computes LPIPS loss, if available.
121
+ if self.lpips is not None:
122
+ lpips_loss = self.lpips(input, target)
123
+ if self.loss_weight_lpips is None:
124
+ loss += lpips_loss
125
+ num_losses += 1
126
+ else:
127
+ num_losses += self.loss_weight_lpips
128
+ loss += self.loss_weight_lpips * lpips_loss
129
+
130
+ if self.convnext is not None:
131
+ # Computes ConvNeXt-s loss, if available.
132
+ input = torch.nn.functional.interpolate(input, size=224, mode="bilinear", align_corners=False, antialias=True)
133
+ target = torch.nn.functional.interpolate(target, size=224, mode="bilinear", align_corners=False, antialias=True)
134
+ pred_input = self.convnext((input - self.imagenet_mean) / self.imagenet_std)
135
+ pred_target = self.convnext((target - self.imagenet_mean) / self.imagenet_std)
136
+ convnext_loss = torch.nn.functional.mse_loss(
137
+ pred_input,
138
+ pred_target,
139
+ reduction="mean")
140
+
141
+ if self.loss_weight_convnext is None:
142
+ num_losses += 1
143
+ loss += convnext_loss
144
+ else:
145
+ num_losses += self.loss_weight_convnext
146
+ loss += self.loss_weight_convnext * convnext_loss
147
+
148
+ # weighted avg.
149
+ loss = loss / num_losses
150
+ return loss
151
+
152
+
153
+ class WaveletLoss3D(torch.nn.Module):
154
+ def __init__(self):
155
+ super().__init__()
156
+
157
+ def forward(self, inputs, targets):
158
+ from torch_dwt.functional import dwt3
159
+ inputs, targets = inputs.float(), targets.float()
160
+ l1_loss = torch.abs(
161
+ dwt3(inputs.contiguous(), "haar") - dwt3(targets.contiguous(), "haar")
162
+ )
163
+
164
+ # Average over the number of wavelet filters, reducing the dimensions
165
+ l1_loss = torch.mean(l1_loss, dim=1)
166
+
167
+ # Average over all of the filter banks, keeping dimensions
168
+ l1_loss = torch.mean(l1_loss, dim=-1, keepdim=True)
169
+ l1_loss = torch.mean(l1_loss, dim=-2, keepdim=True)
170
+ l1_loss = torch.mean(l1_loss, dim=-3, keepdim=True)
171
+ return l1_loss
172
+
173
+
174
+ class ReconstructionLoss_Single_Stage(torch.nn.Module):
175
+ def __init__(self, dist, args):
176
+ """Initializes the losses module.
177
+
178
+ Args:
179
+ config: A dictionary, the configuration for the model and everything else.
180
+ """
181
+ super().__init__()
182
+ self.dist = dist
183
+ self.with_condition = False
184
+ self.quantize_mode = 'vae'
185
+ self.discriminator = NLayerDiscriminator(with_condition=False).eval() if not args.use_3d_disc else NLayer3DDiscriminator(with_condition=False).eval()
186
+ self.reconstruction_loss = "l2"
187
+ self.reconstruction_weight = 1.0
188
+ self.quantizer_weight = 1.0
189
+ self.perceptual_loss = PerceptualLoss(dist, "lpips-convnext_s-1.0-0.1").eval()
190
+ self.perceptual_weight = 1.1
191
+ self.discriminator_iter_start = 0
192
+ self.discriminator_factor = 1.0
193
+ self.discriminator_weight = 0.1
194
+ self.lecam_regularization_weight = 0.001
195
+ self.lecam_ema_decay = 0.999
196
+ self.kl_weight = 1e-6
197
+ self.wavelet_loss_weight = 0.5
198
+ self.wavelet_loss = WaveletLoss3D()
199
+ self.logvar = nn.Parameter(torch.ones(size=()) * 0.0, requires_grad=False)
200
+ if self.lecam_regularization_weight > 0.0:
201
+ self.register_buffer("ema_real_logits_mean", torch.zeros((1)))
202
+ self.register_buffer("ema_fake_logits_mean", torch.zeros((1)))
203
+
204
+ @torch.amp.autocast("cuda", enabled=False)
205
+ def forward(self,
206
+ inputs: torch.Tensor,
207
+ reconstructions: torch.Tensor,
208
+ extra_result_dict: Mapping[Text, torch.Tensor],
209
+ global_step: int,
210
+ mode: str = "generator",
211
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
212
+ # Both inputs and reconstructions are in range [0, 1].
213
+ inputs = inputs.float()
214
+ reconstructions = reconstructions.float()
215
+
216
+ if mode == "generator":
217
+ return self._forward_generator(inputs, reconstructions, extra_result_dict, global_step)
218
+ elif mode == "discriminator":
219
+ return self._forward_discriminator(inputs, reconstructions, extra_result_dict, global_step)
220
+ else:
221
+ raise ValueError(f"Unsupported mode {mode}")
222
+
223
+ def should_discriminator_be_trained(self, global_step : int):
224
+ return global_step >= self.discriminator_iter_start
225
+
226
+ def _forward_discriminator(self,
227
+ inputs: torch.Tensor,
228
+ reconstructions: torch.Tensor,
229
+ extra_result_dict: Mapping[Text, torch.Tensor],
230
+ global_step: int,
231
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
232
+ """Discrminator training step."""
233
+ discriminator_factor = self.discriminator_factor if self.should_discriminator_be_trained(global_step) else 0
234
+ loss_dict = {}
235
+ # Turn the gradients on.
236
+ for param in self.discriminator.parameters():
237
+ param.requires_grad = True
238
+
239
+ condition = extra_result_dict.get("condition", None) if self.with_condition else None
240
+ real_images = inputs.detach().requires_grad_(True)
241
+ logits_real = self.discriminator(real_images, condition)
242
+ logits_fake = self.discriminator(reconstructions.detach(), condition)
243
+
244
+ discriminator_loss = discriminator_factor * hinge_d_loss(logits_real=logits_real, logits_fake=logits_fake)
245
+
246
+ # optional lecam regularization
247
+ lecam_loss = torch.zeros((), device=inputs.device)
248
+ if self.lecam_regularization_weight > 0.0:
249
+ lecam_loss = compute_lecam_loss(
250
+ torch.mean(logits_real),
251
+ torch.mean(logits_fake),
252
+ self.ema_real_logits_mean,
253
+ self.ema_fake_logits_mean
254
+ ) * self.lecam_regularization_weight
255
+
256
+ self.ema_real_logits_mean = self.ema_real_logits_mean * self.lecam_ema_decay + torch.mean(logits_real).detach() * (1 - self.lecam_ema_decay)
257
+ self.ema_fake_logits_mean = self.ema_fake_logits_mean * self.lecam_ema_decay + torch.mean(logits_fake).detach() * (1 - self.lecam_ema_decay)
258
+
259
+ discriminator_loss += lecam_loss
260
+
261
+ loss_dict = dict(
262
+ discriminator_loss=discriminator_loss.detach(),
263
+ logits_real=logits_real.detach().mean(),
264
+ logits_fake=logits_fake.detach().mean(),
265
+ lecam_loss=lecam_loss.detach(),
266
+ )
267
+ return discriminator_loss, loss_dict
268
+
269
+ def _forward_generator(self,
270
+ inputs: torch.Tensor,
271
+ reconstructions: torch.Tensor,
272
+ extra_result_dict: Mapping[Text, torch.Tensor],
273
+ global_step: int
274
+ ) -> Tuple[torch.Tensor, Mapping[Text, torch.Tensor]]:
275
+ """Generator training step."""
276
+ inputs = inputs.contiguous()
277
+ reconstructions = reconstructions.contiguous()
278
+ if self.reconstruction_loss == "l1":
279
+ reconstruction_loss = F.l1_loss(inputs, reconstructions, reduction="mean")
280
+ elif self.reconstruction_loss == "l2":
281
+ reconstruction_loss = F.mse_loss(inputs, reconstructions, reduction="mean")
282
+ else:
283
+ raise ValueError(f"Unsuppored reconstruction_loss {self.reconstruction_loss}")
284
+ reconstruction_loss *= self.reconstruction_weight
285
+
286
+ # Compute wavelet loss.
287
+ if inputs.dim() == 5:
288
+ wavelet_loss = self.wavelet_loss(
289
+ inputs.permute(0,2,1,3,4), reconstructions.permute(0,2,1,3,4)).mean()
290
+ else:
291
+ wavelet_loss = 0
292
+
293
+ # Compute perceptual loss.
294
+ perceptual_loss = self.perceptual_loss(inputs, reconstructions).mean()
295
+
296
+ # Compute discriminator loss.
297
+ generator_loss = torch.zeros((), device=inputs.device)
298
+ discriminator_factor = self.discriminator_factor if self.should_discriminator_be_trained(global_step) else 0
299
+ d_weight = 1.0
300
+ if discriminator_factor > 0.0 and self.discriminator_weight > 0.0:
301
+ # Disable discriminator gradients.
302
+ for param in self.discriminator.parameters():
303
+ param.requires_grad = False
304
+ logits_fake = self.discriminator(reconstructions)
305
+ generator_loss = -torch.mean(logits_fake)
306
+
307
+ d_weight *= self.discriminator_weight
308
+
309
+ assert self.quantize_mode == "vae", "Only vae mode is supported for now"
310
+
311
+ # Scale the reconstruction loss by the (fixed) log-variance; no explicit KL term is added here.
312
+ reconstruction_loss = reconstruction_loss / torch.exp(self.logvar)
313
+ total_loss = (
314
+ reconstruction_loss
315
+ + self.perceptual_weight * perceptual_loss
316
+ + d_weight * discriminator_factor * generator_loss
317
+ + self.wavelet_loss_weight * wavelet_loss
318
+ )
319
+ loss_dict = dict(
320
+ total_loss=total_loss.clone().detach(),
321
+ reconstruction_loss=reconstruction_loss.detach(),
322
+ perceptual_loss=(self.perceptual_weight * perceptual_loss).detach(),
323
+ weighted_gan_loss=(d_weight * discriminator_factor * generator_loss).detach(),
324
+ discriminator_factor=torch.tensor(discriminator_factor),
325
+ d_weight=d_weight,
326
+ gan_loss=generator_loss.detach(),
327
+ wavelet_loss=(self.wavelet_loss_weight * wavelet_loss).detach(),
328
+ )
329
+ return total_loss, loss_dict
330
+
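A schematic sketch of how `ReconstructionLoss_Single_Stage` is typically alternated between its two modes during autoencoder training. The optimizers (`opt_ae`, `opt_disc`), the `dist`/`args` objects, and the data tensors are placeholders, not code from this commit:

# Hypothetical training step; both calls return (loss, stats_dict).
loss_fn = ReconstructionLoss_Single_Stage(dist, args).cuda()

# Generator / autoencoder update: reconstruction + perceptual + wavelet + GAN terms.
g_loss, g_stats = loss_fn(inputs, reconstructions, extra_result_dict={},
                          global_step=step, mode="generator")
g_loss.backward()
opt_ae.step(); opt_ae.zero_grad()

# Discriminator update: hinge loss plus optional LeCam regularization.
d_loss, d_stats = loss_fn(inputs, reconstructions.detach(), extra_result_dict={},
                          global_step=step, mode="discriminator")
d_loss.backward()
opt_disc.step(); opt_disc.zero_grad()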
misc/condition_utils.py ADDED
@@ -0,0 +1,218 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ ## camera
6
+
7
+ from pathlib import Path
8
+ import json
9
+ import re
10
+ import tarfile
11
+ from einops import rearrange
12
+ import torch
13
+ import numpy as np
14
+ from PIL import Image
15
+ import torchvision.transforms.functional as F
16
+ from torchvision import transforms
17
+ import math
18
+
19
+ def find_factors(n):
20
+ factors = set()
21
+ for i in range(1, int(math.sqrt(n)) + 1):
22
+ if n % i == 0:
23
+ factors.add(i)
24
+ factors.add(n // i)
25
+ return sorted(factors, reverse=True)
26
+
27
+ def find_max_scale_factor(A, B):
28
+ gcd = math.gcd(A, B)
29
+
30
+ factors = find_factors(gcd)
31
+
32
+ for factor in factors:
33
+ if A // factor >= 32 and B // factor >= 32 and abs(A-B)//factor % 2 ==0:
34
+ return factor
35
+
36
+ return 1
37
+
38
+ def _get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t=False, mask_idx=[0], project=False):
39
+ return np.concatenate([
40
+ get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t, idx, project)
41
+ for idx in mask_idx], -1)
42
+
43
+
44
+ def get_plucker_embedding(intrinsic_parameters, w2c_matrices, height, width, norm_t=False, mask_idx=0, project=True):
45
+ """
46
+ intrinsic_parameters.shape = [f, 4] (normalized fx, fy, cx, cy per frame)
47
+ w2c_matrices.shape = [f, 4, 4] (world-to-camera matrices)
48
+ """
49
+
50
+ num_frames = intrinsic_parameters.shape[0]
51
+ c2w_matrices = np.linalg.inv(w2c_matrices)
52
+
53
+ if project:
54
+ w2c_cond_matrices = w2c_matrices[mask_idx: mask_idx+1]
55
+ c2w_matrices = w2c_cond_matrices @ c2w_matrices # relative pose to the first frame
56
+
57
+
58
+ if norm_t:
59
+ offset = c2w_matrices[:, :3, -1:] # f, 3, 1
60
+ offset = offset / (np.abs(offset).max(axis=(1, 2), keepdims=True) + 1e-7)
61
+ c2w_matrices[:, :3, -1:] = offset
62
+
63
+ ys, xs = np.meshgrid(
64
+ np.linspace(0, height - 1, height, dtype=c2w_matrices.dtype),
65
+ np.linspace(0, width - 1, width, dtype=c2w_matrices.dtype), indexing='ij')
66
+ ys = np.tile(ys.reshape([1, height * width]), [num_frames, 1]) +0.5
67
+ xs = np.tile(xs.reshape([1, height * width]), [num_frames, 1]) +0.5
68
+
69
+ fx, fy, cx, cy = np.split(intrinsic_parameters, 4, -1)
70
+ fx, fy, cx, cy = fx * width, fy * height, cx * width, cy * height
71
+
72
+ zs_cam = np.ones_like(xs)
73
+ xs_cam = (xs - cx) / fx * zs_cam
74
+ ys_cam = (ys - cy) / fy * zs_cam
75
+ directions = np.stack((xs_cam, ys_cam, zs_cam), -1)
76
+ directions = directions / np.linalg.norm(directions, axis=-1, keepdims=True)
77
+
78
+ ray_directions_w = (c2w_matrices[..., :3, :3] @ directions.transpose(0, 2, 1)).transpose(0, 2, 1)
79
+ ray_origin_w = np.expand_dims(c2w_matrices[..., :3, 3], axis=-2)
80
+ ray_origin_w = np.broadcast_to(ray_origin_w, ray_directions_w.shape)
81
+ ray_dxo = np.cross(ray_origin_w, ray_directions_w)
82
+ plucker_embedding = np.concatenate([ray_dxo, ray_directions_w], -1).reshape(num_frames, height, width, 6)
83
+
84
+ return plucker_embedding
85
+
86
+
87
+ def label_to_camera(label):
88
+ num_frames = label.shape[0]
89
+ bottom = np.zeros([num_frames, 1, 4])
90
+ bottom[:, :, -1] = 1
91
+
92
+ # [w, h, flx, fly] + camera_model[0] + camera_model[1] + camera_model[2] + camera_model[3]
93
+ w, h, fx, fy = label[:, 0:1], label[:, 1:2], label[:, 2:3], label[:, 3:4]
94
+ fx, fy = fx / w, fy / h
95
+ c2w = label[:, 4:].reshape(num_frames, 4, 4)
96
+ c2w[:, 2, :] *= -1
97
+ c2w = c2w[:, np.array([1, 0, 2, 3]), :]
98
+ c2w[:, 0:3, 1:3] *= -1
99
+ w2c = np.linalg.inv(c2w)
100
+ intrinsic = np.concatenate([fx, fy, np.ones_like(fx) * .5, np.ones_like(fx) * .5], 1)
101
+
102
+ return intrinsic, w2c
103
+
104
+
105
+ def get_camera_condition(tar, camera_file, width=960, height=544, factor=16, frame_inds=None):
106
+
107
+ try:
108
+ with tar.extractfile(camera_file) as cam_data:
109
+ camera_data = json.load(cam_data)
110
+
111
+ prefix = [camera_data['w'], camera_data['h'], camera_data['fl_x'], camera_data['fl_y']]
112
+
113
+ labels = []
114
+ if frame_inds is None:
115
+ frame_inds = list(range(len(camera_data['frames'])))
116
+ for ind in frame_inds:
117
+ frame_info = camera_data['frames'][ind]
118
+ label = prefix + sum(frame_info['transform_matrix'], [])
119
+ labels.append(label)
120
+
121
+ label = np.array(labels)
122
+ intrinsic, w2c = label_to_camera(label)
123
+ # factor = find_max_scale_factor(height, width)
124
+ H, W = height // factor, width // factor
125
+ ray_map = _get_plucker_embedding(intrinsic, w2c, H, W, norm_t=False, mask_idx=[0], project=True)
126
+ ray_map = torch.from_numpy(ray_map) #.permute(0, 3, 1, 2) # [f, h, w, c]
127
+ # ray_map = F.resize(transforms.CenterCrop(min(H, W))(ray_map), 32).permute(0, 2, 3, 1)
128
+ except Exception as e:
129
+ print(f'Reading data error {e} {camera_file}')
130
+ ray_map = torch.zeros((len(frame_inds) if frame_inds is not None else 0, height // factor, width // factor, 6))
131
+
132
+ return ray_map
133
+
134
+
135
+ ## force
136
+ def get_wind_condition(force, angle, min_force, max_force, num_frames=45, num_channels=3, height=480, width=720):
137
+
138
+ condition = torch.zeros((num_frames, num_channels, height, width))
139
+
140
+ # first channel gets wind_speed
141
+ condition[:, 0] = -1 + 2*(force-min_force)/(max_force-min_force)
142
+
143
+ # second channel gets cos(wind_angle)
144
+ condition[:, 1] = math.cos(angle * torch.pi / 180.0)
145
+
146
+ # third channel gets sin(wind_angle)
147
+ condition[:, 2] = math.sin(angle * torch.pi / 180.0)
148
+
149
+ return rearrange(condition, 'f c h w -> f h w c')
150
+
151
+
152
+ def get_gaussian_blob(x, y, radius=10, amplitude=1.0, shape=(3, 480, 720), device=None):
153
+ """
154
+ Create a tensor containing a Gaussian blob at the specified location.
155
+
156
+ Args:
157
+ x (int): x-coordinate of the blob center
158
+ y (int): y-coordinate of the blob center
159
+ radius (int, optional): Radius of the Gaussian blob. Defaults to 10.
160
+ amplitude (float, optional): Maximum intensity of the blob. Defaults to 1.0.
161
+ shape (tuple, optional): Shape of the output tensor (channels, height, width). Defaults to (3, 480, 720).
162
+ device (torch.device, optional): Device to create the tensor on. Defaults to None.
163
+
164
+ Returns:
165
+ torch.Tensor: Tensor of shape (channels, height, width) containing the Gaussian blob
166
+ """
167
+ num_channels, height, width = shape
168
+
169
+ # Create a new tensor filled with zeros
170
+ blob_tensor = torch.zeros(shape, device=device)
171
+
172
+ # Create coordinate grids
173
+ y_grid, x_grid = torch.meshgrid(
174
+ torch.arange(height, device=device),
175
+ torch.arange(width, device=device),
176
+ indexing='ij'
177
+ )
178
+
179
+ # Calculate squared distance from (x, y)
180
+ squared_dist = (x_grid - x) ** 2 + (y_grid - y) ** 2
181
+
182
+ # Create Gaussian blob using the squared distance
183
+ gaussian = amplitude * torch.exp(-squared_dist / (2.0 * radius ** 2))
184
+
185
+ # Add the Gaussian blob to all channels
186
+ for c in range(num_channels):
187
+ blob_tensor[c] = gaussian
188
+
189
+ return blob_tensor
190
+
191
+ def get_point_condition(force, angle, x_pos, y_pos, min_force, max_force, num_frames=45, num_channels=3, height=480, width=720):
192
+
193
+ condition = torch.zeros((num_frames, num_channels, height, width)) # (45, 3, 480, 720)
194
+
195
+ x_pos_start = x_pos*width
196
+ y_pos_start = (1-y_pos)*height
197
+
198
+ DISPLACEMENT_FOR_MAX_FORCE = width / 2
199
+ DISPLACEMENT_FOR_MIN_FORCE = width / 8
200
+
201
+ force_percent = (force - min_force) / (max_force - min_force)
202
+ total_displacement = DISPLACEMENT_FOR_MIN_FORCE + (DISPLACEMENT_FOR_MAX_FORCE - DISPLACEMENT_FOR_MIN_FORCE) * force_percent
203
+
204
+ x_pos_end = x_pos_start + total_displacement * math.cos(angle * torch.pi / 180.0)
205
+ y_pos_end = y_pos_start - total_displacement * math.sin(angle * torch.pi / 180.0)
206
+
207
+ for frame in range(num_frames):
208
+
209
+ t = frame / (num_frames-1)
210
+ x_pos_ = x_pos_start * (1-t) + x_pos_end * t # t = 0 --> start; t = 1 --> end
211
+ y_pos_ = y_pos_start * (1-t) + y_pos_end * t # t = 0 --> start; t = 1 --> end
212
+
213
+ blob_tensor = get_gaussian_blob(x=x_pos_, y=y_pos_, radius=20, amplitude=1.0, shape=(num_channels, height, width))
214
+
215
+ condition[frame] += blob_tensor
216
+
217
+ return rearrange(condition, 'f c h w -> f h w c')
218
+
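Illustrative calls of the force-conditioning helpers above (values are arbitrary; not part of this commit). Both return dense per-frame condition maps in (f, h, w, c) layout:

from misc.condition_utils import get_wind_condition, get_point_condition

wind = get_wind_condition(force=5.0, angle=45.0, min_force=0.0, max_force=10.0,
                          num_frames=45, num_channels=3, height=480, width=720)
print(wind.shape)   # torch.Size([45, 480, 720, 3])

point = get_point_condition(force=5.0, angle=90.0, x_pos=0.5, y_pos=0.5,
                            min_force=0.0, max_force=10.0)
print(point.shape)  # torch.Size([45, 480, 720, 3])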
misc/discriminator.py ADDED
@@ -0,0 +1,388 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import functools
6
+ import math
7
+ from typing import Tuple
8
+
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from einops import rearrange
15
+
16
+ # Conv2D with same padding
17
+ class Conv2dSame(nn.Conv2d):
18
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
19
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
20
+
21
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
22
+ ih, iw = x.size()[-2:]
23
+
24
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
25
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
26
+
27
+ if pad_h > 0 or pad_w > 0:
28
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
29
+ return super().forward(x)
30
+
31
+
32
+ class BlurBlock(torch.nn.Module):
33
+ def __init__(self,
34
+ kernel: Tuple[int] = (1, 3, 3, 1)
35
+ ):
36
+ super().__init__()
37
+
38
+ kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False)
39
+ kernel = kernel[None, :] * kernel[:, None]
40
+ kernel /= kernel.sum()
41
+ kernel = kernel.unsqueeze(0).unsqueeze(0)
42
+ self.register_buffer("kernel", kernel)
43
+
44
+ def calc_same_pad(self, i: int, k: int, s: int) -> int:
45
+ return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0)
46
+
47
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
48
+ ic, ih, iw = x.size()[-3:]
49
+ pad_h = self.calc_same_pad(i=ih, k=4, s=2)
50
+ pad_w = self.calc_same_pad(i=iw, k=4, s=2)
51
+ if pad_h > 0 or pad_w > 0:
52
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
53
+
54
+ weight = self.kernel.expand(ic, -1, -1, -1)
55
+
56
+ out = F.conv2d(input=x, weight=weight, stride=2, groups=x.shape[1])
57
+ return out
58
+
59
+
60
+ class SinusoidalTimeEmbedding(torch.nn.Module):
61
+ def __init__(self, embedding_dim: int):
62
+ super().__init__()
63
+ self.embedding_dim = embedding_dim
64
+ assert embedding_dim % 2 == 0, "embedding_dim must be even"
65
+
66
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
67
+ half_dim = self.embedding_dim // 2
68
+ embeddings = math.log(10000) / (half_dim - 1)
69
+ embeddings = torch.exp(torch.arange(half_dim, device=timesteps.device) * -embeddings)
70
+ embeddings = timesteps[:, None] * embeddings[None, :]
71
+ embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
72
+ return embeddings
73
+
74
+
75
+ class ModulatedConv2dSame(Conv2dSame):
76
+ def __init__(self, in_channels, out_channels, kernel_size, cond_channels=None):
77
+ super().__init__(in_channels, out_channels, kernel_size)
78
+ # FiLM modulation projections
79
+ if cond_channels is not None:
80
+ self.film_proj = torch.nn.Linear(cond_channels, 2 * out_channels)
81
+
82
+ # Initialize scale to 0 and bias to 0
83
+ torch.nn.init.zeros_(self.film_proj.weight)
84
+ torch.nn.init.zeros_(self.film_proj.bias)
85
+
86
+ def forward(self, x, temb=None):
87
+ x = super().forward(x)
88
+ if temb is not None:
89
+ scale, bias = self.film_proj(temb)[:, :, None, None].chunk(2, dim=1)
90
+ x = x * (scale + 1) + bias
91
+ return x
92
+
93
+
94
+ class NLayerDiscriminator(torch.nn.Module):
95
+ def __init__(
96
+ self,
97
+ num_channels: int = 3,
98
+ hidden_channels: int = 128,
99
+ num_stages: int = 3,
100
+ blur_resample: bool = True,
101
+ blur_kernel_size: int = 4,
102
+ with_condition: bool = False,
103
+ ):
104
+ """ Initializes the NLayerDiscriminator.
105
+
106
+ Args:
107
+ num_channels -> int: The number of input channels.
108
+ hidden_channels -> int: The number of hidden channels.
109
+ num_stages -> int: The number of stages.
110
+ blur_resample -> bool: Whether to use blur resampling.
111
+ blur_kernel_size -> int: The blur kernel size.
112
+ """
113
+ super().__init__()
114
+ assert num_stages > 0, "Discriminator cannot have 0 stages"
115
+ assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3, 5] when blur resampling"
116
+
117
+ in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages)))
118
+ init_kernel_size = 5
119
+ activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1)
120
+
121
+ self.with_condition = with_condition
122
+ if with_condition:
123
+ cond_channels = 768
124
+ self.time_emb = SinusoidalTimeEmbedding(128)
125
+ self.time_proj = torch.nn.Sequential(
126
+ torch.nn.Linear(128, cond_channels),
127
+ torch.nn.SiLU(),
128
+ torch.nn.Linear(cond_channels, cond_channels),
129
+ )
130
+ else:
131
+ cond_channels = None
132
+
133
+ self.block_in = torch.nn.Sequential(
134
+ Conv2dSame(
135
+ num_channels,
136
+ hidden_channels,
137
+ kernel_size=init_kernel_size
138
+ ),
139
+ activation(),
140
+ )
141
+
142
+ BLUR_KERNEL_MAP = {
143
+ 3: (1,2,1),
144
+ 4: (1,3,3,1),
145
+ 5: (1,4,6,4,1),
146
+ }
147
+
148
+ discriminator_blocks = []
149
+ for i_level in range(num_stages):
150
+ in_channels = hidden_channels * in_channel_mult[i_level]
151
+ out_channels = hidden_channels * in_channel_mult[i_level + 1]
152
+ conv_block = ModulatedConv2dSame(
153
+ in_channels,
154
+ out_channels,
155
+ kernel_size=3,
156
+ cond_channels=cond_channels
157
+ )
158
+ discriminator_blocks.append(conv_block)
159
+ down_block = torch.nn.Sequential(
160
+ torch.nn.AvgPool2d(kernel_size=2, stride=2) if not blur_resample else BlurBlock(BLUR_KERNEL_MAP[blur_kernel_size]),
161
+ torch.nn.GroupNorm(32, out_channels),
162
+ activation(),
163
+ )
164
+ discriminator_blocks.append(down_block)
165
+
166
+ self.blocks = torch.nn.ModuleList(discriminator_blocks)
167
+ self.pool = torch.nn.AdaptiveMaxPool2d((16, 16))
168
+ self.to_logits = torch.nn.Sequential(
169
+ Conv2dSame(out_channels, out_channels, 1),
170
+ activation(),
171
+ Conv2dSame(out_channels, 1, kernel_size=5)
172
+ )
173
+
174
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
175
+ """ Forward pass.
176
+
177
+ Args:
178
+ x -> torch.Tensor: The input tensor.
179
+
180
+ Returns:
181
+ output -> torch.Tensor: The output tensor.
182
+ """
183
+ if x.dim() == 5:
184
+ x = rearrange(x, 'b t c h w -> (b t) c h w')
185
+
186
+ hidden_states = self.block_in(x)
187
+ if condition is not None and self.with_condition:
188
+ temb = self.time_proj(self.time_emb(condition * 1000.0))
189
+ else:
190
+ temb = None
191
+
192
+ for i, block in enumerate(self.blocks):
193
+ if i % 2 == 0:
194
+ hidden_states = block(hidden_states, temb) # conv_block
195
+ else:
196
+ hidden_states = block(hidden_states) # down_block
197
+
198
+ hidden_states = self.pool(hidden_states)
199
+ return self.to_logits(hidden_states)
200
+
201
+ # 3D discriminator
202
+
203
+ class Conv3dSame(nn.Conv3d):
204
+ def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
205
+ return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
206
+
207
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
208
+ it, ih, iw = x.size()[-3:] # frame, height, width
209
+
210
+ pad_t = self.calc_same_pad(i=it, k=self.kernel_size[0], s=self.stride[0], d=self.dilation[0])
211
+ pad_h = self.calc_same_pad(i=ih, k=self.kernel_size[1], s=self.stride[1], d=self.dilation[1])
212
+ pad_w = self.calc_same_pad(i=iw, k=self.kernel_size[2], s=self.stride[2], d=self.dilation[2])
213
+
214
+ if pad_t > 0 or pad_h > 0 or pad_w > 0:
215
+ x = F.pad(
216
+ x,
217
+ [pad_w // 2, pad_w - pad_w // 2,
218
+ pad_h // 2, pad_h - pad_h // 2,
219
+ pad_t // 2, pad_t - pad_t // 2],
220
+ )
221
+ return super().forward(x)
222
+
223
+ class ModulatedConv3dSame(Conv3dSame):
224
+ def __init__(self, in_channels, out_channels, kernel_size, cond_channels=None):
225
+ super().__init__(in_channels, out_channels, kernel_size)
226
+
227
+ # FiLM modulation
228
+ if cond_channels is not None:
229
+ self.film_proj = torch.nn.Linear(cond_channels, 2 * out_channels)
230
+
231
+ # Initialize FiLM params (scale to 0, bias to 0)
232
+ torch.nn.init.zeros_(self.film_proj.weight)
233
+ torch.nn.init.zeros_(self.film_proj.bias)
234
+
235
+ def forward(self, x, temb=None):
236
+ x = super().forward(x) # (B, C, T, H, W)
237
+ if temb is not None:
238
+ scale, bias = self.film_proj(temb)[:, :, None, None, None].chunk(2, dim=1)
239
+ x = x * (scale + 1) + bias
240
+ return x
241
+
242
+ class BlurBlock3D(nn.Module):
243
+ def __init__(self, kernel=(1, 3, 3, 1), stride=(1, 2, 2)):
244
+ """
245
+ 3D BlurPool block.
246
+ Applies blur to spatial dimensions only by default.
247
+ """
248
+ super().__init__()
249
+ self.stride = stride
250
+
251
+ kernel = torch.tensor(kernel, dtype=torch.float32, requires_grad=False)
252
+ kernel = kernel[None, :] * kernel[:, None]
253
+ kernel /= kernel.sum()
254
+
255
+ kernel = kernel.unsqueeze(0).unsqueeze(0).unsqueeze(0) # shape: (1, 1, 1, H, W)
256
+ self.register_buffer("kernel", kernel)
257
+
258
+ def calc_same_pad(self, i: int, k: int, s: int) -> int:
259
+ return max((math.ceil(i / s) - 1) * s + (k - 1) + 1 - i, 0)
260
+
261
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
262
+ _, c, t, h, w = x.shape
263
+ kd, kh, kw = self.kernel.shape[-3:]
264
+ sd, sh, sw = self.stride
265
+
266
+ # Only apply padding to H and W
267
+ pad_h = self.calc_same_pad(h, kh, sh)
268
+ pad_w = self.calc_same_pad(w, kw, sw)
269
+ pad_d = 0 if sd == 1 else self.calc_same_pad(t, kd, sd)
270
+
271
+ if pad_h > 0 or pad_w > 0 or pad_d > 0:
272
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2,
273
+ pad_h // 2, pad_h - pad_h // 2,
274
+ pad_d // 2, pad_d - pad_d // 2])
275
+
276
+ weight = self.kernel.expand(c, 1, -1, -1, -1)
277
+
278
+ return F.conv3d(x, weight, stride=self.stride, groups=c)
279
+
280
+ class NLayer3DDiscriminator(torch.nn.Module):
281
+ def __init__(
282
+ self,
283
+ num_channels: int = 3,
284
+ hidden_channels: int = 128,
285
+ num_stages: int = 3,
286
+ blur_resample: bool = True,
287
+ blur_kernel_size: int = 4,
288
+ with_condition: bool = False,
289
+ ):
290
+ """ Initializes the NLayer3DDiscriminator.
291
+
292
+ Args:
293
+ num_channels -> int: The number of input channels.
294
+ hidden_channels -> int: The number of hidden channels.
295
+ num_stages -> int: The number of stages.
296
+ blur_resample -> bool: Whether to use blur resampling.
297
+ blur_kernel_size -> int: The blur kernel size.
298
+ """
299
+ super().__init__()
300
+ assert num_stages > 0, "Discriminator cannot have 0 stages"
301
+ assert (not blur_resample) or (blur_kernel_size >= 3 and blur_kernel_size <= 5), "Blur kernel size must be in [3, 5] when blur resampling"
302
+
303
+ in_channel_mult = (1,) + tuple(map(lambda t: 2**t, range(num_stages)))
304
+ init_kernel_size = 5
305
+ activation = functools.partial(torch.nn.LeakyReLU, negative_slope=0.1)
306
+
307
+ self.with_condition = with_condition
308
+ if with_condition:
309
+ cond_channels = 768
310
+ self.time_emb = SinusoidalTimeEmbedding(128)
311
+ self.time_proj = torch.nn.Sequential(
312
+ torch.nn.Linear(128, cond_channels),
313
+ torch.nn.SiLU(),
314
+ torch.nn.Linear(cond_channels, cond_channels),
315
+ )
316
+ else:
317
+ cond_channels = None
318
+
319
+ self.block_in = torch.nn.Sequential(
320
+ Conv3dSame(
321
+ num_channels,
322
+ hidden_channels,
323
+ kernel_size=init_kernel_size
324
+ ),
325
+ activation(),
326
+ )
327
+
328
+ BLUR_KERNEL_MAP = {
329
+ 3: (1,2,1),
330
+ 4: (1,3,3,1),
331
+ 5: (1,4,6,4,1),
332
+ }
333
+ num_downsample_temp_stage = int(num_stages * 1/3)
334
+ downsample_temp = [False] * num_downsample_temp_stage + [True] * (num_stages - num_downsample_temp_stage)
335
+
336
+ discriminator_blocks = []
337
+ for i_level in range(num_stages):
338
+ in_channels = hidden_channels * in_channel_mult[i_level]
339
+ out_channels = hidden_channels * in_channel_mult[i_level + 1]
340
+ conv_block = ModulatedConv3dSame(
341
+ in_channels,
342
+ out_channels,
343
+ kernel_size=3,
344
+ cond_channels=cond_channels
345
+ )
346
+ discriminator_blocks.append(conv_block)
347
+ down_block = torch.nn.Sequential(
348
+ torch.nn.AvgPool3d(kernel_size=2, stride=(2, 2, 2) if downsample_temp[i_level] else (1, 2, 2)) if not blur_resample else BlurBlock3D(BLUR_KERNEL_MAP[blur_kernel_size], stride=(2, 2, 2) if downsample_temp[i_level] else (1, 2, 2)),
349
+ torch.nn.GroupNorm(32, out_channels),
350
+ activation(),
351
+ )
352
+ discriminator_blocks.append(down_block)
353
+
354
+ self.blocks = torch.nn.ModuleList(discriminator_blocks)
355
+ self.pool = torch.nn.AdaptiveMaxPool3d((2, 16, 16))
356
+ self.to_logits = torch.nn.Sequential(
357
+ Conv3dSame(out_channels, out_channels, 1),
358
+ activation(),
359
+ Conv3dSame(out_channels, 1, kernel_size=5)
360
+ )
361
+
362
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
363
+ """ Forward pass.
364
+
365
+ Args:
366
+ x -> torch.Tensor: The input tensor of shape [b t c h w].
367
+
368
+ Returns:
369
+ output -> torch.Tensor: The output tensor.
370
+ """
371
+
372
+ x = rearrange(x, 'b t c h w -> b c t h w')
373
+
374
+ hidden_states = self.block_in(x)
375
+ if condition is not None and self.with_condition:
376
+ temb = self.time_proj(self.time_emb(condition * 1000.0))
377
+ else:
378
+ temb = None
379
+
380
+ for i, block in enumerate(self.blocks):
381
+ if i % 2 == 0:
382
+ hidden_states = block(hidden_states, temb) # conv_block
383
+ else:
384
+ hidden_states = block(hidden_states) # down_block
385
+
386
+ hidden_states = self.pool(hidden_states)
387
+ return self.to_logits(hidden_states)
388
+
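A quick shape check for the two discriminators above (illustrative only). Both produce patch-level logits rather than a single scalar per sample:

import torch
from misc.discriminator import NLayerDiscriminator, NLayer3DDiscriminator

disc = NLayerDiscriminator()
images = torch.randn(2, 3, 256, 256)
print(disc(images).shape)                # torch.Size([2, 1, 16, 16])

disc3d = NLayer3DDiscriminator()
video = torch.randn(1, 8, 3, 128, 128)   # (batch, frames, channels, height, width)
print(disc3d(video).shape)               # torch.Size([1, 1, 2, 16, 16])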
misc/lpips.py ADDED
@@ -0,0 +1,142 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """This file contains code for LPIPS.
6
+ Reference:
7
+ https://github.com/richzhang/PerceptualSimilarity/
8
+ https://github.com/CompVis/taming-transformers/blob/master/taming/modules/losses/lpips.py
9
+ https://github.com/CompVis/taming-transformers/blob/master/taming/util.py
10
+ """
11
+
12
+ import os
13
+ import hashlib
14
+ import requests
15
+ from collections import namedtuple
16
+ from tqdm import tqdm
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from torchvision import models
22
+
23
+ _LPIPS_MEAN = [-0.030, -0.088, -0.188]
24
+ _LPIPS_STD = [0.458, 0.448, 0.450]
25
+
26
+
27
+ class LPIPS(nn.Module):
28
+ # Learned perceptual metric.
29
+ def __init__(self, dist, use_dropout=True):
30
+ super().__init__()
31
+ self.dist = dist
32
+ self.scaling_layer = ScalingLayer()
33
+ self.chns = [64, 128, 256, 512, 512] # vgg16 features
34
+ self.net = vgg16(pretrained=True, requires_grad=False)
35
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
36
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
37
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
38
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
39
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
40
+ self.load_pretrained()
41
+ for param in self.parameters():
42
+ param.requires_grad = False
43
+
44
+ def load_pretrained(self):
45
+ VGG_PATH = os.path.join("/root/.cache", "vgg.pth")
46
+ self.load_state_dict(torch.load(VGG_PATH, map_location=torch.device("cpu")), strict=False)
47
+
48
+ def forward(self, input, target):
49
+ # Notably, the LPIPS w/ pre-trained weights expect the input in the range of [-1, 1].
50
+ # However, our codebase assumes all inputs are in range of [0, 1], and thus a scaling is needed.
51
+ input = input * 2. - 1.
52
+ target = target * 2. - 1.
53
+ in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
54
+ outs0, outs1 = self.net(in0_input), self.net(in1_input)
55
+ feats0, feats1, diffs = {}, {}, {}
56
+ lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
57
+ for kk in range(len(self.chns)):
58
+ feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
59
+ diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
60
+
61
+ res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
62
+ val = res[0]
63
+ for l in range(1, len(self.chns)):
64
+ val += res[l]
65
+ return val
66
+
67
+
68
+ class ScalingLayer(nn.Module):
69
+ def __init__(self):
70
+ super(ScalingLayer, self).__init__()
71
+ self.register_buffer("shift", torch.Tensor(_LPIPS_MEAN)[None, :, None, None])
72
+ self.register_buffer("scale", torch.Tensor(_LPIPS_STD)[None, :, None, None])
73
+
74
+ def forward(self, inp):
75
+ return (inp - self.shift) / self.scale
76
+
77
+
78
+ class NetLinLayer(nn.Module):
79
+ """A single linear layer which does a 1x1 conv."""
80
+
81
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
82
+ super(NetLinLayer, self).__init__()
83
+ layers = (
84
+ [
85
+ nn.Dropout(),
86
+ ]
87
+ if (use_dropout)
88
+ else []
89
+ )
90
+ layers += [
91
+ nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False),
92
+ ]
93
+ self.model = nn.Sequential(*layers)
94
+
95
+
96
+ class vgg16(torch.nn.Module):
97
+ def __init__(self, requires_grad=False, pretrained=True):
98
+ super(vgg16, self).__init__()
99
+ vgg_pretrained_features = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1).features
100
+ self.slice1 = torch.nn.Sequential()
101
+ self.slice2 = torch.nn.Sequential()
102
+ self.slice3 = torch.nn.Sequential()
103
+ self.slice4 = torch.nn.Sequential()
104
+ self.slice5 = torch.nn.Sequential()
105
+ self.N_slices = 5
106
+ for x in range(4):
107
+ self.slice1.add_module(str(x), vgg_pretrained_features[x])
108
+ for x in range(4, 9):
109
+ self.slice2.add_module(str(x), vgg_pretrained_features[x])
110
+ for x in range(9, 16):
111
+ self.slice3.add_module(str(x), vgg_pretrained_features[x])
112
+ for x in range(16, 23):
113
+ self.slice4.add_module(str(x), vgg_pretrained_features[x])
114
+ for x in range(23, 30):
115
+ self.slice5.add_module(str(x), vgg_pretrained_features[x])
116
+ if not requires_grad:
117
+ for param in self.parameters():
118
+ param.requires_grad = False
119
+
120
+ def forward(self, X):
121
+ h = self.slice1(X)
122
+ h_relu1_2 = h
123
+ h = self.slice2(h)
124
+ h_relu2_2 = h
125
+ h = self.slice3(h)
126
+ h_relu3_3 = h
127
+ h = self.slice4(h)
128
+ h_relu4_3 = h
129
+ h = self.slice5(h)
130
+ h_relu5_3 = h
131
+ vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"])
132
+ out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
133
+ return out
134
+
135
+
136
+ def normalize_tensor(x, eps=1e-10):
137
+ norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True))
138
+ return x / (norm_factor + eps)
139
+
140
+
141
+ def spatial_average(x, keepdim=True):
142
+ return x.mean([2, 3], keepdim=keepdim)
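Illustrative usage of the LPIPS module above. Note that it expects inputs in [0, 1] and rescales them internally to the [-1, 1] range of the pretrained weights, and that `load_pretrained` assumes the linear-layer weights are cached at /root/.cache/vgg.pth:

import torch
from misc.lpips import LPIPS

lpips = LPIPS(dist=None).eval()   # dist is only stored, not used in forward
a = torch.rand(4, 3, 224, 224)    # inputs already in [0, 1]
b = torch.rand(4, 3, 224, 224)
print(lpips(a, b).shape)          # torch.Size([4, 1, 1, 1]) per-image distances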
misc/pe.py ADDED
@@ -0,0 +1,151 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ from math import pi, sqrt
6
+ import torch
7
+ from torch import nn
8
+
9
+ from einops import rearrange, repeat
10
+
11
+
12
+ def broadcat(tensors, dim = -1):
13
+ num_tensors = len(tensors)
14
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
15
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
16
+ shape_len = list(shape_lens)[0]
17
+ dim = (dim + shape_len) if dim < 0 else dim
18
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
19
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
20
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
21
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
22
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
23
+ expanded_dims.insert(dim, (dim, dims[dim]))
24
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
25
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
26
+ return torch.cat(tensors, dim = dim)
27
+
28
+ def rotate_half(x):
29
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
30
+ x1, x2 = x.unbind(dim = -1)
31
+ x = torch.stack((-x2, x1), dim = -1)
32
+ return rearrange(x, '... d r -> ... (d r)')
33
+
34
+
35
+ def apply_rope(t, freqs):
36
+ return t * freqs.cos() + rotate_half(t) * freqs.sin()
37
+
38
+
39
+ def get_positions(h=0, w=0, txt_size=0, pt_seq_len=None, duplicate=0, mode='3d'):
40
+ assert mode in ['1d', '2d', '3d'], "mode must be one of ['1d', '2d', '3d']"
41
+ assert h * w + txt_size > 0, "at least one of img_size or txt_size must be greater than 0"
42
+ mean_len = sqrt(h * w)
43
+ pt_seq_len = pt_seq_len or mean_len
44
+ if mode == '1d':
45
+ pos_txt = torch.arange(txt_size)
46
+ pos_img = torch.arange(h * w) # / (h * w) * (pt_seq_len ** 2)
47
+ pos = torch.cat([pos_txt, pos_img + txt_size], dim=0).unsqueeze(-1)
48
+ else:
49
+ assert h * w > 0, "2D/3D RoPE requires img_size > 0"
50
+
51
+ px = torch.arange(h) / mean_len * pt_seq_len
52
+ py = torch.arange(w) / mean_len * pt_seq_len
53
+ px, py = [pi.reshape(-1) for pi in torch.meshgrid(px, py, indexing='ij')]
54
+ if mode == '2d':
55
+ assert txt_size == 0, "2D RoPE does not support text conditioning"
56
+ pos = [px, py]
57
+
58
+ else: # mode == '3d'
59
+ if duplicate == 0:
60
+ pos = [px, py, torch.zeros_like(px)]
61
+ else: # it has sequence length, this is for VideoData
62
+ pos = [torch.cat([px for _ in range(duplicate)]),
63
+ torch.cat([py for _ in range(duplicate)]),
64
+ torch.arange(duplicate).repeat_interleave(h * w)]
65
+
66
+ if txt_size > 0: # text is used as conditioned
67
+ pt = torch.arange(txt_size) / txt_size * pt_seq_len
68
+ pos = [ torch.cat([torch.zeros_like(pt), pos[0]]),
69
+ torch.cat([torch.zeros_like(pt), pos[1]]),
70
+ torch.cat([pt, pos[2]])]
71
+ pos = torch.stack(pos, dim=-1)
72
+ return pos
73
+
74
+
75
+ class VisionRotaryEmbeddingFast(nn.Module):
76
+ def __init__(
77
+ self,
78
+ dim, # half-dim
79
+ pt_seq_len=16,
80
+ ft_seq_len=None,
81
+ latent_len=0,
82
+ custom_freqs = None,
83
+ freqs_for = 'lang',
84
+ theta = 10000,
85
+ max_freq = 10,
86
+ num_freqs = 1,
87
+ dim_split=None,
88
+ no_buffer=False,
89
+ is_1d=False,
90
+ ):
91
+ super().__init__()
92
+
93
+ # length is normalized to pt_seq_len
94
+ if is_1d: # standard 1D-RoPE
95
+ assert freqs_for == 'lang', "RoPE for language settings"
96
+ dim_split, dim = [dim], 2 * dim
97
+ self.freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
98
+
99
+ else:
100
+ if ft_seq_len is None:
101
+ ft_seq_len = pt_seq_len
102
+ if latent_len > 0:
103
+ if dim_split is None: dim_split = [dim - 8, 8]
104
+ dim, latent_dim = dim_split
105
+ else:
106
+ dim_split = [dim]
107
+ if custom_freqs:
108
+ self.freqs = custom_freqs
109
+ elif freqs_for == 'lang':
110
+ self.freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
111
+ elif freqs_for == 'pixel':
112
+ self.freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
113
+ elif freqs_for == 'constant':
114
+ self.freqs = torch.ones(num_freqs).float()
115
+ else:
116
+ raise ValueError(f'unknown modality {freqs_for}')
117
+ if latent_len > 0:
118
+ self.freqs2 = 1. / (theta ** (torch.arange(0, latent_dim).float() / latent_dim))
119
+
120
+ self.is_1d = is_1d
121
+ self.pt_seq_len = pt_seq_len
122
+ self.ft_seq_len = ft_seq_len
123
+ self.latent_len = latent_len
124
+
125
+ # NOTE: deprecated (do not touch, will affect old checkpoints) #
126
+ if not no_buffer and pt_seq_len > 0:
127
+ _deprecated = torch.zeros(pt_seq_len ** 2, sum(dim_split) * 2)
128
+ if latent_len > 0:
129
+ _deprecated = torch.cat([torch.zeros(latent_len, sum(dim_split) * 2), _deprecated], dim=0)
130
+ self.register_buffer("freqs_cos", _deprecated)
131
+ self.register_buffer("freqs_sin", _deprecated)
132
+ # ------------------------------------------------------------ #
133
+
134
+ def forward(self, pos):
135
+ if not isinstance(pos, torch.Tensor):
136
+ pos = torch.tensor(pos).to(self.freqs_cos.device)
137
+
138
+ if not self.is_1d: # this is 2D or 3D rope
139
+ assert pos.shape[-1] > 1, "2D/3D RoPE requires multi-dimensional positions"
140
+ freqs_all = [
141
+ torch.einsum('..., f -> ... f', pos[..., 0], self.freqs.to(pos.device)),
142
+ torch.einsum('..., f -> ... f', pos[..., 1], self.freqs.to(pos.device)),
143
+ ]
144
+ if pos.shape[-1] == 3: # additional latent dimension (maybe text)
145
+ freqs_all.append(torch.einsum('..., f -> ... f', pos[..., 2], self.freqs2.to(pos.device)))
146
+ freqs_all = torch.cat(freqs_all, -1)
147
+ else:
148
+ freqs_all = torch.einsum('..., f -> ... f', pos[..., 0], self.freqs.to(pos.device))
149
+ freqs_all = repeat(freqs_all, '... n -> ... (n r)', r = 2)
150
+ return freqs_all
151
+
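A minimal sketch of the assumed calling convention for the rotary embedding above, using the simpler 1D mode (the model code that actually consumes these embeddings is not part of this excerpt):

import torch
from misc.pe import VisionRotaryEmbeddingFast, get_positions, apply_rope

head_dim = 64
rope = VisionRotaryEmbeddingFast(dim=head_dim // 2, is_1d=True, pt_seq_len=16)

# 32 text tokens followed by an 8x8 grid of image tokens, flattened to 1D positions.
pos = get_positions(h=8, w=8, txt_size=32, mode='1d').float()
freqs = rope(pos)                     # (96, 64) rotation angles

q = torch.randn(2, 12, 96, head_dim)  # (batch, heads, tokens, head_dim)
print(apply_rope(q, freqs).shape)     # torch.Size([2, 12, 96, 64])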
misc/wan_vae2.py ADDED
@@ -0,0 +1,1000 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+
6
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
7
+
8
+ import logging
9
+ import torch
10
+ import torch.cuda.amp as amp
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from einops import rearrange
14
+
15
+ __all__ = [
16
+ "Wan2_2_VAE",
17
+ ]
18
+
19
+ CACHE_T = 2
20
+
21
+
22
+ class CausalConv3d(nn.Conv3d):
23
+ """
24
+ Causal 3D convolution.
25
+ """
26
+
27
+ def __init__(self, *args, **kwargs):
28
+ super().__init__(*args, **kwargs)
29
+ self._padding = (
30
+ self.padding[2],
31
+ self.padding[2],
32
+ self.padding[1],
33
+ self.padding[1],
34
+ 2 * self.padding[0],
35
+ 0,
36
+ )
37
+ self.padding = (0, 0, 0)
38
+
39
+ def forward(self, x, cache_x=None):
40
+ padding = list(self._padding)
41
+ if cache_x is not None and self._padding[4] > 0:
42
+ cache_x = cache_x.to(x.device)
43
+ x = torch.cat([cache_x, x], dim=2)
44
+ padding[4] -= cache_x.shape[2]
45
+ x = F.pad(x, padding)
46
+
47
+ return super().forward(x)
48
+
49
+
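The reordered padding tuple above front-loads all temporal padding, which is what makes this convolution causal in time. A small self-contained sketch (kernel and tensor sizes are illustrative assumptions): perturbing a future frame never changes earlier outputs.

import torch
import torch.nn.functional as F

conv = torch.nn.Conv3d(3, 3, kernel_size=3)            # no built-in padding
x = torch.randn(1, 3, 8, 16, 16)                       # (B, C, T, H, W)
# F.pad order is (W_left, W_right, H_left, H_right, T_left, T_right)
pad = (1, 1, 1, 1, 2, 0)                               # 2 * padding[0] frames on the left, none on the right
y = conv(F.pad(x, pad))

x2 = x.clone()
x2[:, :, 5] += 1.0                                     # change a "future" frame
y2 = conv(F.pad(x2, pad))
assert torch.allclose(y[:, :, :5], y2[:, :, :5], atol=1e-6)   # frames 0..4 are unaffected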
50
+ class RMS_norm(nn.Module):
51
+
52
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
53
+ super().__init__()
54
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
55
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
56
+
57
+ self.channel_first = channel_first
58
+ self.scale = dim**0.5
59
+ self.gamma = nn.Parameter(torch.ones(shape))
60
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
61
+
62
+ def forward(self, x):
63
+ return (F.normalize(x, dim=(1 if self.channel_first else -1)) *
64
+ self.scale * self.gamma + self.bias)
65
+
66
+
67
+ class Upsample(nn.Upsample):
68
+
69
+ def forward(self, x):
70
+ """
71
+ Fix bfloat16 support for nearest neighbor interpolation.
72
+ """
73
+ return super().forward(x.float()).type_as(x)
74
+
75
+
76
+ class Resample(nn.Module):
77
+
78
+ def __init__(self, dim, mode):
79
+ assert mode in (
80
+ "none",
81
+ "upsample2d",
82
+ "upsample3d",
83
+ "downsample2d",
84
+ "downsample3d",
85
+ )
86
+ super().__init__()
87
+ self.dim = dim
88
+ self.mode = mode
89
+
90
+ # layers
91
+ if mode == "upsample2d":
92
+ self.resample = nn.Sequential(
93
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
94
+ nn.Conv2d(dim, dim, 3, padding=1),
95
+ )
96
+ elif mode == "upsample3d":
97
+ self.resample = nn.Sequential(
98
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
99
+ nn.Conv2d(dim, dim, 3, padding=1),
100
+ # nn.Conv2d(dim, dim//2, 3, padding=1)
101
+ )
102
+ self.time_conv = CausalConv3d(
103
+ dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
104
+ elif mode == "downsample2d":
105
+ self.resample = nn.Sequential(
106
+ nn.ZeroPad2d((0, 1, 0, 1)),
107
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
108
+ elif mode == "downsample3d":
109
+ self.resample = nn.Sequential(
110
+ nn.ZeroPad2d((0, 1, 0, 1)),
111
+ nn.Conv2d(dim, dim, 3, stride=(2, 2)))
112
+ self.time_conv = CausalConv3d(
113
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
114
+ else:
115
+ self.resample = nn.Identity()
116
+
117
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
118
+ b, c, t, h, w = x.size()
119
+ if self.mode == "upsample3d":
120
+ if feat_cache is not None:
121
+ idx = feat_idx[0]
122
+ if feat_cache[idx] is None:
123
+ feat_cache[idx] = "Rep"
124
+ feat_idx[0] += 1
125
+ else:
126
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
127
+ if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
128
+ feat_cache[idx] != "Rep"):
129
+ # if the chunk is short, prepend the last frame cached from the previous chunk
130
+ cache_x = torch.cat(
131
+ [
132
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
133
+ cache_x.device),
134
+ cache_x,
135
+ ],
136
+ dim=2,
137
+ )
138
+ if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
139
+ feat_cache[idx] == "Rep"):
140
+ cache_x = torch.cat(
141
+ [
142
+ torch.zeros_like(cache_x).to(cache_x.device),
143
+ cache_x
144
+ ],
145
+ dim=2,
146
+ )
147
+ if feat_cache[idx] == "Rep":
148
+ x = self.time_conv(x)
149
+ else:
150
+ x = self.time_conv(x, feat_cache[idx])
151
+ feat_cache[idx] = cache_x
152
+ feat_idx[0] += 1
153
+ x = x.reshape(b, 2, c, t, h, w)
154
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
155
+ 3)
156
+ x = x.reshape(b, c, t * 2, h, w)
157
+ t = x.shape[2]
158
+ x = rearrange(x, "b c t h w -> (b t) c h w")
159
+ x = self.resample(x)
160
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
161
+
162
+ if self.mode == "downsample3d":
163
+ if feat_cache is not None:
164
+ idx = feat_idx[0]
165
+ if feat_cache[idx] is None:
166
+ feat_cache[idx] = x.clone()
167
+ feat_idx[0] += 1
168
+ else:
169
+ cache_x = x[:, :, -1:, :, :].clone()
170
+ x = self.time_conv(
171
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
172
+ feat_cache[idx] = cache_x
173
+ feat_idx[0] += 1
174
+ return x
175
+
176
+ def init_weight(self, conv):
177
+ conv_weight = conv.weight.detach().clone()
178
+ nn.init.zeros_(conv_weight)
179
+ c1, c2, t, h, w = conv_weight.size()
180
+ one_matrix = torch.eye(c1, c2)
181
+ init_matrix = one_matrix
182
+ nn.init.zeros_(conv_weight)
183
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
184
+ conv.weight = nn.Parameter(conv_weight)
185
+ nn.init.zeros_(conv.bias.data)
186
+
187
+ def init_weight2(self, conv):
188
+ conv_weight = conv.weight.data.detach().clone()
189
+ nn.init.zeros_(conv_weight)
190
+ c1, c2, t, h, w = conv_weight.size()
191
+ init_matrix = torch.eye(c1 // 2, c2)
192
+ conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
193
+ conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
194
+ conv.weight = nn.Parameter(conv_weight)
195
+ nn.init.zeros_(conv.bias.data)
196
+
197
+
198
+ class ResidualBlock(nn.Module):
199
+
200
+ def __init__(self, in_dim, out_dim, dropout=0.0):
201
+ super().__init__()
202
+ self.in_dim = in_dim
203
+ self.out_dim = out_dim
204
+
205
+ # layers
206
+ self.residual = nn.Sequential(
207
+ RMS_norm(in_dim, images=False),
208
+ nn.SiLU(),
209
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
210
+ RMS_norm(out_dim, images=False),
211
+ nn.SiLU(),
212
+ nn.Dropout(dropout),
213
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
214
+ )
215
+ self.shortcut = (
216
+ CausalConv3d(in_dim, out_dim, 1)
217
+ if in_dim != out_dim else nn.Identity())
218
+
219
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
220
+ h = self.shortcut(x)
221
+ for layer in self.residual:
222
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
223
+ idx = feat_idx[0]
224
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
225
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
226
+ # if the chunk is short, prepend the last frame cached from the previous chunk
227
+ cache_x = torch.cat(
228
+ [
229
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
230
+ cache_x.device),
231
+ cache_x,
232
+ ],
233
+ dim=2,
234
+ )
235
+ x = layer(x, feat_cache[idx])
236
+ feat_cache[idx] = cache_x
237
+ feat_idx[0] += 1
238
+ else:
239
+ x = layer(x)
240
+ return x + h
241
+
242
+
243
+ class AttentionBlock(nn.Module):
244
+ """
245
+ Single-head self-attention over the spatial positions of each frame.
246
+ """
247
+
248
+ def __init__(self, dim):
249
+ super().__init__()
250
+ self.dim = dim
251
+
252
+ # layers
253
+ self.norm = RMS_norm(dim)
254
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
255
+ self.proj = nn.Conv2d(dim, dim, 1)
256
+
257
+ # zero out the last layer params
258
+ nn.init.zeros_(self.proj.weight)
259
+
260
+ def forward(self, x):
261
+ identity = x
262
+ b, c, t, h, w = x.size()
263
+ x = rearrange(x, "b c t h w -> (b t) c h w")
264
+ x = self.norm(x)
265
+ # compute query, key, value
266
+ q, k, v = (
267
+ self.to_qkv(x).reshape(b * t, 1, c * 3,
268
+ -1).permute(0, 1, 3,
269
+ 2).contiguous().chunk(3, dim=-1))
270
+
271
+ # apply attention
272
+ x = F.scaled_dot_product_attention(
273
+ q,
274
+ k,
275
+ v,
276
+ )
277
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
278
+
279
+ # output
280
+ x = self.proj(x)
281
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
282
+ return x + identity
283
+
284
+
285
+ def patchify(x, patch_size):
286
+ if patch_size == 1:
287
+ return x
288
+ if x.dim() == 4:
289
+ x = rearrange(
290
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
291
+ elif x.dim() == 5:
292
+ x = rearrange(
293
+ x,
294
+ "b c f (h q) (w r) -> b (c r q) f h w",
295
+ q=patch_size,
296
+ r=patch_size,
297
+ )
298
+ else:
299
+ raise ValueError(f"Invalid input shape: {x.shape}")
300
+
301
+ return x
302
+
303
+
304
+ def unpatchify(x, patch_size):
305
+ if patch_size == 1:
306
+ return x
307
+
308
+ if x.dim() == 4:
309
+ x = rearrange(
310
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
311
+ elif x.dim() == 5:
312
+ x = rearrange(
313
+ x,
314
+ "b (c r q) f h w -> b c f (h q) (w r)",
315
+ q=patch_size,
316
+ r=patch_size,
317
+ )
318
+ return x
319
+
320
+
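The two helpers above are inverse space-to-channel reshapes. A hedged round-trip sketch for the 5D video case, reproducing the same einops patterns inline:

import torch
from einops import rearrange

p = 2
x = torch.randn(1, 3, 4, 8, 8)                                    # (b, c, f, h, w)
packed = rearrange(x, "b c f (h q) (w r) -> b (c r q) f h w", q=p, r=p)
assert packed.shape == (1, 3 * p * p, 4, 4, 4)
restored = rearrange(packed, "b (c r q) f h w -> b c f (h q) (w r)", q=p, r=p)
assert torch.equal(restored, x)                                   # exact inverse, no information lost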
321
+ class AvgDown3D(nn.Module):
322
+
323
+ def __init__(
324
+ self,
325
+ in_channels,
326
+ out_channels,
327
+ factor_t,
328
+ factor_s=1,
329
+ ):
330
+ super().__init__()
331
+ self.in_channels = in_channels
332
+ self.out_channels = out_channels
333
+ self.factor_t = factor_t
334
+ self.factor_s = factor_s
335
+ self.factor = self.factor_t * self.factor_s * self.factor_s
336
+
337
+ assert in_channels * self.factor % out_channels == 0
338
+ self.group_size = in_channels * self.factor // out_channels
339
+
340
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
341
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
342
+ pad = (0, 0, 0, 0, pad_t, 0)
343
+ x = F.pad(x, pad)
344
+ B, C, T, H, W = x.shape
345
+ x = x.view(
346
+ B,
347
+ C,
348
+ T // self.factor_t,
349
+ self.factor_t,
350
+ H // self.factor_s,
351
+ self.factor_s,
352
+ W // self.factor_s,
353
+ self.factor_s,
354
+ )
355
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
356
+ x = x.view(
357
+ B,
358
+ C * self.factor,
359
+ T // self.factor_t,
360
+ H // self.factor_s,
361
+ W // self.factor_s,
362
+ )
363
+ x = x.view(
364
+ B,
365
+ self.out_channels,
366
+ self.group_size,
367
+ T // self.factor_t,
368
+ H // self.factor_s,
369
+ W // self.factor_s,
370
+ )
371
+ x = x.mean(dim=2)
372
+ return x
373
+
374
+
375
+ class DupUp3D(nn.Module):
376
+
377
+ def __init__(
378
+ self,
379
+ in_channels: int,
380
+ out_channels: int,
381
+ factor_t,
382
+ factor_s=1,
383
+ ):
384
+ super().__init__()
385
+ self.in_channels = in_channels
386
+ self.out_channels = out_channels
387
+
388
+ self.factor_t = factor_t
389
+ self.factor_s = factor_s
390
+ self.factor = self.factor_t * self.factor_s * self.factor_s
391
+
392
+ assert out_channels * self.factor % in_channels == 0
393
+ self.repeats = out_channels * self.factor // in_channels
394
+
395
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
396
+ x = x.repeat_interleave(self.repeats, dim=1)
397
+ x = x.view(
398
+ x.size(0),
399
+ self.out_channels,
400
+ self.factor_t,
401
+ self.factor_s,
402
+ self.factor_s,
403
+ x.size(2),
404
+ x.size(3),
405
+ x.size(4),
406
+ )
407
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
408
+ x = x.view(
409
+ x.size(0),
410
+ self.out_channels,
411
+ x.size(2) * self.factor_t,
412
+ x.size(4) * self.factor_s,
413
+ x.size(6) * self.factor_s,
414
+ )
415
+ if first_chunk:
416
+ x = x[:, :, self.factor_t - 1:, :, :]
417
+ return x
418
+
419
+
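A hedged sketch of the core idea in AvgDown3D for the purely spatial case (factor_t=1, factor_s=2, in_channels == out_channels): the reshape-and-mean reduces to ordinary average pooling. The channel regrouping that handles in_channels != out_channels is omitted here.

import torch
import torch.nn.functional as F

B, C, T, H, W = 1, 4, 3, 8, 8
x = torch.randn(B, C, T, H, W)

# average over each 2x2 spatial block via reshape, as the module does internally
pooled = x.view(B, C, T, H // 2, 2, W // 2, 2).mean(dim=(4, 6))
assert torch.allclose(pooled, F.avg_pool3d(x, kernel_size=(1, 2, 2)), atol=1e-6)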
420
+ class Down_ResidualBlock(nn.Module):
421
+
422
+ def __init__(self,
423
+ in_dim,
424
+ out_dim,
425
+ dropout,
426
+ mult,
427
+ temperal_downsample=False,
428
+ down_flag=False):
429
+ super().__init__()
430
+
431
+ # Shortcut path with downsample
432
+ self.avg_shortcut = AvgDown3D(
433
+ in_dim,
434
+ out_dim,
435
+ factor_t=2 if temperal_downsample else 1,
436
+ factor_s=2 if down_flag else 1,
437
+ )
438
+
439
+ # Main path with residual blocks and downsample
440
+ downsamples = []
441
+ for _ in range(mult):
442
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
443
+ in_dim = out_dim
444
+
445
+ # Add the final downsample block
446
+ if down_flag:
447
+ mode = "downsample3d" if temperal_downsample else "downsample2d"
448
+ downsamples.append(Resample(out_dim, mode=mode))
449
+
450
+ self.downsamples = nn.Sequential(*downsamples)
451
+
452
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
453
+ x_copy = x.clone()
454
+ for module in self.downsamples:
455
+ x = module(x, feat_cache, feat_idx)
456
+
457
+ return x + self.avg_shortcut(x_copy)
458
+
459
+
460
+ class Up_ResidualBlock(nn.Module):
461
+
462
+ def __init__(self,
463
+ in_dim,
464
+ out_dim,
465
+ dropout,
466
+ mult,
467
+ temperal_upsample=False,
468
+ up_flag=False):
469
+ super().__init__()
470
+ # Shortcut path with upsample
471
+ if up_flag:
472
+ self.avg_shortcut = DupUp3D(
473
+ in_dim,
474
+ out_dim,
475
+ factor_t=2 if temperal_upsample else 1,
476
+ factor_s=2 if up_flag else 1,
477
+ )
478
+ else:
479
+ self.avg_shortcut = None
480
+
481
+ # Main path with residual blocks and upsample
482
+ upsamples = []
483
+ for _ in range(mult):
484
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
485
+ in_dim = out_dim
486
+
487
+ # Add the final upsample block
488
+ if up_flag:
489
+ mode = "upsample3d" if temperal_upsample else "upsample2d"
490
+ upsamples.append(Resample(out_dim, mode=mode))
491
+
492
+ self.upsamples = nn.Sequential(*upsamples)
493
+
494
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
495
+ x_main = x.clone()
496
+ for module in self.upsamples:
497
+ x_main = module(x_main, feat_cache, feat_idx)
498
+ if self.avg_shortcut is not None:
499
+ x_shortcut = self.avg_shortcut(x, first_chunk)
500
+ return x_main + x_shortcut
501
+ else:
502
+ return x_main
503
+
504
+
505
+ class Encoder3d(nn.Module):
506
+
507
+ def __init__(
508
+ self,
509
+ dim=128,
510
+ z_dim=4,
511
+ dim_mult=[1, 2, 4, 4],
512
+ num_res_blocks=2,
513
+ attn_scales=[],
514
+ temperal_downsample=[True, True, False],
515
+ dropout=0.0,
516
+ ):
517
+ super().__init__()
518
+ self.dim = dim
519
+ self.z_dim = z_dim
520
+ self.dim_mult = dim_mult
521
+ self.num_res_blocks = num_res_blocks
522
+ self.attn_scales = attn_scales
523
+ self.temperal_downsample = temperal_downsample
524
+
525
+ # dimensions
526
+ dims = [dim * u for u in [1] + dim_mult]
527
+ scale = 1.0
528
+
529
+ # init block
530
+ self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
531
+
532
+ # downsample blocks
533
+ downsamples = []
534
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
535
+ t_down_flag = (
536
+ temperal_downsample[i]
537
+ if i < len(temperal_downsample) else False)
538
+ downsamples.append(
539
+ Down_ResidualBlock(
540
+ in_dim=in_dim,
541
+ out_dim=out_dim,
542
+ dropout=dropout,
543
+ mult=num_res_blocks,
544
+ temperal_downsample=t_down_flag,
545
+ down_flag=i != len(dim_mult) - 1,
546
+ ))
547
+ scale /= 2.0
548
+ self.downsamples = nn.Sequential(*downsamples)
549
+
550
+ # middle blocks
551
+ self.middle = nn.Sequential(
552
+ ResidualBlock(out_dim, out_dim, dropout),
553
+ AttentionBlock(out_dim),
554
+ ResidualBlock(out_dim, out_dim, dropout),
555
+ )
556
+
557
+ # # output blocks
558
+ self.head = nn.Sequential(
559
+ RMS_norm(out_dim, images=False),
560
+ nn.SiLU(),
561
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
562
+ )
563
+
564
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
565
+
566
+ if feat_cache is not None:
567
+ idx = feat_idx[0]
568
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
569
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
570
+ cache_x = torch.cat(
571
+ [
572
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
573
+ cache_x.device),
574
+ cache_x,
575
+ ],
576
+ dim=2,
577
+ )
578
+ x = self.conv1(x, feat_cache[idx])
579
+ feat_cache[idx] = cache_x
580
+ feat_idx[0] += 1
581
+ else:
582
+ x = self.conv1(x)
583
+
584
+ ## downsamples
585
+ for layer in self.downsamples:
586
+ if feat_cache is not None:
587
+ x = layer(x, feat_cache, feat_idx)
588
+ else:
589
+ x = layer(x)
590
+
591
+ ## middle
592
+ for layer in self.middle:
593
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
594
+ x = layer(x, feat_cache, feat_idx)
595
+ else:
596
+ x = layer(x)
597
+
598
+ ## head
599
+ for layer in self.head:
600
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
601
+ idx = feat_idx[0]
602
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
603
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
604
+ cache_x = torch.cat(
605
+ [
606
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
607
+ cache_x.device),
608
+ cache_x,
609
+ ],
610
+ dim=2,
611
+ )
612
+ x = layer(x, feat_cache[idx])
613
+ feat_cache[idx] = cache_x
614
+ feat_idx[0] += 1
615
+ else:
616
+ x = layer(x)
617
+
618
+ return x
619
+
620
+
621
+ class Decoder3d(nn.Module):
622
+
623
+ def __init__(
624
+ self,
625
+ dim=128,
626
+ z_dim=4,
627
+ dim_mult=[1, 2, 4, 4],
628
+ num_res_blocks=2,
629
+ attn_scales=[],
630
+ temperal_upsample=[False, True, True],
631
+ dropout=0.0,
632
+ ):
633
+ super().__init__()
634
+ self.dim = dim
635
+ self.z_dim = z_dim
636
+ self.dim_mult = dim_mult
637
+ self.num_res_blocks = num_res_blocks
638
+ self.attn_scales = attn_scales
639
+ self.temperal_upsample = temperal_upsample
640
+
641
+ # dimensions
642
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
643
+ scale = 1.0 / 2**(len(dim_mult) - 2)
644
+ # init block
645
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
646
+
647
+ # middle blocks
648
+ self.middle = nn.Sequential(
649
+ ResidualBlock(dims[0], dims[0], dropout),
650
+ AttentionBlock(dims[0]),
651
+ ResidualBlock(dims[0], dims[0], dropout),
652
+ )
653
+
654
+ # upsample blocks
655
+ upsamples = []
656
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
657
+ t_up_flag = temperal_upsample[i] if i < len(
658
+ temperal_upsample) else False
659
+ upsamples.append(
660
+ Up_ResidualBlock(
661
+ in_dim=in_dim,
662
+ out_dim=out_dim,
663
+ dropout=dropout,
664
+ mult=num_res_blocks + 1,
665
+ temperal_upsample=t_up_flag,
666
+ up_flag=i != len(dim_mult) - 1,
667
+ ))
668
+ self.upsamples = nn.Sequential(*upsamples)
669
+
670
+ # output blocks
671
+ self.head = nn.Sequential(
672
+ RMS_norm(out_dim, images=False),
673
+ nn.SiLU(),
674
+ CausalConv3d(out_dim, 12, 3, padding=1),
675
+ )
676
+
677
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
678
+ if feat_cache is not None:
679
+ idx = feat_idx[0]
680
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
681
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
682
+ cache_x = torch.cat(
683
+ [
684
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
685
+ cache_x.device),
686
+ cache_x,
687
+ ],
688
+ dim=2,
689
+ )
690
+ x = self.conv1(x, feat_cache[idx])
691
+ feat_cache[idx] = cache_x
692
+ feat_idx[0] += 1
693
+ else:
694
+ x = self.conv1(x)
695
+
696
+ for layer in self.middle:
697
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
698
+ x = layer(x, feat_cache, feat_idx)
699
+ else:
700
+ x = layer(x)
701
+
702
+ ## upsamples
703
+ for layer in self.upsamples:
704
+ if feat_cache is not None:
705
+ x = layer(x, feat_cache, feat_idx, first_chunk)
706
+ else:
707
+ x = layer(x)
708
+
709
+ ## head
710
+ for layer in self.head:
711
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
712
+ idx = feat_idx[0]
713
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
714
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
715
+ cache_x = torch.cat(
716
+ [
717
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
718
+ cache_x.device),
719
+ cache_x,
720
+ ],
721
+ dim=2,
722
+ )
723
+ x = layer(x, feat_cache[idx])
724
+ feat_cache[idx] = cache_x
725
+ feat_idx[0] += 1
726
+ else:
727
+ x = layer(x)
728
+ return x
729
+
730
+
731
+ def count_conv3d(model):
732
+ count = 0
733
+ for m in model.modules():
734
+ if isinstance(m, CausalConv3d):
735
+ count += 1
736
+ return count
737
+
738
+
739
+ class WanVAE_(nn.Module):
740
+
741
+ def __init__(
742
+ self,
743
+ dim=160,
744
+ dec_dim=256,
745
+ z_dim=16,
746
+ dim_mult=[1, 2, 4, 4],
747
+ num_res_blocks=2,
748
+ attn_scales=[],
749
+ temperal_downsample=[True, True, False],
750
+ dropout=0.0,
751
+ device='cuda'
752
+ ):
753
+ super().__init__()
754
+ self.dim = dim
755
+ self.z_dim = z_dim
756
+ self.dim_mult = dim_mult
757
+ self.num_res_blocks = num_res_blocks
758
+ self.attn_scales = attn_scales
759
+ self.temperal_downsample = temperal_downsample
760
+ self.temperal_upsample = temperal_downsample[::-1]
761
+
762
+ # modules
763
+ self.encoder = Encoder3d(
764
+ dim,
765
+ z_dim * 2,
766
+ dim_mult,
767
+ num_res_blocks,
768
+ attn_scales,
769
+ self.temperal_downsample,
770
+ dropout,
771
+ )
772
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
773
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
774
+ self.decoder = Decoder3d(
775
+ dec_dim,
776
+ z_dim,
777
+ dim_mult,
778
+ num_res_blocks,
779
+ attn_scales,
780
+ self.temperal_upsample,
781
+ dropout,
782
+ )
783
+
784
+ self.mean = torch.tensor(
785
+ [
786
+ -0.2289,
787
+ -0.0052,
788
+ -0.1323,
789
+ -0.2339,
790
+ -0.2799,
791
+ 0.0174,
792
+ 0.1838,
793
+ 0.1557,
794
+ -0.1382,
795
+ 0.0542,
796
+ 0.2813,
797
+ 0.0891,
798
+ 0.1570,
799
+ -0.0098,
800
+ 0.0375,
801
+ -0.1825,
802
+ -0.2246,
803
+ -0.1207,
804
+ -0.0698,
805
+ 0.5109,
806
+ 0.2665,
807
+ -0.2108,
808
+ -0.2158,
809
+ 0.2502,
810
+ -0.2055,
811
+ -0.0322,
812
+ 0.1109,
813
+ 0.1567,
814
+ -0.0729,
815
+ 0.0899,
816
+ -0.2799,
817
+ -0.1230,
818
+ -0.0313,
819
+ -0.1649,
820
+ 0.0117,
821
+ 0.0723,
822
+ -0.2839,
823
+ -0.2083,
824
+ -0.0520,
825
+ 0.3748,
826
+ 0.0152,
827
+ 0.1957,
828
+ 0.1433,
829
+ -0.2944,
830
+ 0.3573,
831
+ -0.0548,
832
+ -0.1681,
833
+ -0.0667,
834
+ ],
835
+ device=device,
836
+ )
837
+ self.std = torch.tensor(
838
+ [
839
+ 0.4765,
840
+ 1.0364,
841
+ 0.4514,
842
+ 1.1677,
843
+ 0.5313,
844
+ 0.4990,
845
+ 0.4818,
846
+ 0.5013,
847
+ 0.8158,
848
+ 1.0344,
849
+ 0.5894,
850
+ 1.0901,
851
+ 0.6885,
852
+ 0.6165,
853
+ 0.8454,
854
+ 0.4978,
855
+ 0.5759,
856
+ 0.3523,
857
+ 0.7135,
858
+ 0.6804,
859
+ 0.5833,
860
+ 1.4146,
861
+ 0.8986,
862
+ 0.5659,
863
+ 0.7069,
864
+ 0.5338,
865
+ 0.4889,
866
+ 0.4917,
867
+ 0.4069,
868
+ 0.4999,
869
+ 0.6866,
870
+ 0.4093,
871
+ 0.5709,
872
+ 0.6065,
873
+ 0.6415,
874
+ 0.4944,
875
+ 0.5726,
876
+ 1.2042,
877
+ 0.5458,
878
+ 1.6887,
879
+ 0.3971,
880
+ 1.0600,
881
+ 0.3943,
882
+ 0.5537,
883
+ 0.5444,
884
+ 0.4089,
885
+ 0.7468,
886
+ 0.7744,
887
+ ],
888
+ device=device,
889
+ )
890
+ self.scale = [self.mean, 1.0 / self.std]
891
+
892
+ def forward(self, x, scale=[0, 1]):
893
+ mu, log_var = self.encode(x, scale)  # encode returns (mu, log_var)
894
+ x_recon = self.decode(mu, scale)
895
+ return x_recon, mu
896
+
897
+ def encode(self, x, scale=None):
898
+ self.clear_cache()
899
+ x = patchify(x, patch_size=2)
900
+ t = x.shape[2]
901
+ iter_ = 1 + (t - 1) // 4
902
+ for i in range(iter_):
903
+ self._enc_conv_idx = [0]
904
+ if i == 0:
905
+ out = self.encoder(
906
+ x[:, :, :1, :, :],
907
+ feat_cache=self._enc_feat_map,
908
+ feat_idx=self._enc_conv_idx,
909
+ )
910
+ else:
911
+ out_ = self.encoder(
912
+ x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
913
+ feat_cache=self._enc_feat_map,
914
+ feat_idx=self._enc_conv_idx,
915
+ )
916
+ out = torch.cat([out, out_], 2)
917
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
918
+ if scale is not None:
919
+ if isinstance(scale[0], torch.Tensor):
920
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
921
+ 1, self.z_dim, 1, 1, 1)
922
+ else:
923
+ mu = (mu - scale[0]) * scale[1]
924
+ self.clear_cache()
925
+ return mu, log_var
926
+
927
+ def decode(self, z, scale=None):
928
+ self.clear_cache()
929
+ if scale is not None:
930
+ if isinstance(scale[0], torch.Tensor):
931
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
932
+ 1, self.z_dim, 1, 1, 1)
933
+ else:
934
+ z = z / scale[1] + scale[0]
935
+ iter_ = z.shape[2]
936
+ x = self.conv2(z)
937
+ for i in range(iter_):
938
+ self._conv_idx = [0]
939
+ if i == 0:
940
+ out = self.decoder(
941
+ x[:, :, i:i + 1, :, :],
942
+ feat_cache=self._feat_map,
943
+ feat_idx=self._conv_idx,
944
+ first_chunk=True,
945
+ )
946
+ else:
947
+ out_ = self.decoder(
948
+ x[:, :, i:i + 1, :, :],
949
+ feat_cache=self._feat_map,
950
+ feat_idx=self._conv_idx,
951
+ )
952
+ out = torch.cat([out, out_], 2)
953
+ out = unpatchify(out, patch_size=2)
954
+ self.clear_cache()
955
+ return out
956
+
957
+ def reparameterize(self, mu, log_var):
958
+ std = torch.exp(0.5 * log_var)
959
+ eps = torch.randn_like(std)
960
+ return eps * std + mu
961
+
962
+ def sample(self, imgs, scale, deterministic=False):
963
+ mu, log_var = self.encode(imgs, scale=scale)
964
+ if deterministic:
965
+ return mu
966
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
967
+ return mu + std * torch.randn_like(std)
968
+
969
+ def clear_cache(self):
970
+ self._conv_num = count_conv3d(self.decoder)
971
+ self._conv_idx = [0]
972
+ self._feat_map = [None] * self._conv_num
973
+ # cache encode
974
+ self._enc_conv_num = count_conv3d(self.encoder)
975
+ self._enc_conv_idx = [0]
976
+ self._enc_feat_map = [None] * self._enc_conv_num
977
+
978
+
979
+ def video_vae2(pretrained_path=None, z_dim=48, dim=160, device="cuda", **kwargs):
980
+ # params
981
+ cfg = dict(
982
+ dim=dim,
983
+ z_dim=z_dim,
984
+ dim_mult=[1, 2, 4, 4],
985
+ num_res_blocks=2,
986
+ attn_scales=[],
987
+ temperal_downsample=[False, True, True],
988
+ dropout=0.0,
989
+ device=device
990
+ )
991
+ cfg.update(**kwargs)
992
+
993
+ # init model
994
+ model = WanVAE_(**cfg)
995
+
996
+ # load checkpoint
997
+ logging.info(f"loading {pretrained_path}")
998
+ model.load_state_dict(torch.load(pretrained_path))
999
+
1000
+ return model
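A hedged end-to-end usage sketch for the factory above; the checkpoint path, clip size, and device are illustrative assumptions, not values from this commit.

import torch
from misc.wan_vae2 import video_vae2

# Hypothetical checkpoint location; substitute the real Wan 2.2 VAE weights.
vae = video_vae2(pretrained_path="checkpoints/wan2.2_vae.pth", z_dim=48, dim=160).eval().to("cuda")

clip = torch.randn(1, 3, 9, 256, 256, device="cuda")    # (B, C, T, H, W); T must be 1 + 4k
with torch.no_grad():
    mu, log_var = vae.encode(clip, scale=vae.scale)      # latents at reduced spatial/temporal resolution
    recon = vae.decode(mu, scale=vae.scale)              # decoded back to roughly the input shape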
requirements_hf.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Requirements for Hugging Face Spaces
2
+ # PyTorch with CUDA support - HF Spaces automatically provides CUDA-enabled PyTorch
3
+ # When you select GPU hardware, PyTorch will have full CUDA support
4
+ torch>=2.0.0
5
+ torchvision>=0.15.0
6
+ # Note: CUDA toolkit is pre-installed on HF Spaces GPU instances
7
+ # You can verify with: torch.cuda.is_available() and torch.cuda.get_device_name(0)
8
+
9
+ # Core dependencies
10
+ transformers>=4.30.0
11
+ accelerate>=0.20.0
12
+ torchinfo
13
+ einops
14
+ scipy
15
+ sentencepiece
16
+ wandb[media]
17
+ torchmetrics[image]
18
+ simple_parsing
19
+ opencv-python
20
+ psutil
21
+ pyyaml
22
+ av==12.3.0
23
+
24
+ # Gradio for web interface
25
+ gradio>=4.0.0
26
+
27
+ # Git dependencies
28
+ git+https://github.com/KeKsBoTer/torch-dwt
29
+ git+https://github.com/huggingface/diffusers.git
30
+
31
+ # Note: decord and webdataset are optional and may not be needed for inference
32
+ # If needed, install via: pip install decord webdataset
33
+
sample.py ADDED
@@ -0,0 +1,379 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ #!/usr/bin/env python3
6
+ """
7
+ Scalable Transformer Autoregressive Flow (STARFlow) Sampling Script
8
+
9
+ This script provides functionality for sampling from trained transformer autoregressive flow models.
10
+ Supports both image and video generation with various conditioning options.
11
+
12
+ Usage:
13
+ python sample.py --model_config_path config.yaml --checkpoint_path model.pth --caption "A cat"
14
+ """
15
+
16
+ import argparse
17
+ import copy
18
+ import pathlib
19
+ import time
20
+ from typing import Dict, List, Optional, Tuple, Union
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.data
26
+ import torchvision as tv
27
+ import tqdm
28
+ import yaml
29
+ from einops import repeat
30
+ from PIL import Image
31
+
32
+ # Local imports
33
+ import transformer_flow
34
+ import utils
35
+ from dataset import aspect_ratio_to_image_size
36
+ from train import get_tarflow_parser
37
+ from utils import process_denoising, save_samples_unified, load_model_config, encode_text, add_noise
38
+ from transformer_flow import KVCache
39
+ from misc import print
40
+
41
+
42
+ # Default caption templates for testing and demonstrations
43
+ DEFAULT_CAPTIONS = {
44
+ 'template1': "In the image, a corgi dog is wearing a Santa hat and is laying on a fluffy rug. The dog's tongue is sticking out and it appears to be happy. There are two pumpkins and a basket of leaves nearby, indicating that the scene takes place during the fall season. The background features a Christmas tree, further suggesting the holiday atmosphere. The image has a warm and cozy feel to it, with the dog looking adorable in its hat and the pumpkins adding a festive touch.",
45
+ 'template2': "A close-up portrait of a cheerful Corgi dog, showcasing its fluffy, sandy-brown fur and perky ears. The dog has a friendly expression with a slight smile, looking directly into the camera. Set against a soft, natural green background, the image is captured in a high-definition, realistic photography style, emphasizing the texture of the fur and the vibrant colors.",
46
+ 'template3': "A high-resolution, wide-angle selfie photograph of Albert Einstein in a garden setting. Einstein looks directly into the camera with a gentle, knowing smile. His distinctive wild white hair and bushy mustache frame a face marked by thoughtful wrinkles. He wears a classic tweed jacket over a simple shirt. In the background, lush greenery and flowering bushes under soft daylight create a serene, scholarly atmosphere. Ultra-realistic style, 4K detail.",
47
+ 'template4': 'A close-up, high-resolution selfie of a red panda perched on a tree branch, its large dark eyes looking directly into the lens. Rich reddish-orange fur with white facial markings contrasts against the lush green bamboo forest behind. Soft sunlight filters through the leaves, casting a warm, natural glow over the scene. Ultra-realistic detail, digital photograph style, 4K resolution.',
48
+ 'template5': "A realistic selfie of a llama standing in front of a classic Ivy League building on the Princeton University campus. He is smiling gently, wearing his iconic wild hair and mustache, dressed in a wool sweater and collared shirt. The photo has a vintage, slightly sepia tone, with soft natural lighting and leafy trees in the background, capturing an academic and historical vibe.",
49
+ }
50
+
51
+
52
+
53
+
54
+ def setup_model_and_components(args: argparse.Namespace) -> Tuple[torch.nn.Module, Optional[torch.nn.Module], tuple]:
55
+ """Initialize and load the model, VAE, and text encoder."""
56
+ dist = utils.Distributed()
57
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
58
+
59
+ # Set random seed
60
+ utils.set_random_seed(args.seed + dist.rank)
61
+
62
+ # Setup text encoder
63
+ tokenizer, text_encoder = utils.setup_encoder(args, dist, device)
64
+
65
+ # Setup VAE if specified
66
+ vae = None
67
+ if args.vae is not None:
68
+ vae = utils.setup_vae(args, dist, device)
69
+ args.img_size = args.img_size // vae.downsample_factor
70
+ else:
71
+ args.finetuned_vae = 'none'
72
+
73
+ # Setup main transformer model
74
+ model = utils.setup_transformer(
75
+ args, dist,
76
+ txt_dim=text_encoder.config.hidden_size,
77
+ use_checkpoint=1
78
+ ).to(device)
79
+
80
+ # Load checkpoint
81
+ print(f"Loading checkpoint from local path: {args.checkpoint_path}")
82
+ state_dict = torch.load(args.checkpoint_path, map_location='cpu')
83
+ model.load_state_dict(state_dict, strict=False)
84
+ del state_dict; torch.cuda.empty_cache()
85
+
86
+ # Set model to eval mode and disable gradients
87
+ for p in model.parameters():
88
+ p.requires_grad = False
89
+ model.eval()
90
+
91
+ # Parallelize model for multi-GPU sampling
92
+ _, model = utils.parallelize_model(args, model, dist, device)
93
+
94
+ return model, vae, (tokenizer, text_encoder, dist, device)
95
+
96
+
97
+ def prepare_captions(args: argparse.Namespace, dist) -> Tuple[List[str], List[int], int, str]:
98
+ """Prepare captions for sampling from file or template."""
99
+ if args.caption.endswith('.txt'):
100
+ with open(args.caption, 'r') as f:
101
+ lines = [line.strip() for line in f.readlines()]
102
+
103
+ num_samples = len(lines)
104
+ fixed_y = lines[dist.rank:][::dist.world_size]
105
+ fixed_idxs = list(range(len(lines)))[dist.rank:][::dist.world_size]
106
+ caption_name = args.caption.split('/')[-1][:-4]
107
+ else:
108
+ caption_text = DEFAULT_CAPTIONS.get(args.caption, args.caption)
109
+ fixed_y = [caption_text] * args.sample_batch_size
110
+ fixed_idxs = []
111
+ num_samples = args.sample_batch_size * dist.world_size
112
+ caption_name = args.caption
113
+
114
+ return fixed_y, fixed_idxs, num_samples, caption_name
115
+
116
+
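A small illustration (toy values, not from the repo) of the rank-sharded slicing used above: each rank takes every world_size-th caption starting at its own offset, so together the shards cover the caption file exactly once.

lines = [f"caption {i}" for i in range(10)]
world_size = 4
shards = [lines[rank:][::world_size] for rank in range(world_size)]
assert sorted(sum(shards, [])) == sorted(lines)           # every caption assigned exactly once
assert shards[1] == ["caption 1", "caption 5", "caption 9"]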
117
+ def get_noise_shape(args: argparse.Namespace, vae) -> callable:
118
+ """Generate noise tensor with appropriate shape for sampling."""
119
+ def _get_noise_func(b: int, x_shape: tuple) -> torch.Tensor:
120
+ rand_shape = [args.channel_size, x_shape[0], x_shape[1]]
121
+ if len(x_shape) == 3:
122
+ rand_shape = [x_shape[2]] + rand_shape
123
+
124
+ if vae is not None:
125
+ if args.vid_size is not None:
126
+ rand_shape[0] = (rand_shape[0] - 1) // vae.temporal_downsample_factor + 1
127
+ rand_shape[-2] //= vae.downsample_factor
128
+ rand_shape[-1] //= vae.downsample_factor
129
+
130
+ return torch.randn(b, *rand_shape)
131
+
132
+ return _get_noise_func
133
+
134
+
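The shape arithmetic above, replayed with illustrative numbers (the downsample factors and channel count are assumptions for the example, not values fixed by this script):

channel_size = 48                       # e.g. latent channels from the model config (assumption)
frames, h, w = 17, 512, 512
spatial_ds, temporal_ds = 8, 4          # assumed VAE downsample factors

rand_shape = [frames, channel_size, h, w]                 # video case: [T, C, H, W]
rand_shape[0] = (rand_shape[0] - 1) // temporal_ds + 1    # 17 frames -> 5 latent frames
rand_shape[-2] //= spatial_ds                             # 512 -> 64
rand_shape[-1] //= spatial_ds                             # 512 -> 64
assert rand_shape == [5, 48, 64, 64]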
135
+ def prepare_input_image(args: argparse.Namespace, x_shape: tuple, vae, device: torch.device, noise_std: float) -> Optional[torch.Tensor]:
136
+ """Load and preprocess input image for conditional generation."""
137
+ input_image = Image.open(args.input_image).convert('RGB')
138
+
139
+ # Resize and crop to target shape
140
+ scale = max(x_shape[0] / input_image.height, x_shape[1] / input_image.width)
141
+ transform = tv.transforms.Compose([
142
+ tv.transforms.Resize((int(input_image.height * scale), int(input_image.width * scale))),
143
+ tv.transforms.CenterCrop(x_shape[:2]),
144
+ tv.transforms.ToTensor(),
145
+ tv.transforms.Normalize([0.5]*3, [0.5]*3)
146
+ ])
147
+
148
+ input_image = transform(input_image).unsqueeze(0).to(device)
149
+
150
+ # Encode with VAE if available
151
+ with torch.no_grad():
152
+ if vae is not None:
153
+ input_image = vae.encode(input_image)
154
+
155
+ # Add noise
156
+ input_image = add_noise(input_image, noise_std)[0]
157
+ return input_image
158
+
159
+
160
+ def build_sampling_kwargs(args: argparse.Namespace, caption_name: str) -> dict:
161
+ """Build sampling keyword arguments based on configuration."""
162
+ sampling_kwargs = {
163
+ 'guidance': args.cfg,
164
+ 'guide_top': args.guide_top,
165
+ 'verbose': not caption_name.endswith('/'),
166
+ 'return_sequence': args.return_sequence,
167
+ 'jacobi': args.jacobi,
168
+ 'context_length': args.context_length
169
+ }
170
+
171
+ if args.jacobi:
172
+ sampling_kwargs.update({
173
+ 'jacobi_th': args.jacobi_th,
174
+ 'jacobi_block_size': args.jacobi_block_size,
175
+ 'jacobi_max_iter': args.jacobi_max_iter
176
+ })
177
+ else:
178
+ sampling_kwargs.update({
179
+ 'attn_temp': args.attn_temp,
180
+ 'annealed_guidance': False
181
+ })
182
+
183
+ return sampling_kwargs
184
+
185
+
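A quick usage sketch of the builder above in Jacobi mode; it only reads the listed fields, so a bare Namespace suffices (assumes sample.py and its dependencies import cleanly):

import argparse
from sample import build_sampling_kwargs

ns = argparse.Namespace(cfg=3.0, guide_top=None, return_sequence=0,
                        jacobi=1, context_length=None,
                        jacobi_th=0.005, jacobi_block_size=64, jacobi_max_iter=32)
kwargs = build_sampling_kwargs(ns, caption_name="demo")
assert kwargs["jacobi_block_size"] == 64 and "attn_temp" not in kwargs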
186
+ def main(args: argparse.Namespace) -> None:
187
+ """Main sampling function."""
188
+ # Load model configuration and merge with command line args
189
+ trainer_args = load_model_config(args.model_config_path)
190
+ trainer_dict = vars(trainer_args)
191
+ trainer_dict.update(vars(args))
192
+ args = argparse.Namespace(**trainer_dict)
193
+
194
+ # Handle target length configuration for video
195
+ if args.target_length is not None:
196
+ assert args.vid_size is not None, "it must be a video model to use target_length"
197
+ assert args.jacobi == 1, "target_length is only supported with jacobi sampling"
198
+ if args.target_length == 1: # generate single image
199
+ args.vid_size = None
200
+ args.out_fps = 0
201
+ else:
202
+ args.local_attn_window = (int(args.vid_size.split(':')[0]) - 1) // 4 + 1
203
+ args.vid_size = f"{args.target_length}:16"
204
+ if args.context_length is None:
205
+ args.context_length = args.local_attn_window - 1
206
+
207
+ # Override some settings for sampling
208
+ args.fsdp = 1 # sampling using FSDP if available.
209
+ if args.use_pretrained_lm is not None:
210
+ args.text = args.use_pretrained_lm
211
+
212
+ # Setup model and components
213
+ model, vae, (tokenizer, text_encoder, dist, device) = setup_model_and_components(args)
214
+
215
+ # Setup output directory
216
+ model_name = pathlib.Path(args.checkpoint_path).stem
217
+ sample_dir: pathlib.Path = args.logdir / f'{model_name}'
218
+ if dist.local_rank == 0:
219
+ sample_dir.mkdir(parents=True, exist_ok=True)
220
+ dist.barrier()
221
+
222
+ print(f'{" Load ":-^80} {model_name}')
223
+
224
+ # Prepare captions and sampling parameters
225
+ fixed_y, fixed_idxs, num_samples, caption_name = prepare_captions(args, dist)
226
+ print(f'Sampling {num_samples} from {args.caption} on {dist.world_size} GPU(s)')
227
+
228
+ get_noise = get_noise_shape(args, vae)
229
+ sampling_kwargs = build_sampling_kwargs(args, caption_name)
230
+ noise_std = args.target_noise_std if args.target_noise_std else args.noise_std
231
+
232
+ # Start sampling
233
+ print(f'Starting sampling with global batch size {args.sample_batch_size}x{dist.world_size} GPUs')
234
+ torch.cuda.synchronize()
235
+ start_time = time.time()
236
+
237
+ with torch.no_grad():
238
+ with torch.autocast(device_type='cuda', dtype=torch.float32):
239
+ for i in tqdm.tqdm(range(int(np.ceil(num_samples / (args.sample_batch_size * dist.world_size))))):
240
+ # Determine aspect ratio and image shape
241
+ x_aspect = args.aspect_ratio if args.mix_aspect else None
242
+ if x_aspect == "random":
243
+ x_aspect = np.random.choice([
244
+ "1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4", "21:9", "9:21"
245
+ ])
246
+
247
+ x_shape = aspect_ratio_to_image_size(
248
+ args.img_size * vae.downsample_factor, x_aspect,
249
+ multiple=vae.downsample_factor * args.patch_size
250
+ )
251
+
252
+ # Setup text encoder kwargs
253
+ text_encoder_kwargs = dict(
254
+ aspect_ratio=x_aspect,
255
+ fps=args.out_fps if args.fps_cond else None,
256
+ noise_std=noise_std if args.cond_noise_level else None
257
+ )
258
+
259
+ # Handle video dimensions
260
+ if args.vid_size is not None:
261
+ vid_size = tuple(map(int, args.vid_size.split(':')))
262
+ out_fps = args.out_fps if args.fps_cond else vid_size[1]
263
+ num_frames = vid_size[0]
264
+ x_shape = (x_shape[0], x_shape[1], num_frames)
265
+ else:
266
+ out_fps = args.out_fps
267
+
268
+ # Prepare batch and captions
269
+ b = args.sample_batch_size
270
+ y = fixed_y[i * b : (i + 1) * b]
271
+ y_caption = copy.deepcopy(y)
272
+
273
+ # Add null captions for CFG
274
+ if args.cfg > 0:
275
+ y += [""] * len(y)
276
+
277
+ # Prepare text & noise
278
+ y = encode_text(text_encoder, tokenizer, y, args.txt_size, device, **text_encoder_kwargs)
279
+ noise = get_noise(len(y_caption), x_shape).to(device)
280
+
281
+ # Prepare input image if specified
282
+ if args.input_image is not None:
283
+ input_image = prepare_input_image(args, x_shape, vae, device, noise_std)
284
+ input_image = repeat(input_image, '1 c h w -> b c h w', b=b)
285
+
286
+ assert args.cfg > 0, "CFG is required for image conditioned generation"
287
+ kv_caches = model(input_image.unsqueeze(1), y, context=True)
288
+ else:
289
+ input_image, kv_caches = None, None
290
+
291
+ # Generate samples
292
+ samples = model(noise, y, reverse=True, kv_caches=kv_caches, **sampling_kwargs)
293
+ del kv_caches; torch.cuda.empty_cache() # free up memory
294
+
295
+ # Apply denoising if enabled
296
+ samples = process_denoising(
297
+ samples, y_caption, args, model, text_encoder,
298
+ tokenizer, text_encoder_kwargs, noise_std
299
+ )
300
+
301
+ # Decode with VAE if available
302
+ if args.vae is not None:
303
+ dec_fn = vae.decode
304
+ else:
305
+ dec_fn = lambda x: x
306
+
307
+ if isinstance(samples, list):
308
+ samples = torch.cat([dec_fn(s) for s in samples], dim=-1)
309
+ else:
310
+ samples = dec_fn(samples)
311
+
312
+ # Save samples using unified function
313
+ print(f' Saving samples ... {sample_dir}')
314
+
315
+ # Determine save mode based on args
316
+ if args.save_folder and args.caption.endswith('.txt'):
317
+ grid_mode = "individual" # Save individual files when using caption file
318
+ else:
319
+ grid_mode = "auto" # Use automatic grid arrangement
320
+
321
+ save_samples_unified(
322
+ samples=samples,
323
+ save_dir=sample_dir,
324
+ filename_prefix=caption_name[:200] if len(caption_name) > 0 else "samples",
325
+ epoch_or_iter=i,
326
+ fps=out_fps,
327
+ dist=dist,
328
+ wandb_log=False, # Let sample.py handle its own wandb logging
329
+ grid_arrangement=grid_mode
330
+ )
331
+
332
+ # Print timing statistics
333
+ torch.cuda.synchronize()
334
+ elapsed_time = time.time() - start_time
335
+ print(f'{model_name} cfg {args.cfg:.2f}, bsz={args.sample_batch_size}x{dist.world_size}, '
336
+ f'time={elapsed_time:.2f}s, speed={num_samples / elapsed_time:.2f} images/s')
337
+
338
+
339
+ if __name__ == '__main__':
340
+ parser = argparse.ArgumentParser()
341
+
342
+ # Model config
343
+ parser.add_argument('--model_config_path', required=True, type=str, help='path to YAML config file or directory containing config file')
344
+ parser.add_argument('--checkpoint_path', required=True, type=str, help='path to local checkpoint file (required when using model_config_path)')
345
+ parser.add_argument('--save_folder', default=0, type=int)
346
+
347
+ # Caption, condition
348
+ parser.add_argument('--caption', type=str, required=True, help='Caption input (required)')
349
+ parser.add_argument('--input_image', default=None, type=str, help='path to the input image for image-conditioned generation')
350
+ parser.add_argument('--aspect_ratio', default="1:1", type=str, choices=["random", "1:1", "2:3", "3:2", "16:9", "9:16", "4:5", "5:4", "21:9", "9:21"])
351
+ parser.add_argument('--out_fps', default=8, type=int, help='fps for video datasets, only useful if fps_cond is set to 1')
352
+
353
+ # Sampling parameters
354
+ parser.add_argument('--seed', default=191, type=int)
355
+ parser.add_argument('--denoising_batch_size', default=1, type=int)
356
+ parser.add_argument('--self_denoising_lr', default=1, type=float)
357
+ parser.add_argument('--disable_learnable_denoiser', default=0, type=int)
358
+ parser.add_argument('--attn_temp', default=1, type=float)
359
+ parser.add_argument('--jacobi_th', default=0.005, type=float)
360
+ parser.add_argument('--jacobi', default=0, type=int)
361
+ parser.add_argument('--jacobi_block_size', default=64, type=int)
362
+ parser.add_argument('--jacobi_max_iter', default=32, type=int)
363
+ parser.add_argument('--num_samples', default=50000, type=int)
364
+ parser.add_argument('--sample_batch_size', default=16, type=int)
365
+ parser.add_argument('--return_sequence', default=0, type=int)
366
+ parser.add_argument('--cfg', default=5, type=float)
367
+ parser.add_argument('--guide_top', default=None, type=int)
368
+ parser.add_argument('--finetuned_vae', default="px82zaheuu", type=str)
369
+ parser.add_argument('--vae_adapter', default=None)
370
+ parser.add_argument('--target_noise_std', default=None, help="option to use different noise_std from the config")
371
+
372
+ # Video-specific parameters
373
+ parser.add_argument('--target_length', default=None, type=int, help="target video length; may be longer than the length used during training")
374
+ parser.add_argument('--context_length', default=16, type=int, help="context length used for consecutive sampling")
375
+ args = parser.parse_args()
376
+
377
+ if args.input_image and args.input_image == 'none':
378
+ args.input_image = None
379
+ main(args)
transformer_flow.py ADDED
@@ -0,0 +1,1356 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ import copy
6
+ import tqdm
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ from typing import List, Tuple
12
+ from misc.pe import VisionRotaryEmbeddingFast, apply_rope, get_positions
13
+ from misc import print
14
+ from functools import partial
15
+ from einops import rearrange, repeat
16
+ from torch.utils.checkpoint import checkpoint
17
+
18
+ INV_SOFTPLUS_1 = 0.541324854612918
19
+
20
+ def modulate(x, shift, scale):
21
+ if shift is None:
22
+ return x * (1 + scale)
23
+ return x * (1 + scale) + shift
24
+
25
+
26
+ def stable_neg_log_softplus(x):
27
+ return torch.where(
28
+ x > 20, # softplus(x) ≈ x → log ≈ log(x)
29
+ -x.log(), # so -log(softplus(x)) ≈ -log(x)
30
+ -F.softplus(x).log()
31
+ )
32
+
33
+
34
+ class KVCache:
35
+
36
+ def __init__(self):
37
+ self._is_empty = True
38
+ self.prefix_cache = None
39
+ self.meta_data = {}
40
+
41
+ def initialize(self, num_blocks, *size):
42
+ self._is_empty = False
43
+ self.num_blocks = num_blocks
44
+ self.size = size
45
+ self.kv_caches = [torch.zeros(2, *size) for _ in range(num_blocks)]
46
+ self.kv_index = [0] * num_blocks
47
+
48
+ def register_prefix_cache(self, prefix_cache):
49
+ self.prefix_cache = prefix_cache
50
+
51
+ @property
52
+ def is_empty(self):
53
+ return self._is_empty
54
+
55
+ @property
56
+ def is_full(self):
57
+ if self.is_empty:
58
+ return False
59
+ return all(index == self.size[2] for index in self.kv_index)
60
+
61
+ def delete(self):
62
+ if not self.is_empty:
63
+ self._is_empty = True
64
+ del self.kv_caches
65
+ del self.kv_index
66
+
67
+ def to(self, device, dtype=torch.bfloat16):
68
+ for i in range(self.num_blocks):
69
+ self.kv_caches[i] = self.kv_caches[i].to(device=device, dtype=dtype)
70
+
71
+ def extend_length(self, length):
72
+ assert not self.is_empty, "KVCache is empty, cannot extend length"
73
+ self.size = (self.size[0], self.size[1], self.size[2] + length, self.size[3])
74
+ for i in range(self.num_blocks):
75
+ pad = self.kv_caches[i].new_zeros((2, *self.size))
76
+ pad[:, :, :, :self.kv_caches[i].size(3)] = self.kv_caches[i]
77
+ self.kv_caches[i] = pad
78
+
79
+ def expand_batch(self, ratio=2):
80
+ self.size = (self.size[0] * ratio, *self.size[1:])
81
+ for i in range(self.num_blocks):
82
+ self.kv_caches[i] = torch.cat([self.kv_caches[i] for _ in range(ratio)], dim=1)
83
+
84
+ def remove_negative_cache(self):
85
+ self.size = (self.size[0] // 2, *self.size[1:])
86
+ for i in range(self.num_blocks):
87
+ self.kv_caches[i] = self.kv_caches[i].chunk(2, dim=1)[0]
88
+
89
+ def backward_in_time(self, l):
90
+ for i in range(self.num_blocks):
91
+ self.kv_index[i] = max(0, self.kv_index[i] - l)
92
+
93
+ def reset_kv_index(self):
94
+ for i in range(self.num_blocks):
95
+ self.kv_index[i] = 0
96
+
97
+ def __call__(self, block_idx, k, v):
98
+ assert block_idx < self.num_blocks, f'block_idx {block_idx} out of range {self.num_blocks}'
99
+ # write cache
100
+ l = k.size(2)
101
+ kv_index = self.kv_index[block_idx]
102
+
103
+ if kv_index + l > self.size[2]:
104
+ raise NotImplementedError("Overflow mode is not implemented")
105
+
106
+ self.kv_caches[block_idx][0][:, :, kv_index: kv_index+l] = k
107
+ self.kv_caches[block_idx][1][:, :, kv_index: kv_index+l] = v
108
+ self.kv_index[block_idx] = kv_index + l
109
+
110
+ # read cache
111
+ kv_index = self.kv_index[block_idx]
112
+ return self.kv_caches[block_idx][0][:, :, :kv_index], self.kv_caches[block_idx][1][:, :, :kv_index]
113
+
114
+
115
+ class Permutation(torch.nn.Module):
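A hedged usage sketch of the cache above (assuming transformer_flow is importable): one block, batch 1, 4 heads, head dim 8, capacity 16 tokens; a second write returns the concatenation of everything cached so far.

import torch
from transformer_flow import KVCache

cache = KVCache()
cache.initialize(1, 1, 4, 16, 8)                 # num_blocks, then size = (B, heads, max_len, head_dim)
k1, v1 = torch.randn(1, 4, 3, 8), torch.randn(1, 4, 3, 8)
k_all, v_all = cache(0, k1, v1)
assert k_all.shape == (1, 4, 3, 8)
k2, v2 = torch.randn(1, 4, 2, 8), torch.randn(1, 4, 2, 8)
k_all, v_all = cache(0, k2, v2)
assert k_all.shape == (1, 4, 5, 8) and torch.equal(k_all[:, :, :3], k1)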
116
+
117
+ def __init__(self, seq_length: int):
118
+ super().__init__()
119
+ self.seq_length = seq_length
120
+ self.input_shape = None
121
+
122
+ def forward(self, x: torch.Tensor | List[torch.Tensor], dim: int = 1, inverse: bool = False):
123
+ if not inverse:
124
+ self.input_shape = x.shape
125
+ x = rearrange(x, 'b t h w c -> b (t h w) c' if x.dim() == 5 else 'b h w c -> b (h w) c')
126
+ x = self.permute(x, dim, self.input_shape, inverse=False)
127
+ else:
128
+ x = self.permute(x, dim, self.input_shape, inverse=True)
129
+ x = x.reshape(-1, *self.input_shape[1:])
130
+ return x
131
+
132
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
133
+ raise NotImplementedError('Overload me')
134
+
135
+
136
+ class PermutationIdentity(Permutation):
137
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
138
+ return x.clone()
139
+
140
+
141
+ class PermutationFlip(Permutation):
142
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
143
+ return x.flip(dims=[dim])
144
+
145
+
146
+ class PermutationFlipInBlock(Permutation):
147
+ def permute(self, x: torch.Tensor, dim: int = 1, shape=None, inverse: bool = False) -> torch.Tensor:
148
+ assert shape is not None, "shape must be provided for PermutationFlipInBlock"
149
+ if len(shape) == 5:
150
+ assert dim == 1, "dim must be 1 for 5D tensor in PermutationFlipInBlock"
151
+ # flip the token order within each temporal block (shape[1] blocks); the block order itself is unchanged
152
+ x = x.view(x.size(0), shape[1], -1, x.size(-1)).flip(dims=[2]).view_as(x)
153
+ else:
154
+ x = x.flip(dims=[dim])
155
+ return x
156
+
157
+
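A round-trip check for the flip permutation above (assuming transformer_flow is importable): forward flattens the spatial grid and reverses the token order, and inverse=True restores the original tensor.

import torch
from transformer_flow import PermutationFlip

perm = PermutationFlip(seq_length=16)
x = torch.randn(2, 4, 4, 8)                     # (b, h, w, c)
z = perm(x)                                     # (b, 16, 8), tokens in reversed order
assert torch.equal(z[:, 0], x[:, 3, 3])         # the last spatial position comes first
x_back = perm(z, inverse=True)
assert torch.equal(x_back, x)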
158
+ class RMSNorm(torch.nn.Module):
159
+
160
+ def __init__(
161
+ self,
162
+ dim: int,
163
+ eps: float = 1e-6,
164
+ add_unit_offset: bool = True,
165
+ ):
166
+ super().__init__()
167
+ self.eps = eps
168
+ self.add_unit_offset = add_unit_offset
169
+ self.weight = torch.nn.Parameter(torch.zeros(dim))
170
+
171
+ def _norm(self, x):
172
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
173
+
174
+ def forward(self, x):
175
+ # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
176
+ # See https://github.com/huggingface/transformers/pull/29402
177
+ output = self._norm(x.float())
178
+ if self.add_unit_offset:
179
+ output = output * (1 + self.weight.float())
180
+ else:
181
+ output = output * self.weight.float()
182
+ return output.type_as(x)
183
+
184
+
185
+ class Attention(torch.nn.Module):
186
+ def __init__(self, in_channels: int, head_channels: int, norm_type: str = "layer_norm",
187
+ num_heads=None, num_kv_heads=None, use_qk_norm=False,
188
+ use_post_norm=False, use_bias=True, hf_style_rope=False, non_causal=False):
189
+ super().__init__()
190
+ if norm_type == "layer_norm":
191
+ self.norm = torch.nn.LayerNorm(in_channels)
192
+ elif norm_type == "rms_norm":
193
+ self.norm = RMSNorm(in_channels)
194
+ else:
195
+ self.norm = torch.nn.Identity()
196
+ self.head_channels = head_channels
197
+ self.num_heads = num_heads if num_heads is not None else in_channels // head_channels
198
+ self.num_kv_heads = num_kv_heads if num_kv_heads is not None else self.num_heads # GQA
199
+ self.q_size = self.num_heads * head_channels
200
+ self.kv_size = self.num_kv_heads * head_channels
201
+ self.qkv = torch.nn.Linear(in_channels, self.q_size + 2 * self.kv_size, bias=use_bias)
202
+ self.proj = torch.nn.Linear(self.q_size, in_channels, bias=use_bias)
203
+ self.query_norm = (RMSNorm(self.head_channels) if use_qk_norm else None)
204
+ self.key_norm = (RMSNorm(self.head_channels) if use_qk_norm else None)
205
+ self.post_norm = (RMSNorm(in_channels) if use_post_norm else None)
206
+ self.sqrt_scale = head_channels ** (-0.25)
207
+ self.hf_style_rope = hf_style_rope
208
+ self.non_causal = non_causal
209
+
210
+ def apply_rope(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
211
+ if self.hf_style_rope:
212
+ return rearrange(apply_rope(rearrange(x, '... (u d) -> ... (d u)', u=2), freqs_cis), '... (d u) -> ... (u d)', u=2)
213
+ return apply_rope(x, freqs_cis)
214
+
215
+ def prepare_for_attention(self, x: torch.Tensor, freqs_cis=None, kv_cache=None):
216
+ B, T, _ = x.size()
217
+ q, k, v = self.qkv(self.norm(x)).split([self.q_size, self.kv_size, self.kv_size], dim=-1)
218
+ q = q.view(B, T, self.num_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
219
+ k = k.view(B, T, self.num_kv_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
220
+ v = v.view(B, T, self.num_kv_heads, self.head_channels).transpose(1, 2) # (b, h, t, d)
221
+ if self.query_norm is not None and self.key_norm is not None:
222
+ q, k = self.query_norm(q), self.key_norm(k)
223
+
224
+ if kv_cache is not None:
225
+ k, v = kv_cache(k, v)
226
+
227
+ if freqs_cis is not None:
228
+ lq, lk = q.size(2), k.size(2)
229
+ q, k = self.apply_rope(q, freqs_cis[lk-lq:lk]), self.apply_rope(k, freqs_cis[:lk])
230
+
231
+ if self.num_kv_heads != self.num_heads: # GQA (b, h, t, d)
232
+ k = torch.repeat_interleave(k, self.num_heads // self.num_kv_heads, dim=1)
233
+ v = torch.repeat_interleave(v, self.num_heads // self.num_kv_heads, dim=1)
234
+
235
+ return q.to(x.dtype), k.to(x.dtype), v.to(x.dtype)
236
+
237
+ def output_after_attention(self, x: torch.Tensor):
238
+ B, _, T, _ = x.shape
239
+ x = x.transpose(1, 2).reshape(B, T, self.q_size)
240
+ x = self.proj(x)
241
+ if self.post_norm is not None:
242
+ x = self.post_norm(x)
243
+ return x
244
+
245
+ def apply_attention(self, q, k, v, mask=None, temp=1.0):
246
+ scale = self.sqrt_scale**2 / temp
247
+ is_causal = not self.non_causal
248
+ if is_causal and q.size(2) < k.size(2) and mask is None:
249
+ prefix_len = k.size(2) - q.size(2)
250
+ mask = torch.tril(torch.ones(q.size(2), k.size(2), device=q.device, dtype=torch.bool), diagonal=prefix_len)
251
+
252
+ if mask is not None:
253
+ mask = mask.bool()
254
+ is_causal = False
255
+
256
+ # scaled dot-product attention (SDPA)
257
+ x = torch.nn.functional.scaled_dot_product_attention(
258
+ q, k, v, attn_mask=mask, is_causal=is_causal, scale=scale)
259
+ return x
260
+
261
+ def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None, temp: float = 1.0, freqs_cis=None, kv_cache=None,
262
+ ) -> torch.Tensor:
263
+ q, k, v = self.prepare_for_attention(x, freqs_cis, kv_cache)
264
+ x = self.apply_attention(q, k, v, mask, temp)
265
+ x = self.output_after_attention(x)
266
+ return x
267
+
268
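+ # [Added sketch, not part of the original model code] Grouped-query attention (GQA) as used in
+ # Attention.prepare_for_attention above: when num_kv_heads < num_heads, each KV head is repeated
+ # so SDPA sees matching head counts. Shapes below are illustrative assumptions.
+ def _gqa_expansion_sketch():
+     k = torch.randn(1, 2, 10, 64)                       # (batch, kv_heads=2, seq, head_dim)
+     k_full = torch.repeat_interleave(k, 8 // 2, dim=1)  # expand to num_heads=8 -> (1, 8, 10, 64)
+     return k_full.shape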
+
269
+ class MLP(torch.nn.Module):
270
+ def __init__(self, channels: int, expansion: float, use_swiglu=False, norm_type="layer_norm", use_post_norm=False, use_bias=True):
271
+ super().__init__()
272
+ if norm_type == "layer_norm":
273
+ self.norm = torch.nn.LayerNorm(channels)
274
+ elif norm_type == "rms_norm":
275
+ self.norm = RMSNorm(channels)
276
+ else:
277
+ self.norm = torch.nn.Identity()
278
+ self.post_norm = (RMSNorm(channels) if use_post_norm else None)
279
+ self.use_swiglu = use_swiglu
280
+
281
+ intermediate_channels = int(channels * expansion)
282
+ if use_swiglu:
283
+ self.gate_proj = torch.nn.Linear(channels, intermediate_channels, bias=use_bias)
284
+ self.up_proj = torch.nn.Linear(channels, intermediate_channels, bias=use_bias)
285
+ self.down_proj = torch.nn.Linear(intermediate_channels, channels, bias=use_bias)
286
+ else:
287
+ self.main = torch.nn.Sequential(
288
+ torch.nn.Linear(channels, intermediate_channels, bias=use_bias),
289
+ torch.nn.GELU(), torch.nn.Linear(intermediate_channels, channels, bias=use_bias)
290
+ )
291
+
292
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
293
+ if self.use_swiglu:
294
+ x = self.norm(x)
295
+ x = self.down_proj(F.gelu(self.gate_proj(x), approximate='tanh') * self.up_proj(x))
296
+ else:
297
+ x = self.main(self.norm(x))
298
+ return self.post_norm(x) if self.post_norm is not None else x
299
+
300
+
301
+ class AttentionBlock(torch.nn.Module):
302
+ def __init__(self, channels: int, head_channels: int, expansion: float = 4, use_adaln: bool = False,
303
+ use_swiglu=False, norm_type="layer_norm", num_heads=None, num_kv_heads=None,
304
+ use_qk_norm=False, use_post_norm=False, use_bias=True, hf_style_rope=False, non_causal=False):
305
+ super().__init__()
306
+ if use_adaln:
307
+ self.adaLN_modulation = torch.nn.Sequential(
308
+ torch.nn.SiLU(),
309
+ torch.nn.Linear(channels, 4 * channels, bias=True),
310
+ )
311
+ self.norm1 = torch.nn.LayerNorm(channels, elementwise_affine=False, eps=1e-6)
312
+ self.norm2 = torch.nn.LayerNorm(channels, elementwise_affine=False, eps=1e-6)
313
+
314
+ torch.nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
315
+ torch.nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
316
+
317
+ # adaLN applies its own pre-norms (norm1/norm2) above, so force norm_type to "none" inside the sub-modules
318
+ norm_type = 'none'
319
+ else:
320
+ self.adaLN_modulation = None
321
+
322
+ self.attention = Attention(channels, head_channels, norm_type, num_heads, num_kv_heads, use_qk_norm, use_post_norm, use_bias, hf_style_rope, non_causal)
323
+ self.mlp = MLP(channels, expansion, use_swiglu, norm_type, use_post_norm, use_bias)
324
+
325
+ def forward(
326
+ self, x: torch.Tensor, y: torch.Tensor | None = None, attn_mask: torch.Tensor | None = None,
327
+ attn_temp: float = 1.0, c=None, freqs_cis=None, kv_cache=None,
328
+ checkpoint_attn: bool = False, checkpoint_mlp: bool = False
329
+ ) -> torch.Tensor:
330
+ assert (x is not None) or (y is not None), "x or y must be provided"
331
+ z = torch.cat([y, x], 1) if (x is not None) and (y is not None) else x if x is not None else y
332
+ if self.adaLN_modulation is not None and c is not None:
333
+ shift_msa, scale_msa, shift_mlp, scale_mlp = self.adaLN_modulation(c).chunk(4, dim=-1)
334
+ z = z + self._forward_attention(z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn, shift_msa, scale_msa)
335
+ z = z + self._forward_mlp(z, checkpoint_mlp, shift_mlp, scale_mlp)
336
+ else:
337
+ z = z + self._forward_attention(z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn)
338
+ z = z + self._forward_mlp(z, checkpoint_mlp)
339
+ x, y = (z[:, y.size(1):], z[:, :y.size(1)]) if (x is not None) and (y is not None) \
340
+ else (z, None) if x is not None else (None, z)
341
+ return x, y
342
+
343
+ def _forward_attention(self, z, attn_mask, attn_temp, freqs_cis, kv_cache, checkpoint_attn, shift=None, scale=None):
344
+ def attn_fn(z_in):
345
+ if shift is not None and scale is not None:
346
+ z_in = modulate(self.norm1(z_in), shift, scale)
347
+ return self.attention(z_in, attn_mask, attn_temp, freqs_cis, kv_cache)
348
+
349
+ return checkpoint(attn_fn, z, use_reentrant=False) if checkpoint_attn and self.training else attn_fn(z)
350
+
351
+ def _forward_mlp(self, z, checkpoint_mlp, shift=None, scale=None):
352
+ def mlp_fn(z_in):
353
+ if shift is not None and scale is not None:
354
+ z_in = modulate(self.norm2(z_in), shift, scale)
355
+ return self.mlp(z_in)
356
+
357
+ return checkpoint(mlp_fn, z, use_reentrant=False) if checkpoint_mlp and self.training else mlp_fn(z)
358
+
359
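+ # [Added note, not in the original code] Data flow of AttentionBlock.forward above when both
+ # modalities are present: text tokens y are prepended to image tokens x,
+ #     z = torch.cat([y, x], dim=1)   # (B, Ty + Tx, C)
+ # the joint sequence goes through attention and the MLP with residual connections, and is then
+ # split back via x = z[:, y.size(1):], y = z[:, :y.size(1)].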
+
360
+ class MetaBlock(torch.nn.Module):
361
+ attn_mask: torch.Tensor
362
+
363
+ def __init__(
364
+ self,
365
+ in_channels: int,
366
+ channels: int,
367
+ img_size: int,
368
+ permutation: Permutation,
369
+ pt_seq_len: int | None = None,
370
+ num_layers: int = 1,
371
+ head_dim: int = 64,
372
+ num_heads: None | int = None,
373
+ num_kv_heads: None | int = None,
374
+ txt_size: int = 0,
375
+ txt_dim: int = 0,
376
+ expansion: float = 4,
377
+ use_rope: bool = False,
378
+ use_sos: bool = False,
379
+ use_softplus: bool = False,
380
+ use_swiglu: bool = False,
381
+ use_qk_norm: bool = False,
382
+ use_post_norm: bool = False,
383
+ use_final_norm: bool = False,
384
+ use_bias: bool = True,
385
+ use_proj_txt: bool = True,
386
+ hf_style_rope: bool = False,
387
+ norm_type: str = "layer_norm",
388
+ use_mm_attn: bool = False,
389
+ use_checkpoint: int = 0,
390
+ use_checkpoint_mlp: int | None = None,
391
+ soft_clip: float = 0,
392
+ local_attn_window: int = None,
393
+ ):
394
+ super().__init__()
395
+ out_channels = in_channels * 2
396
+
397
+ self.proj_in = torch.nn.Linear(in_channels, channels)
398
+ self.proj_out = torch.nn.Linear(channels, out_channels)
399
+ if use_sos:
400
+ self.sos_embed = torch.nn.Parameter(torch.randn(1, 1, in_channels))
401
+ torch.nn.init.constant_(self.proj_out.weight, 0)
402
+
403
+ self.txt_size = txt_size
404
+ self.img_size = img_size
405
+ self.txt_dim = txt_dim
406
+ self.pt_seq_len = pt_seq_len or img_size
407
+
408
+ # KV cache configurations
409
+ num_kv_heads = num_kv_heads or (num_heads or channels // head_dim)
410
+ self.kv_cache_size = [num_kv_heads, head_dim]
411
+
412
+ if not use_rope:
413
+ self.pos_embed = torch.nn.Parameter(torch.randn(img_size ** 2, channels) * 1e-2)
414
+ else:
415
+ self.pos_embed = None
416
+
417
+ if txt_dim > 0:
418
+ self.proj_txt = torch.nn.Linear(txt_dim, channels) if use_proj_txt else torch.nn.Identity()
419
+ assert use_proj_txt or (txt_dim == channels), 'text dimension must equal channels when not using projection'
420
+
421
+ self.attn_blocks = torch.nn.ModuleList(
422
+ [AttentionBlock(channels, head_dim, expansion, False, use_swiglu,
423
+ norm_type, num_heads, num_kv_heads, use_qk_norm, use_post_norm, use_bias, hf_style_rope)
424
+ for _ in range(num_layers)])
425
+ self.use_final_norm = use_final_norm
426
+ if use_final_norm:
427
+ self.final_norm = RMSNorm(channels)
428
+
429
+ self.use_softplus = use_softplus
430
+ self.permutation = permutation
431
+ self.use_checkpoint = use_checkpoint
432
+ self.use_checkpoint_mlp = use_checkpoint_mlp
433
+ self.use_sos = use_sos
434
+ self.soft_clip = soft_clip
435
+ self.local_attn_window = local_attn_window
436
+ self.block_masks = {} # for local attention
437
+
438
+ # ---- DEPRECATED: kept only for checkpoint compatibility; do not pass this mask, so flash attention stays enabled ---- #
439
+ self.register_buffer('attn_mask', torch.tril(torch.ones(pt_seq_len ** 2 + txt_size, pt_seq_len ** 2 + txt_size)))
440
+
441
+ def get_freqs_cis(self, x, y, rope):
442
+ # get the input shape
443
+ h, w = x.size(-3), x.size(-2)
444
+ d = x.size(1) if x.dim() == 5 else 0
445
+ txt_size = y.size(1) if self.txt_size > 0 and y is not None else 0
446
+
447
+ if not rope.is_1d: # prepare 2D RoPE
448
+ if self.txt_size > 0 or d > 0: # prepare 3D RoPE
449
+ if self.txt_dim > 0: # text is conditioned
450
+ pos = get_positions(h, w, txt_size, rope.pt_seq_len, d, mode='3d')
451
+ else: # text is not conditioned
452
+ pos = get_positions(h, w, 0, rope.pt_seq_len, d, mode='3d')
453
+ else:
454
+ pos = get_positions(h, w, 0, rope.pt_seq_len, mode='2d')
455
+ else: # prepare 1D RoPE
456
+ pos = get_positions(h, w, txt_size, rope.pt_seq_len, mode='1d')
457
+ return rope(pos.type_as(x))
458
+
459
+ def get_sos_embed(self, x):
460
+ sos_embed = self.sos_embed.expand(x.size(0), -1, -1)
461
+ return sos_embed
462
+
463
+ def get_prepared(self, x):
464
+ # input, output, freqs_cis
465
+ x_in = x.clone()
466
+ if self.use_sos: # add SOS token, predict the first token sos->x_in[0]
467
+ x = torch.cat([self.get_sos_embed(x), x[:, :-1]], dim=1)
468
+ return x_in, x
469
+
470
+ def get_proj_in(self, x):
471
+ x = self.proj_in(x)
472
+ return x
473
+
474
+ def get_proj_out(self, x):
475
+ x = self.proj_out(x)
476
+ if hasattr(self, "soft_clip") and self.soft_clip > 0:
477
+ x = self.soft_clip * torch.tanh(x / self.soft_clip)
478
+ return x
479
+
480
+ def get_local_window_mask(self, x, y):
481
+ _, T, H, W, _ = x.shape
482
+ L = y.size(1) if y is not None else 0
483
+ B = H * W
484
+ N = T * B
485
+ S = L + N
486
+ G = self.local_attn_window
487
+
488
+ def mask(q, k):
489
+ return (k <= q) & ((k < L) | ((k - L) // B > (q - L) // B - G))
490
+
491
+ return mask(torch.arange(S, device=x.device)[:, None], torch.arange(S, device=x.device)[None, :])
492
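+ # [Added note, not in the original code] The mask above is causal (k <= q) and additionally lets
+ # every query attend to all L text tokens plus only those keys whose frame index (computed as
+ # (k - L) // B with B patches per frame) lies within the last `local_attn_window` frames; this is
+ # also what bounds the KV-cache length in initialize_kv_cache below.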
+
493
+ def initialize_kv_cache(self, kv_cache, x, freqs_cis, reuse_kv_cache=False):
494
+ if self.local_attn_window is not None and self.local_attn_window > 0:
495
+ video_frame_size = x.size(-3) * x.size(-2)
496
+ kv_cache_length = self.local_attn_window * video_frame_size
497
+ kv_cache_length += self.txt_size if self.txt_dim > 0 else 0
498
+ kv_cache.meta_data.update(
499
+ {"frame_size": video_frame_size, "txt_size": self.txt_size + 1 if self.txt_dim > 0 else 0})
500
+ else:
501
+ kv_cache_length = freqs_cis.size(0)
502
+
503
+ kv_cache_size = (x.size(0), self.kv_cache_size[0], kv_cache_length, self.kv_cache_size[1])
504
+ if kv_cache.is_empty:
505
+ kv_cache.initialize(len(self.attn_blocks), *kv_cache_size)
506
+ kv_cache.to(x.device, x.dtype)
507
+ else:
508
+ target_size = kv_cache_size[-2]
509
+ if reuse_kv_cache:
510
+ target_size = target_size - kv_cache.kv_index[0]
511
+ kv_cache.extend_length(target_size)
512
+ return kv_cache
513
+
514
+ def forward(self, x: torch.Tensor | List[torch.Tensor], y: torch.Tensor | None = None, rope=None, kv_cache=None, guidance=None):
515
+ freqs_cis = self.get_freqs_cis(x, y, rope) if rope is not None else None
516
+ attn_mask = None
517
+ if kv_cache is not None:
518
+ kv_cache = self.initialize_kv_cache(kv_cache, x, freqs_cis)
519
+
520
+ x = self.permutation(x)
521
+ pos_embed = self.permutation(self.pos_embed, dim=0) if self.pos_embed is not None else None
522
+
523
+ # prepare input
524
+ x_in, x = self.get_prepared(x)
525
+ if kv_cache is not None:
526
+ kv_cache.register_prefix_cache(x_in)
527
+
528
+ # input projection
529
+ x = self.get_proj_in(x)
530
+ if pos_embed is not None:
531
+ x = x + pos_embed
532
+
533
+ # conditioning
534
+ if self.txt_dim > 0:
535
+ y = self.proj_txt(y)
536
+ else:
537
+ y = None
538
+
539
+ # main block
540
+ for it, block in enumerate(self.attn_blocks):
541
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
542
+
543
+ # Frequency-based checkpointing strategy:
544
+ # - Checkpoint attention every use_checkpoint blocks (if use_checkpoint > 0)
545
+ # - Checkpoint MLP every use_checkpoint_mlp blocks (if provided), otherwise every use_checkpoint blocks
546
+ checkpoint_attn = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
547
+ if self.use_checkpoint_mlp is not None:
548
+ checkpoint_mlp = self.training and self.use_checkpoint_mlp > 0 and ((it + 1) % self.use_checkpoint_mlp == 0)
549
+ else:
550
+ checkpoint_mlp = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
551
+
552
+ x, y = block(x, y, attn_mask, 1.0, None, freqs_cis, _kv_cache,
553
+ checkpoint_attn=checkpoint_attn,
554
+ checkpoint_mlp=checkpoint_mlp)
555
+
556
+ # final norm
557
+ if self.use_final_norm:
558
+ x, y = self.final_norm(x), (self.final_norm(y) if y is not None else None)
559
+
560
+ x = self.get_proj_out(x)
561
+ if not self.use_sos: # no SOS token, we need to shift the sequence
562
+ x = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1)
563
+ xa, xb = x.chunk(2, dim=-1)
564
+
565
+ # Store original dtype for output conversion
566
+ original_dtype = xa.dtype
567
+
568
+ # Convert to fp32 for numerical stability
569
+ xa, xb, x_in = xa.float(), xb.float(), x_in.float()
570
+ if not self.use_softplus:
571
+ xa = xa.exp()
572
+ else:
573
+ xa = F.softplus(xa + INV_SOFTPLUS_1)
574
+ if guidance is not None and guidance > 0:
575
+ xb, xa = self.guidance(xa, xb, guidance, 1.0, 'ab')
576
+
577
+ # NOTE: this "scale" is in fact 1/sigma, not sigma
578
+ x = self.permutation((x_in - xb) / xa, inverse=True)
579
+ logdet = -torch.log(xa) # keep all the dimensions
580
+
581
+ # Convert back to original precision
582
+ x = x.to(original_dtype)
583
+ return x, y, logdet
584
+
585
+ def guidance(self, za, zb, guidance, r=1.0, guide_what='ab'):
586
+ za, za_u = [torch.cat([a, a]) for a in za.chunk(2, dim=0)]
587
+ zb, zb_u = [torch.cat([a, a]) for a in zb.chunk(2, dim=0)]
588
+ g = r * guidance
589
+
590
+ def logits_guided(mu_c, sigma_c, mu_u, sigma_u, w):
591
+ # inspired from: (1+w) * logP_cond - w * logP_uncond
592
+ # sigma_c = torch.minimum(sigma_c, sigma_u)
593
+ s = (sigma_c / sigma_u).clip(max=1.0).square()
594
+ sigma_eff = sigma_c / (1 + w - w * s).sqrt()
595
+ mu_eff = ((1 + w) * mu_c - (w * s) * mu_u) / (1 + w - w * s)
596
+ return mu_eff, sigma_eff
597
+
598
+ def original_guidance(mu_c, sigma_c, mu_u, sigma_u, w):
599
+ if 'a' in guide_what:
600
+ sigma_c = sigma_c + g * (sigma_c - sigma_u)
601
+ if 'b' in guide_what:
602
+ mu_c = mu_c + g * (mu_c - mu_u)
603
+ return mu_c, sigma_c
604
+
605
+ #zb, za = original_guidance(zb, za, zb_u, za_u, guidance)
606
+ zb, za = logits_guided(zb, za, zb_u, za_u, guidance)
607
+ return zb, za
608
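+ # [Added note, not in the original code] Why `logits_guided` above has this form: for Gaussian
+ # densities, (1 + w) * log N(x; mu_c, sigma_c^2) - w * log N(x; mu_u, sigma_u^2) is, up to a
+ # constant, again a Gaussian log-density with
+ #     1 / sigma_eff^2 = (1 + w) / sigma_c^2 - w / sigma_u^2
+ #     mu_eff / sigma_eff^2 = (1 + w) * mu_c / sigma_c^2 - w * mu_u / sigma_u^2
+ # Writing s = (sigma_c / sigma_u)^2 (clipped at 1 so the effective variance stays positive) gives
+ # exactly sigma_eff = sigma_c / sqrt(1 + w - w * s) and
+ # mu_eff = ((1 + w) * mu_c - w * s * mu_u) / (1 + w - w * s), as implemented above.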
+
609
+ def reverse_step(
610
+ self, x: torch.Tensor, t: int, kv_cache: KVCache,
611
+ pos_embed: torch.Tensor | None = None, y: torch.Tensor | None = None,
612
+ attn_temp: float = 1.0, freqs_cis=None
613
+ ) -> tuple[torch.Tensor, torch.Tensor]:
614
+ # Store original dtype for sampling tensor
615
+ original_dtype = x.dtype
616
+
617
+ if self.use_sos: # get i-th patch but keep the sequence dimension
618
+ x_in = self.get_sos_embed(x[:, :1]) if t == 0 else x[:, t - 1 : t]
619
+ else:
620
+ x_in = x[:, t : t + 1]
621
+
622
+ # Convert to model's dtype for neural network computation
623
+ if hasattr(self.proj_in, 'weight'):
624
+ target_dtype = self.proj_in.weight.dtype
625
+ x_in = x_in.to(target_dtype)
626
+
627
+ x = self.get_proj_in(x_in)
628
+
629
+ # if positional embedding
630
+ if pos_embed is not None:
631
+ x = x + pos_embed[t: t+1]
632
+
633
+ # main block
634
+ for i, block in enumerate(self.attn_blocks):
635
+ x, _ = block(x, None, attn_temp=attn_temp, freqs_cis=freqs_cis, kv_cache=partial(kv_cache, i))
636
+
637
+ # final norm
638
+ if self.use_final_norm:
639
+ x = self.final_norm(x)
640
+
641
+ x = self.get_proj_out(x)
642
+ xa, xb = x.chunk(2, dim=-1)
643
+
644
+ # Convert back to original dtype for sampling computations
645
+ return xa.to(original_dtype), xb.to(original_dtype)
646
+
647
+ def reverse_step_condition(self, y, kv_cache, pos_embed=None, attn_temp: float = 1.0, freqs_cis=None):
648
+ # Convert to model's dtype for neural network computation
649
+ if hasattr(self.proj_txt, 'weight'):
650
+ target_dtype = self.proj_txt.weight.dtype
651
+ y = y.to(target_dtype)
652
+
653
+ y = self.proj_txt(y)
654
+ for i, block in enumerate(self.attn_blocks):
655
+ _, y = block(None, y, attn_temp=attn_temp, freqs_cis=freqs_cis, kv_cache=partial(kv_cache, i))
656
+ return y
657
+
658
+ def reverse(
659
+ self,
660
+ z: torch.Tensor,
661
+ y: torch.Tensor | None = None,
662
+ guidance: float = 0,
663
+ guide_what: str = 'ab',
664
+ attn_temp: float = 1.0,
665
+ annealed_guidance: bool = False,
666
+ rope=None,
667
+ verbose=False,
668
+ kv_cache: KVCache=KVCache(),
669
+ **unused_kwargs
670
+ ) -> torch.Tensor:
671
+ # Ensure sampling tensors are in float32 for numerical stability
672
+ original_dtype = z.dtype
673
+ z = z.float()
674
+
675
+ freqs_cis = self.get_freqs_cis(z, y, rope) if rope is not None else None
676
+ if guidance > 0:
677
+ z = torch.cat([z, z], 0)
678
+
679
+ # kv cache
680
+ reuse_kv_cache = kv_cache.prefix_cache is not None and kv_cache.kv_index[0] > 0
681
+ kv_cache = self.initialize_kv_cache(kv_cache, z, freqs_cis, reuse_kv_cache)
682
+
683
+ # permute the input
684
+ z = self.permutation(z)
685
+ pos_embed = self.permutation(self.pos_embed, dim=0) if self.pos_embed is not None else None
686
+
687
+ # run additional text condition, results will be used in KV cache.
688
+ if self.txt_dim > 0:
689
+ if not reuse_kv_cache:
690
+ self.reverse_step_condition(y, kv_cache, pos_embed, attn_temp, freqs_cis)
691
+ txt_size = y.size(1) if self.txt_dim > 0 else 0
692
+
693
+ # run the reverse process
694
+ x = z.clone()
695
+ if reuse_kv_cache:
696
+ x[:, :kv_cache.prefix_cache.size(1)] = kv_cache.prefix_cache # fill the prefix cache
697
+
698
+ T = x.size(1) - 1 if not self.use_sos else x.size(1)
699
+ for t in tqdm.trange(T, disable=not verbose, desc='Sub-flow Sampling', leave=False):
700
+ if reuse_kv_cache and kv_cache.kv_index[0] > t + txt_size:
701
+ continue
702
+ za, zb = self.reverse_step(x, t, kv_cache, pos_embed, y, attn_temp, freqs_cis)
703
+ # Ensure sampling computations stay in float32
704
+ za, zb = za.float(), zb.float()
705
+ if not self.use_softplus:
706
+ za, zb = za.exp().squeeze(1), zb.squeeze(1)
707
+ else:
708
+ za, zb = F.softplus(za + INV_SOFTPLUS_1).squeeze(1), zb.squeeze(1)
709
+
710
+ if guidance > 0 and guide_what:
711
+ r = (t + 1) / T if annealed_guidance else 1.0
712
+ zb, za = self.guidance(za, zb, guidance, r, guide_what)
713
+ if self.use_sos:
714
+ x[:, t] = z[:, t] * za + zb
715
+ else:
716
+ x[:, t + 1] = z[:, t + 1] * za + zb
717
+
718
+ if guidance > 0:
719
+ x = x.chunk(2, dim=0)[0]
720
+ kv_cache.remove_negative_cache() # remove the second half of the cache
721
+
722
+ x = self.permutation(x, inverse=True)
723
+ # Convert back to original dtype if needed
724
+ return x.to(original_dtype)
725
+
726
+ def jacobi(self,
727
+ z: torch.Tensor,
728
+ y: torch.Tensor | None = None,
729
+ guidance: float = 0,
730
+ rope=None,
731
+ kv_cache=None,
732
+ verbose=False,
733
+ jacobi_block_size: int = 32,
734
+ jacobi_max_iter: int = 32,
735
+ jacobi_th: float = 0.001,
736
+ context_length: int = None,
737
+ **unused_kwargs) -> torch.Tensor:
738
+ assert self.use_sos, "Jacobi iteration requires SOS token to be used"
739
+ assert self.pos_embed is None, "Jacobi iteration does not support positional embedding"
740
+
741
+ # Ensure sampling tensors are in float32 for numerical stability
742
+ original_dtype = z.dtype
743
+ z = z.float()
744
+
745
+ freqs_cis = self.get_freqs_cis(z, y, rope) if rope is not None else None
746
+ if guidance > 0:
747
+ z = torch.cat([z, z], 0)
748
+ # kv cache
749
+ reuse_kv_cache = kv_cache.prefix_cache is not None and kv_cache.kv_index[0] > 0
750
+ kv_cache = self.initialize_kv_cache(kv_cache, z, freqs_cis, reuse_kv_cache)
751
+ video_length = z.size(1) if z.dim() == 5 else 1
752
+
753
+ # permute the input
754
+ z = self.permutation(z)
755
+
756
+ # prepare input
757
+ x_full = torch.cat([self.get_sos_embed(z), z.clone()], dim=1)
758
+ if reuse_kv_cache:
759
+ x_full[:, 1: kv_cache.prefix_cache.size(1) + 1] = kv_cache.prefix_cache # fill the prefix cache
760
+
761
+ # conditioning
762
+ if self.txt_dim > 0:
763
+ if not reuse_kv_cache:
764
+ self.reverse_step_condition(y, kv_cache, freqs_cis=freqs_cis)
765
+
766
+ txt_size = y.size(1) if self.txt_dim > 0 else 0
767
+ video_frame_size = z.size(1) // video_length
768
+ start_idx = 0
769
+ if reuse_kv_cache:
770
+ start_idx = kv_cache.kv_index[0] - txt_size # start from the last cached index
771
+ prog_bar = tqdm.tqdm(total=z.size(1), disable=not verbose, desc='Block-wise Jacobi Iteration', leave=False)
772
+ prog_bar.update(start_idx)
773
+
774
+ local_attn_window = self.local_attn_window * video_frame_size if self.local_attn_window is not None else None
775
+ target_frame_size = z.size(1) if local_attn_window is None else min(z.size(1), local_attn_window)
776
+ context_size = None if local_attn_window is None else context_length * video_frame_size
777
+ while target_frame_size <= z.size(1):
778
+ while start_idx < target_frame_size:
779
+ chunk_size = jacobi_block_size if start_idx <= video_frame_size else jacobi_block_size * 4
780
+ local_done = torch.zeros((), dtype=torch.bool, device=x_full.device)
781
+ for i in tqdm.tqdm(range(jacobi_max_iter), disable=True, desc='Jacobi Iteration', leave=False):
782
+ if start_idx + chunk_size >= target_frame_size:
783
+ chunk_size = target_frame_size - start_idx
784
+ if i == 0 and start_idx > video_frame_size: # optionally initialize the current chunk from the corresponding tokens of the previous frame
785
+ x = x_full[:, start_idx - video_frame_size: start_idx + chunk_size - video_frame_size]
786
+ else:
787
+ x = x_full[:, start_idx: start_idx + chunk_size]
788
+
789
+ # main forward - convert to model dtype for neural network computation
790
+ if hasattr(self.proj_in, 'weight'):
791
+ target_dtype = self.proj_in.weight.dtype
792
+ x = x.to(target_dtype)
793
+
794
+ x = self.get_proj_in(x)
795
+ for it, block in enumerate(self.attn_blocks):
796
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
797
+ x = block(x, None, freqs_cis=freqs_cis, kv_cache=_kv_cache)[0]
798
+
799
+ if self.use_final_norm:
800
+ x = self.final_norm(x)
801
+ x = self.get_proj_out(x)
802
+ xa, xb = x.chunk(2, dim=-1)
803
+
804
+ # Convert back to float32 for sampling computations
805
+ xa, xb = xa.float(), xb.float()
806
+ if not self.use_softplus:
807
+ xa = xa.exp()
808
+ else:
809
+ xa = F.softplus(xa + INV_SOFTPLUS_1)
810
+ if guidance > 0:
811
+ xb, xa = self.guidance(xa, xb, guidance, 1.0, 'ab')
812
+
813
+ # compute the Jacobi Iteration - all in float32
814
+ new_x = xb + xa * z[:, start_idx: start_idx+chunk_size]
815
+ diff = ((new_x - x_full[:, start_idx+1: start_idx+1+chunk_size]) ** 2).mean() / (new_x ** 2).mean()
816
+ x_full[:, start_idx+1: start_idx+1+chunk_size] = new_x
817
+ if diff < jacobi_th or i == jacobi_max_iter - 1: # converged, or final iteration reached: mark this rank as done
818
+ local_done.fill_(1)
819
+ global_done = local_done.clone()
820
+ torch.distributed.all_reduce(global_done, op=torch.distributed.ReduceOp.MIN)
821
+ if int(global_done.item()) == 1:
822
+ break
823
+
824
+ kv_cache.backward_in_time(chunk_size)
825
+ start_idx += chunk_size
826
+ prog_bar.update(chunk_size)
827
+
828
+ if target_frame_size >= z.size(1):
829
+ break
830
+
831
+ target_frame_size += local_attn_window - context_size if local_attn_window is not None else video_frame_size
832
+ target_frame_size = min(target_frame_size, z.size(1))
833
+
834
+ # re-encode the context with attention blocks
835
+ print(f're-encoding the context {start_idx+1-context_size}:{start_idx+1}')
836
+ kv_cache.reset_kv_index()
837
+ if self.txt_dim > 0:
838
+ self.reverse_step_condition(y, kv_cache, freqs_cis=freqs_cis)
839
+ x_context = x_full[:, start_idx+1-context_size: start_idx+1]
840
+ x_context_in, x_context = self.get_prepared(x_context)
841
+ x_context = self.get_proj_in(x_context)
842
+ for it, block in enumerate(self.attn_blocks):
843
+ _kv_cache = partial(kv_cache, it) if kv_cache is not None else None
844
+ x_context = block(x_context, None, freqs_cis=freqs_cis, kv_cache=_kv_cache)[0]
845
+
846
+ x = x_full[:, 1:] # drop the SOS token
847
+ if guidance > 0:
848
+ x = x.chunk(2, dim=0)[0] # keep only the conditional half (drop the CFG duplicate)
849
+ x = self.permutation(x, inverse=True)
850
+ # Convert back to original dtype if needed
851
+ return x.to(original_dtype)
852
+
853
+
854
+ class IdentityBlock(MetaBlock):
855
+ def __init__(self, *args, **kwargs):
856
+ super(MetaBlock, self).__init__()
857
+
858
+ def forward(self, x, y=None, rope=None, **unused):
859
+ return x, y, x.new_zeros(x.size(0))
860
+
861
+ def reverse(self,
862
+ z: torch.Tensor,
863
+ y: torch.Tensor | None = None,
864
+ guidance: float = 0,
865
+ guide_what: str = 'ab',
866
+ attn_temp: float = 1.0,
867
+ annealed_guidance: bool = False,
868
+ rope=None,
869
+ verbose=False,
870
+ kv_cache: KVCache=KVCache(), **unused):
871
+ # Preserve original dtype
872
+ return z
873
+
874
+ def jacobi(self,
875
+ z: torch.Tensor,
876
+ y: torch.Tensor | None = None,
877
+ guidance: float = 0,
878
+ rope=None,
879
+ kv_cache=None,
880
+ verbose=False,
881
+ jacobi_block_size: int = 64,
882
+ jacobi_th: float = 0.005, **unused_kwargs) -> torch.Tensor:
883
+ return z
884
+
885
+
886
+ class NonCausalBlock(MetaBlock):
887
+ def __init__(
888
+ self,
889
+ in_channels: int,
890
+ channels: int,
891
+ img_size: int,
892
+ pt_seq_len: int | None = None,
893
+ num_layers: int = 8,
894
+ head_dim: int = 64,
895
+ num_heads: None | int = None,
896
+ num_kv_heads: None | int = None,
897
+ txt_size: int = 0,
898
+ txt_dim: int = 0,
899
+ expansion: float = 4,
900
+ use_rope: bool = False,
901
+ use_swiglu: bool = False,
902
+ use_qk_norm: bool = False,
903
+ use_post_norm: bool = False,
904
+ use_final_norm: bool = False,
905
+ use_bias: bool = True,
906
+ hf_style_rope: bool = False,
907
+ norm_type: str = "layer_norm",
908
+ use_checkpoint: int = 0,
909
+ use_checkpoint_mlp: int | None = None,
910
+ block_causal: int = 0,
911
+ window: int = None,
912
+ **unused_kwargs,
913
+ ):
914
+ super(MetaBlock, self).__init__()
915
+ out_channels = in_channels
916
+ self.proj_in = torch.nn.Linear(in_channels, channels)
917
+ self.proj_out = torch.nn.Linear(channels, out_channels)
918
+ torch.nn.init.constant_(self.proj_out.weight, 0)
919
+
920
+ self.txt_size = txt_size
921
+ self.img_size = img_size
922
+ self.txt_dim = txt_dim
923
+ self.pt_seq_len = pt_seq_len or img_size
924
+ self.block_causal = block_causal
925
+ self.window = window
926
+
927
+ # KV cache configurations
928
+ num_kv_heads = num_kv_heads or (num_heads or channels // head_dim)
929
+ self.kv_cache_size = [num_kv_heads, head_dim]
930
+ if txt_dim > 0:
931
+ self.proj_txt = torch.nn.Linear(txt_dim, channels)
932
+
933
+ self.attn_blocks = torch.nn.ModuleList(
934
+ [AttentionBlock(channels, head_dim, expansion, False, use_swiglu, norm_type, num_heads, num_kv_heads,
935
+ use_qk_norm, use_post_norm, use_bias, hf_style_rope, non_causal=True) for _ in range(num_layers)])
936
+ self.use_final_norm = use_final_norm
937
+ if use_final_norm:
938
+ self.final_norm = RMSNorm(channels)
939
+ self.use_checkpoint = use_checkpoint
940
+ self.use_checkpoint_mlp = use_checkpoint_mlp
941
+ self.block_masks = {} # for local attention
942
+
943
+ def get_local_window_mask(self, x, y):
944
+ _, T, H, W, _ = x.shape
945
+ L = y.size(1) if y is not None else 0
946
+ B = H * W
947
+ N = T * B
948
+ S = L + N
949
+ A = self.block_causal
950
+ G = self.window if self.window is not None else 10000
951
+
952
+ def mask(q, k):
953
+ return (k < L) | (
954
+ ((k - L) // B >= (q - L) // B + A - 1 - G) &
955
+ ((k - L) // B <= torch.relu(q - L) // B + A - 1)
956
+ )
957
+
958
+ return mask(torch.arange(S, device=x.device)[:, None], torch.arange(S, device=x.device)[None, :])
959
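+ # [Added note, not in the original code] Interpretation of the denoiser mask above: all L text
+ # tokens are always visible, and a query in frame f can additionally see keys in frames
+ # [f + block_causal - 1 - window, f + block_causal - 1], i.e. a sliding window of frames that
+ # ends (block_causal - 1) frames ahead of the query (effectively unbounded when `window` is None).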
+
960
+ def forward(self, x, y, rope, **unused):
961
+ freqs_cis = self.get_freqs_cis(x, y, rope) if rope is not None else None
962
+ if self.block_causal > 0 and x.dim() == 5:
963
+ attn_mask = self.get_local_window_mask(x, y if self.txt_dim > 0 else None)
964
+ else:
965
+ attn_mask = None
966
+
967
+ if x.dim() == 5: # video input
968
+ N, H, W, x = x.size(1), x.size(2), x.size(3), rearrange(x, 'b t h w c -> b (t h w) c') # flatten x
969
+ else:
970
+ N, H, W, x = 0, x.size(1), x.size(2), rearrange(x, 'b h w c -> b (h w) c') # flatten x
971
+
972
+ x = self.get_proj_in(x)
973
+ y = self.proj_txt(y) if self.txt_dim > 0 else None
974
+
975
+ for it, block in enumerate(self.attn_blocks):
976
+ # Frequency-based checkpointing strategy:
977
+ # - Checkpoint attention every use_checkpoint blocks (if use_checkpoint > 0)
978
+ # - Checkpoint MLP every use_checkpoint_mlp blocks (if provided), otherwise every use_checkpoint blocks
979
+ checkpoint_attn = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
980
+ if self.use_checkpoint_mlp is not None:
981
+ checkpoint_mlp = self.training and self.use_checkpoint_mlp > 0 and ((it + 1) % self.use_checkpoint_mlp == 0)
982
+ else:
983
+ checkpoint_mlp = self.training and self.use_checkpoint > 0 and ((it + 1) % self.use_checkpoint == 0)
984
+
985
+ x, y = block(x, y, attn_mask, 1.0, None, freqs_cis,
986
+ checkpoint_attn=checkpoint_attn, checkpoint_mlp=checkpoint_mlp)
987
+
988
+ if self.use_final_norm:
989
+ x = self.final_norm(x)
990
+ x = self.get_proj_out(x)
991
+ if N > 0:
992
+ x = rearrange(x, 'b (t h w) d -> b t h w d', t=N, h=H, w=W)
993
+ else:
994
+ x = rearrange(x, 'b (h w) d -> b h w d', h=H, w=W)
995
+ return x
996
+
997
+
998
+ class Model(torch.nn.Module):
999
+ def __init__(
1000
+ self,
1001
+ in_channels: int,
1002
+ img_size: int,
1003
+ patch_size: int,
1004
+ channels: int,
1005
+ num_blocks: int,
1006
+ layers_per_block: List[int],
1007
+ head_dim: int = 64,
1008
+ num_heads: None | int = None,
1009
+ num_kv_heads: None | int = None,
1010
+ rope: bool = False,
1011
+ pt_seq_len: None | int = None,
1012
+ sos: bool = False,
1013
+ txt_size: int = 0,
1014
+ txt_dim: int = 0,
1015
+ cond_top_only: bool = False,
1016
+ use_softplus: bool = False,
1017
+ use_swiglu: bool = False,
1018
+ use_bias: bool = True,
1019
+ use_qk_norm: bool = False,
1020
+ use_post_norm: bool = False,
1021
+ use_final_norm: bool = False,
1022
+ hf_style_rope: bool = False,
1023
+ norm_type: str = "layer_norm",
1024
+ use_checkpoint: int = 0,
1025
+ use_checkpoint_mlp: int | None = None,
1026
+ use_pretrained_lm: str | None = None,
1027
+ use_mm_attn: bool = False,
1028
+ soft_clip: float = 0,
1029
+ seq_order: str = "R2L",
1030
+ learnable_self_denoiser: bool = False,
1031
+ conditional_denoiser: bool = False,
1032
+ temporal_causal: int = 0,
1033
+ top_block_channels: int = None, # If specified, top block uses different size
1034
+ shallow_block_local: bool = False, # If True, shallow blocks only constrained within a frame
1035
+ denoiser_window: int = None, # If specified, use local attention in the denoiser with given window size
1036
+ local_attn_window: int = None, # If specified, use local attention in all blocks with given window size
1037
+ **unused_kwargs,
1038
+ ):
1039
+ super().__init__()
1040
+ self.img_size = img_size
1041
+ self.in_channels = in_channels
1042
+ self.patch_size = patch_size
1043
+ self.pt_seq_len = pt_seq_len or img_size // patch_size
1044
+ self.num_patches = self.pt_seq_len ** 2
1045
+ self.use_rope = rope
1046
+ self.use_sos = sos
1047
+ self.use_softplus = use_softplus
1048
+ self.cond_top_only = cond_top_only
1049
+ self.seq_order = seq_order
1050
+ self.temporal_causal = temporal_causal
1051
+ self.top_block_channels = top_block_channels or channels
1052
+ self.shallow_block_local = shallow_block_local
1053
+ self.expansion_init_std = 0.02
1054
+ assert (not local_attn_window) or shallow_block_local, 'local_attn_window requires shallow_block_local'
1055
+ assert (not shallow_block_local) or self.cond_top_only, 'shallow_block_local requires cond_top_only'
1056
+ assert (not self.cond_top_only) or (txt_size > 0), 'cond_top_only requires txt_size > 0'
1057
+ assert (seq_order == 'L2R') or (temporal_causal == 0), 'seq_order must be L2R if temporal causal is True'
1058
+ permutations = [PermutationIdentity(self.num_patches), PermutationFlip(self.num_patches)] if temporal_causal == 0 else \
1059
+ [PermutationIdentity(self.num_patches), PermutationFlipInBlock(self.num_patches)]
1060
+
1061
+ blocks = []
1062
+ if len(layers_per_block) == 1:
1063
+ layers_per_block = [layers_per_block[0]] * num_blocks
1064
+
1065
+ base_kwargs = dict(
1066
+ in_channels=in_channels * patch_size**2,
1067
+ channels=channels,
1068
+ img_size=img_size // patch_size,
1069
+ pt_seq_len=self.pt_seq_len,
1070
+ txt_size=txt_size,
1071
+ use_rope=self.use_rope, hf_style_rope=hf_style_rope, use_sos=self.use_sos,
1072
+ use_softplus=self.use_softplus,
1073
+ use_swiglu=use_swiglu, use_qk_norm=use_qk_norm,
1074
+ use_post_norm=use_post_norm, use_final_norm=use_final_norm,
1075
+ use_bias=use_bias, norm_type=norm_type, num_heads=num_heads,
1076
+ num_kv_heads=num_kv_heads, head_dim=head_dim,
1077
+ use_checkpoint=use_checkpoint,
1078
+ use_checkpoint_mlp=use_checkpoint_mlp,
1079
+ soft_clip=soft_clip,
1080
+ )
1081
+ # bottom blocks
1082
+ for i in range(num_blocks-1):
1083
+ permutation = permutations[i % 2] if seq_order == 'R2L' else permutations[(i+1) % 2]
1084
+ Block = IdentityBlock if layers_per_block[i] == 0 else MetaBlock
1085
+ blocks.append(Block(permutation=permutation, num_layers=layers_per_block[i], txt_dim=0 if cond_top_only else txt_dim, **base_kwargs))
1086
+
1087
+ # top block
1088
+ gen_kwargs = copy.deepcopy(base_kwargs)
1089
+ if self.top_block_channels != channels:
1090
+ gen_kwargs['channels'] = self.top_block_channels
1091
+ if num_heads is None:
1092
+ gen_kwargs['num_heads'] = self.top_block_channels // head_dim
1093
+ if use_pretrained_lm is not None:
1094
+ gen_kwargs.update(eval(f"{use_pretrained_lm}_kwargs"))
1095
+ if use_mm_attn:
1096
+ gen_kwargs.update({"use_mm_attn": True}) # only top block will receive this
1097
+ else:
1098
+ gen_kwargs.update({"num_layers": layers_per_block[-1]})
1099
+
1100
+ permutation = permutations[(num_blocks-1) % 2] if seq_order == 'R2L' else permutations[(num_blocks) % 2]
1101
+ top_block = MetaBlock(permutation=permutation, txt_dim=txt_dim, local_attn_window=local_attn_window, **gen_kwargs)
1102
+ blocks.append(top_block)
1103
+
1104
+ # put together
1105
+ self.blocks = torch.nn.ModuleList(blocks)
1106
+
1107
+ # Self-denoiser
1108
+ if learnable_self_denoiser:
1109
+ self.learnable_self_denoiser = NonCausalBlock(
1110
+ num_layers=8, block_causal=temporal_causal, window=denoiser_window,
1111
+ txt_dim=0 if not conditional_denoiser else txt_dim,
1112
+ **base_kwargs)
1113
+
1114
+ # setup rotary embeddings
1115
+ if self.use_rope:
1116
+ self.feat_rope = VisionRotaryEmbeddingFast(
1117
+ dim=base_kwargs['head_dim'] // 2, pt_seq_len=base_kwargs['pt_seq_len'], latent_len=txt_size)
1118
+
1119
+ if use_pretrained_lm is not None: # using standard 1D RoPE
1120
+ self.feat_rope_gen = VisionRotaryEmbeddingFast(
1121
+ dim=gen_kwargs['head_dim'] // 2, pt_seq_len=gen_kwargs['pt_seq_len'], no_buffer=True, is_1d=True)
1122
+ else:
1123
+ self.feat_rope_gen = VisionRotaryEmbeddingFast(
1124
+ dim=gen_kwargs['head_dim'] // 2, pt_seq_len=gen_kwargs['pt_seq_len'], latent_len=txt_size, no_buffer=True)
1125
+ else:
1126
+ self.feat_rope = self.feat_rope_gen = None
1127
+
1128
+ # ----- DEPRECATED: not useful -------
1129
+ self.register_buffer('var', torch.ones(self.num_patches, in_channels * patch_size**2))
1130
+
1131
+ def patchify(self, x: List[torch.Tensor] | torch.Tensor, p: int | None = None) -> torch.Tensor:
1132
+ """Convert an image (N,C',H,W) to a sequence of patches (N,T,C')"""
1133
+ if len(x.shape) < 4:
1134
+ return x # no need patchify
1135
+ H, W = x.shape[-2], x.shape[-1]
1136
+ p = self.patch_size * p if p is not None else self.patch_size
1137
+ assert H % p == 0 and W % p == 0, "H and W must be divisible by patch_size"
1138
+ x = rearrange(x, '... c (h p1) (w p2) -> ... h w (p1 p2 c)', p1=p, p2=p)
1139
+ return x
1140
+
1141
+ def unpatchify(self, x: List[torch.Tensor] | torch.Tensor, p: int | None = None) -> torch.Tensor:
1142
+ """Convert a sequence of patches (N,T,C) to an image (N,C',H,W)"""
1143
+ if len(x.shape) < 4:
1144
+ return x # no need unpatchify
1145
+ p = self.patch_size * p if p is not None else self.patch_size
1146
+ H, W = x.shape[-3], x.shape[-2]
1147
+ return rearrange(x, '... h w (p1 p2 c) -> ... c (h p1) (w p2)', h=H, w=W, p1=p, p2=p)
1148
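+ # [Added note, not in the original code] Shape example for patchify/unpatchify above, with
+ # illustrative sizes: an input of shape (N, C, H, W) = (2, 16, 32, 32) and patch_size p = 2
+ # becomes (N, H/p, W/p, p*p*C) = (2, 16, 16, 64); unpatchify inverts this exactly.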
+
1149
+ def get_loss(self,
1150
+ z: torch.Tensor | List[torch.Tensor],
1151
+ logdets: torch.Tensor | List[torch.Tensor],
1152
+ weights: torch.Tensor | None = None,
1153
+ drop_first=False) -> dict[str, torch.Tensor]:
1154
+ if drop_first:
1155
+ z, logdets = z[:, 1:], [logdet[:, 1:] for logdet in logdets]
1156
+ loss_z = 0.5 * z.pow(2).mean(dim=tuple(range(1, z.dim())))
1157
+ loss_logdet = -sum([logdet.mean(dim=tuple(range(1, logdet.dim()))) for logdet in logdets])
1158
+ loss = loss_z + loss_logdet
1159
+ if weights is not None:
1160
+ loss = loss * weights
1161
+ loss = loss.mean()
1162
+ return {'loss': loss, 'loss_z': loss_z.detach().mean(), 'loss_logdet': loss_logdet.detach().mean()}
1163
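+ # [Added note, not in the original code] get_loss above is the standard normalizing-flow negative
+ # log-likelihood (per dimension, up to the constant 0.5 * log(2 * pi)):
+ #     loss = 0.5 * E[z^2] - E[sum_k log|det dz_k/dx|]
+ # i.e. a standard-Gaussian prior term on the final latent plus the accumulated per-block
+ # log-determinants stored in `logdets`.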
+
1164
+ def forward(
1165
+ self, x: torch.Tensor, y: torch.Tensor | None = None,
1166
+ reverse=False, kv_caches=None, denoiser=False, context=False, **kwargs
1167
+ ) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
1168
+ if context:
1169
+ return self.forward_context(x, y, kv_caches=kv_caches, **kwargs)
1170
+
1171
+ if reverse: # inference mode
1172
+ return self.reverse(x, y, kv_caches=kv_caches, **kwargs)
1173
+
1174
+ if denoiser: # forward with self-denoiser
1175
+ x = self.patchify(x)
1176
+ x = self.learnable_self_denoiser(x, y, self.feat_rope, **kwargs)
1177
+ return self.unpatchify(x)
1178
+
1179
+ logdets, outputs = [], []
1180
+ guidance = kwargs.get('guidance', 0)
1181
+
1182
+ # Bottom blocks
1183
+ x = self.patchify(x)
1184
+ outputs += [x]
1185
+ for it, block in enumerate(self.blocks[:-1]):
1186
+ if self.shallow_block_local and x.dim() == 5: # video input
1187
+ x = rearrange(x, 'b t h w c -> (b t) 1 h w c')
1188
+ x, _, logdet = block(x, y.chunk(2, dim=0)[0] if self.cond_top_only and guidance > 0 else y,
1189
+ self.feat_rope, kv_cache=kv_caches[-(it+1)] if kv_caches is not None else None)
1190
+ if self.shallow_block_local and x.dim() == 5: # video input
1191
+ x = rearrange(x, '(b t) 1 h w c -> b t h w c', b=outputs[0].size(0), t=outputs[0].size(1))
1192
+ logdet = rearrange(logdet, '(b t) l c -> b t l c', b=outputs[0].size(0), t=outputs[0].size(1))
1193
+ logdets += [logdet]
1194
+ outputs += x if isinstance(x, list) else [x]
1195
+
1196
+ # Top block
1197
+ x, y, logdet = self.blocks[-1](x, y, self.feat_rope_gen,
1198
+ kv_cache=kv_caches[0] if kv_caches is not None else None,
1199
+ guidance=guidance)
1200
+ outputs += [x]
1201
+ x = self.unpatchify(x)
1202
+ logdets += [logdet]
1203
+ return x, y, outputs, logdets
1204
+
1205
+ def forward_context(self, x: torch.Tensor, y: torch.Tensor | None = None, kv_caches: List[KVCache] | None = None, **kwargs):
1206
+ if kv_caches is None:
1207
+ kv_caches = [KVCache() for _ in range(len(self.blocks))]
1208
+ use_cfg = (x.size(0) * 2 == y.size(0)) if (y is not None and self.cond_top_only) else False
1209
+ if use_cfg:
1210
+ x = torch.cat([x, x], 0) # duplicate for classifier-free guidance generation
1211
+
1212
+ self.forward(x, y, kv_caches=kv_caches, **kwargs) # run once to fill the cache
1213
+
1214
+ if use_cfg:
1215
+ for kv in kv_caches[1:]:
1216
+ kv.remove_negative_cache() # remove negative cache except for the first block
1217
+ kv.prefix_cache = kv.prefix_cache.chunk(2, dim=0)[0] if kv.prefix_cache is not None else None
1218
+ return kv_caches
1219
+
1220
+ def reverse_deep(self,
1221
+ x: List[torch.Tensor] | torch.Tensor,
1222
+ y: torch.Tensor | None = None,
1223
+ guidance: float = 0,
1224
+ verbose: bool = False,
1225
+ kv_caches: List[KVCache] | None = None,
1226
+ jacobi: bool = False,
1227
+ need_caches: bool = False,
1228
+ seq: List[torch.Tensor] = [],
1229
+ **sampling_kwargs,):
1230
+ x = self.patchify(x)
1231
+ x = (self.blocks[-1].jacobi if jacobi else self.blocks[-1].reverse)(
1232
+ x, y, guidance, rope=self.feat_rope_gen, kv_cache=kv_caches[0], verbose=verbose, **sampling_kwargs)
1233
+ x = self.unpatchify(x)
1234
+ if not need_caches:
1235
+ kv_caches[0].delete()
1236
+ seq.append(x)
1237
+ return x
1238
+
1239
+ def reverse_shallow(self,
1240
+ x: List[torch.Tensor] | torch.Tensor,
1241
+ y: torch.Tensor | None = None,
1242
+ guidance: float = 0,
1243
+ verbose: bool = False,
1244
+ kv_caches: List[KVCache] | None = None,
1245
+ jacobi: bool = False,
1246
+ need_caches: bool = False,
1247
+ seq: List[torch.Tensor] = [],
1248
+ **sampling_kwargs,):
1249
+ x = self.patchify(x)
1250
+ for it, block in enumerate(reversed(self.blocks[:-1])):
1251
+ if self.shallow_block_local and x.dim() == 5: # video input
1252
+ x = rearrange(x, 'b t h w c -> (b t) 1 h w c')
1253
+ kv_caches[it+1]._is_empty = True
1254
+ kv_caches[it+1].prefix_cache = None
1255
+ x = (block.jacobi if jacobi else block.reverse)(
1256
+ x, y, guidance, rope=self.feat_rope, kv_cache=kv_caches[it+1], verbose=verbose, **sampling_kwargs)
1257
+ if self.shallow_block_local and x.dim() == 5: # video input
1258
+ x = rearrange(x, '(b t) 1 h w c -> b t h w c', b=seq[0].size(0), t=seq[0].size(1))
1259
+ seq.append(self.unpatchify(x))
1260
+ if not need_caches:
1261
+ kv_caches[it+1].delete()
1262
+ x = self.unpatchify(x)
1263
+ return x
1264
+
1265
+ def reverse(
1266
+ self,
1267
+ x: List[torch.Tensor] | torch.Tensor,
1268
+ y: torch.Tensor | None = None,
1269
+ guidance: float = 0,
1270
+ guide_top: int | None = None,
1271
+ return_sequence: bool = False,
1272
+ verbose: bool = False,
1273
+ kv_caches: List[KVCache] | None = None,
1274
+ jacobi: bool = False,
1275
+ **sampling_kwargs,
1276
+ ) -> torch.Tensor | list[torch.Tensor]:
1277
+ seq, need_caches, kv_caches = [x], (kv_caches is not None), kv_caches or [KVCache() for _ in range(len(self.blocks))]
1278
+
1279
+ # run the deep block first
1280
+ x = self.reverse_deep(x, y, guidance, verbose, kv_caches, jacobi, need_caches, seq, **sampling_kwargs)
1281
+
1282
+ # remove guidance if bottom is unconditional
1283
+ if (guide_top is not None or self.cond_top_only) and guidance > 0:
1284
+ guidance, y = 0, y.chunk(2, dim=0)[0]
1285
+
1286
+ # run the shallow blocks
1287
+ x = self.reverse_shallow(x, y, guidance, verbose, kv_caches, jacobi, need_caches, seq, **sampling_kwargs)
1288
+ return seq if return_sequence else x
1289
+
1290
+
1291
+ #################################################################################
1292
+ # TARFLow Configs #
1293
+ #################################################################################
1294
+
1295
+ def TarFlow_XL_1(**kwargs):
1296
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,10,10],
1297
+ channels=2048, patch_size=1, head_dim=64, rope=1, **kwargs)
1298
+
1299
+ def TarFlow_XL_2(**kwargs):
1300
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,10,10],
1301
+ channels=2048, patch_size=2, head_dim=64, rope=1, **kwargs)
1302
+
1303
+ def TarFlow_XXL_1(**kwargs):
1304
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,13,13],
1305
+ channels=3072, patch_size=1, head_dim=64, rope=1, **kwargs)
1306
+
1307
+ def TarFlow_XLv2_1(**kwargs): # 1.4B
1308
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,18],
1309
+ channels=2048, patch_size=1, head_dim=64, rope=1, **kwargs)
1310
+
1311
+ def TarFlow_XXLv2_1(**kwargs): # 4B
1312
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,24],
1313
+ channels=3072, patch_size=1, head_dim=64, rope=1, **kwargs)
1314
+
1315
+ def TarFlow_Gemma2B(**kwargs): # 2B
1316
+ return Model(num_blocks=6, layers_per_block=[2,2,2,2,2,26],
1317
+ channels=2304, patch_size=1, rope=1,
1318
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1319
+ use_swiglu=True, use_qk_norm=False, use_post_norm=True,
1320
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1321
+ num_heads=8, num_kv_heads=4, head_dim=256, **kwargs)
1322
+
1323
+
1324
+ # Pre-trained model configs
1325
+ pre_model_configs = {
1326
+ "TarFlow_XL_1": TarFlow_XL_1,
1327
+ "TarFlow_XLv2_1": TarFlow_XLv2_1,
1328
+ "TarFlow_XL_2": TarFlow_XL_2,
1329
+ "TarFlow_XXL_1": TarFlow_XXL_1,
1330
+ "TarFlow_XXLv2_1": TarFlow_XXLv2_1,
1331
+ }
1332
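+ # [Added note, not in the original code] The factory functions above only pin down the
+ # architecture; data-dependent arguments still come from the training config. A hypothetical
+ # call (all values below are illustrative assumptions, not defaults from this repo):
+ #     model = pre_model_configs["TarFlow_XLv2_1"](
+ #         in_channels=16, img_size=32, txt_size=128, txt_dim=2304, sos=True)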
+
1333
+
1334
+ #################################################################################
1335
+ # Pretrained LLMs #
1336
+ #################################################################################
1337
+ gemma3_4b_kwargs = dict(
1338
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1339
+ use_swiglu=True, use_qk_norm=True, use_post_norm=True,
1340
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1341
+ num_heads=8, num_kv_heads=4, head_dim=256, channels=2560,
1342
+ num_layers=34, use_proj_txt=False)
1343
+
1344
+ gemma3_1b_kwargs = dict(
1345
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1346
+ use_swiglu=True, use_qk_norm=True, use_post_norm=True,
1347
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1348
+ num_heads=4, num_kv_heads=1, head_dim=256, channels=1152, expansion=6,
1349
+ num_layers=26, use_proj_txt=False)
1350
+
1351
+ gemma2_2b_kwargs = dict(
1352
+ use_rope=True, hf_style_rope=True, use_adaln=False,
1353
+ use_swiglu=True, use_qk_norm=False, use_post_norm=True,
1354
+ use_final_norm=True, use_bias=False, norm_type="rms_norm",
1355
+ num_heads=8, num_kv_heads=4, head_dim=256, channels=2304,
1356
+ num_layers=26, use_proj_txt=False)
utils/__init__.py ADDED
@@ -0,0 +1,96 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ STARFlow utilities package.
7
+
8
+ This package contains various utilities for STARFlow training and inference,
9
+ organized by functionality for better maintainability.
10
+ """
11
+
12
+ # Import everything from the original utils.py for backward compatibility
13
+ import warnings
14
+ warnings.filterwarnings('ignore', category=FutureWarning)
15
+
16
+ # Re-export everything from the original utils.py to maintain compatibility
17
+ import sys
18
+ import pathlib
19
+
20
+ # Add the parent directory to path to import the original utils
21
+ parent_dir = pathlib.Path(__file__).parent.parent
22
+ sys.path.insert(0, str(parent_dir))
23
+
24
+ # Import from new modular structure
25
+ from .common import (
26
+ load_model_config, preprocess_text, encode_text, drop_label, add_noise,
27
+ get_data, save_samples_unified, read_tsv, set_random_seed
28
+ )
29
+ from .model_setup import (
30
+ setup_transformer, setup_vae, VAE, setup_encoder,
31
+ LookupTableTokenizer, TextEmbedder, LabelEmbdder
32
+ )
33
+ from .training import (
34
+ CosineLRSchedule, Distributed, get_local_rank, parallelize_model,
35
+ save_model, save_optimizer, sync_ctx
36
+ )
37
+ from .inference import (
38
+ FID, IS, CLIP, Metrics,
39
+ self_denoise, apply_denoising, process_denoising, simple_denoising
40
+ )
41
+
42
+ # Define what gets exported when someone does "from utils import *"
43
+ __all__ = [
44
+ # Configuration
45
+ 'load_model_config',
46
+
47
+ # Text processing
48
+ 'preprocess_text',
49
+ 'encode_text',
50
+ 'drop_label',
51
+
52
+ # Noise
53
+ 'add_noise',
54
+
55
+ # Denoising
56
+ 'self_denoise',
57
+ 'apply_denoising',
58
+ 'process_denoising',
59
+ 'simple_denoising',
60
+
61
+ # Saving
62
+ 'save_samples_unified',
63
+
64
+ # Training
65
+ 'CosineLRSchedule',
66
+ 'Distributed',
67
+ 'set_random_seed',
68
+
69
+ # Metrics
70
+ 'FID',
71
+ 'IS',
72
+ 'CLIP',
73
+ 'Metrics',
74
+
75
+ # Models
76
+ 'setup_transformer',
77
+ 'setup_vae',
78
+ 'VAE',
79
+
80
+ # Encoders
81
+ 'setup_encoder',
82
+ 'LookupTableTokenizer',
83
+ 'TextEmbedder',
84
+ 'LabelEmbdder',
85
+ 'read_tsv',
86
+
87
+ # Distributed
88
+ 'parallelize_model',
89
+ 'save_model',
90
+ 'save_optimizer',
91
+ 'get_local_rank',
92
+ 'sync_ctx',
93
+
94
+ # Data
95
+ 'get_data',
96
+ ]
utils/common.py ADDED
@@ -0,0 +1,346 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Core utility functions for STARFlow.
7
+
8
+ This module contains essential functions for model configuration, text processing,
9
+ noise injection, and data handling. All functions are self-contained.
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ import pathlib
16
+ import argparse
17
+ import yaml
18
+ import random
19
+ import numpy as np
20
+ import csv
21
+ from typing import List, Optional, Union, Dict, Any
22
+ from einops import rearrange
23
+ from misc import dividable
24
+
25
+ import torchvision as tv
26
+ import wandb
27
+
28
+
29
+ # ==== Configuration Functions ====
30
+
31
+ def load_model_config(config_path: str) -> argparse.Namespace:
32
+ """Load model configuration from YAML file and merge with trainer arguments."""
33
+ from train import get_tarflow_parser # Import here to avoid circular imports
34
+
35
+ with open(config_path, 'r') as f:
36
+ model_configs = yaml.safe_load(f)
37
+
38
+ trainer_parser = get_tarflow_parser()
39
+ trainer_args = ""
40
+ for conf in model_configs['arguments']:
41
+ for key in conf:
42
+ trainer_args += f"--{key} {conf[key]} "
43
+
44
+ return trainer_parser.parse_args(trainer_args.split())
45
+
46
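+ # [Added note, not in the original code] load_model_config above expects the YAML to contain an
+ # `arguments` list of single-key mappings; each entry is rendered as "--key value" and fed to the
+ # trainer's argparse parser. A hypothetical config (flag names are illustrative only):
+ #     arguments:
+ #       - model: TarFlow_XLv2_1
+ #       - img_size: 256
+ # would be parsed as if the trainer were launched with "--model TarFlow_XLv2_1 --img_size 256".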
+
47
+ # ==== Text Processing Functions ====
48
+
49
+ def preprocess_text(text, use_template=False, aspect_ratio=None, fps=None, noise_std=None):
50
+ """Preprocess text with templates, aspect ratios, fps, and noise levels."""
51
+ modes = ['an image'] * len(text)
52
+ if fps is not None:
53
+ if isinstance(fps, torch.Tensor):
54
+ fps = [int(f) for f in fps.tolist()]
55
+ elif isinstance(fps, int):
56
+ fps = [fps] * len(text)
57
+ modes = ['a video' if f > 0 else 'an image' for f in fps]
58
+ text = [f"A video with {f} fps:\n{txt}\n" if f > 0 else f"An image:\n{txt}\n"
59
+ for txt, f in zip(text, fps)]
60
+
61
+ if noise_std is not None:
62
+ if isinstance(noise_std, torch.Tensor):
63
+ noise_std = [int(n * 1000) for n in noise_std.view(-1).tolist()]
64
+ elif isinstance(noise_std, float):
65
+ noise_std = [int(noise_std * 1000)] * len(text)
66
+ text = [f'Noise Level {n}:\n{txt}' for n, txt in zip(noise_std, text)]
67
+
68
+ if aspect_ratio is not None:
69
+ text = [f"{txt}\n in a {aspect_ratio} aspect ratio.\n" for txt in text]
70
+
71
+ if use_template:
72
+ TEMPLATE = "<start_of_turn>user\nPlease generate {mode} about: {prompt}<end_of_turn>\n"
73
+ TEMPLATE = TEMPLATE + "<start_of_turn>model\n"
74
+ text = [TEMPLATE.format(prompt=txt, mode=mode) for txt, mode in zip(text, modes)]
75
+ return text
76
+
77
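+ # [Added sketch, not part of the original utilities] A minimal example of what preprocess_text
+ # above produces; the prompt, fps and aspect ratio below are illustrative assumptions.
+ def _preprocess_text_example():
+     text = preprocess_text(["a cat playing piano"], fps=[8], aspect_ratio="16:9")
+     # -> ['A video with 8 fps:\na cat playing piano\n\n in a 16:9 aspect ratio.\n']
+     return text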
+
78
+ # Define helper classes that will be needed
79
+ class LookupTableTokenizer:
80
+ def __init__(self, vocab_file):
81
+ self.vocab = {l[0]: i for i, l in enumerate(read_tsv(f'configs/dataset/{vocab_file}'))}
82
+ self.empty_id = len(self.vocab)
83
+
84
+ def __len__(self):
85
+ return len(self.vocab)
86
+
87
+ def __call__(self, text):
88
+ return {'input_ids': torch.tensor([[self.vocab.get(t, self.empty_id)] for t in text], dtype=torch.long)}
89
+
90
+
91
+ class TextEmbedder(nn.Module):
92
+ def __init__(self, config):
93
+ super().__init__()
94
+ if hasattr(config, "text_config"): # Gemma3
95
+ self.config = config.text_config
96
+ self.vocab_size = config.image_token_index
97
+ else:
98
+ self.config = config
99
+ self.vocab_size = config.vocab_size
100
+ self.text_token_embedder = nn.Embedding(
101
+ self.vocab_size, self.config.hidden_size)
102
+ self.text_token_embedder.weight.requires_grad = False
103
+ self.normalizer = float(self.config.hidden_size) ** 0.5
104
+
105
+
106
+ class LabelEmbdder(nn.Module):
107
+ def __init__(self, num_classes):
108
+ super().__init__()
109
+ self.num_classes = num_classes
110
+ self.config = type('Config', (), {'hidden_size': num_classes + 1})()
111
+ self.Embedding = nn.Parameter(torch.eye(num_classes+1), requires_grad=False)
112
+
113
+ def forward(self, y):
114
+ return F.embedding(y, self.Embedding)
115
+
116
+
117
+ @torch.no_grad()
118
+ def encode_text(text_encoder, tokenizer, text, max_length, device, return_tokens=False, **kwargs):
119
+ """Encode text using the text encoder with preprocessing."""
120
+ text = preprocess_text(text, use_template=isinstance(text_encoder, TextEmbedder), **kwargs)
121
+ if isinstance(tokenizer, LookupTableTokenizer):
122
+ assert max_length == 1, "label embedding only supports max_length=1"
123
+ tokenized_outputs = tokenizer(text)
124
+ else:
125
+ tokenized_outputs = tokenizer(
126
+ text, padding="max_length", truncation=True, return_tensors="pt", max_length=max_length)
127
+ tokenized_outputs = {key: val.to(device) for key, val in tokenized_outputs.items()}
128
+ if isinstance(text_encoder, TextEmbedder) or isinstance(text_encoder, LabelEmbdder):
129
+ y = text_encoder(tokenized_outputs['input_ids'])
130
+ else:
131
+ y = text_encoder(**tokenized_outputs).last_hidden_state
132
+ y = y * tokenized_outputs['attention_mask'].unsqueeze(-1) # mask out padding
133
+ if return_tokens:
134
+ return y, tokenized_outputs
135
+ return y
136
+
137
+
138
+ # ==== Noise Functions ====
139
+
140
+ @torch.no_grad()
141
+ def add_noise(x, noise_std=0.3, noise_type='gaussian', cond_noise_level=False):
142
+ """Add noise to input tensor."""
143
+ if isinstance(x, list):
144
+ return zip(*[add_noise(xi, noise_std, noise_type) for xi in x])
145
+
146
+ # inject noise over images
147
+ if noise_type == 'gaussian':
148
+ noise = noise_std * torch.randn_like(x)
149
+ x = x + noise
150
+ elif noise_type == 'uniform':
151
+ # Uniform dequantization following standard normalizing flow practice
152
+ noise = torch.rand_like(x)
153
+ x = ((x + 1) * (255 / 2) + noise) / 256 * 2 - 1
154
+ else:
155
+ raise NotImplementedError
156
+ return x, noise
157
+
158
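+ # [Added note, not in the original code] Typical uses of add_noise above:
+ #     x_noisy, noise = add_noise(x, noise_std=0.3, noise_type='gaussian')  # additive Gaussian noise
+ #     x_deq, u = add_noise(x, noise_type='uniform')  # uniform dequantization of 8-bit data in [-1, 1]
+ # Both return the perturbed tensor together with the noise that was drawn.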
+
159
+ def drop_label(y, drop_prob=0.1):
160
+ """Randomly drop labels for classifier-free guidance training."""
161
+ return ["" if random.random() < drop_prob else yi for yi in y]
162
+
163
+
164
+ def save_samples_unified(samples: torch.Tensor,
165
+ save_dir: pathlib.Path,
166
+ filename_prefix: str = "samples",
167
+ epoch_or_iter: Optional[int] = None,
168
+ fps: int = 8,
169
+ dist=None,
170
+ wandb_log: bool = False,
171
+ wandb_step: Optional[int] = None,
172
+ grid_arrangement: str = "auto") -> None:
173
+ """
174
+ Unified function to save samples as images or videos.
175
+
176
+ Samples are expected to be roughly in [-1, 1]; they are clamped to that range and rescaled to [0, 1] before saving.
177
+
178
+ Args:
179
+ samples: Tensor with samples to save (can be [0,1] or [-1,1] range)
180
+ save_dir: Directory to save files
181
+ filename_prefix: Prefix for filename (e.g., "train_samples", "inference")
182
+ epoch_or_iter: Epoch or iteration number for filename
183
+ fps: FPS for video files
184
+ dist: Distributed training context (if available)
185
+ wandb_log: Whether to log to wandb
186
+ wandb_step: Step for wandb logging
187
+ grid_arrangement: How to arrange samples ("auto", "grid", "individual")
188
+ """
189
+ # Handle distributed gathering
190
+ if dist is not None:
191
+ samples = dist.gather_concat(samples.contiguous().detach())
192
+ should_save = dist.local_rank == 0
193
+ wandb_should_log = wandb_log and dist.rank == 0
194
+ else:
195
+ should_save = True
196
+ wandb_should_log = wandb_log
197
+
198
+ if not should_save:
199
+ return
200
+
201
+ # Create save directory
202
+ save_dir.mkdir(parents=True, exist_ok=True)
203
+ samples = samples.detach().cpu()
204
+ if samples.dim() == 5 and samples.size(1) == 1:
205
+ # If single-frame video, squeeze time dimension
206
+ samples = samples[:, 0]
207
+ normalized_samples = (samples.clamp(-1, 1) + 1) * 0.5
208
+
209
+ # Generate filename
210
+ if samples.dim() == 5:
211
+ filename = f"{filename_prefix}_{samples.size(1)}x{samples.size(3)}x{samples.size(4)}"
212
+ else:
213
+ filename = f"{filename_prefix}_{samples.size(2)}x{samples.size(3)}"
214
+ if epoch_or_iter is not None:
215
+ filename += f"_{epoch_or_iter:03d}" # suffix shared by image and video outputs
216
+ if samples.dim() == 5: # Video
217
+ filename += ".mp4"
218
+ else: # Image
219
+ filename += ".png"
220
+ file_path = save_dir / filename
221
+
222
+ if samples.dim() == 5: # Video: (B, T, C, H, W)
223
+ if grid_arrangement == "individual":
224
+ # Save individual videos
225
+ for idx in range(samples.size(0)):
226
+ video_data = (normalized_samples[idx] * 255).to(torch.uint8)
227
+ # torchvision.io.write_video expects (T, H, W, C)
228
+ # video_data shape is (T, C, H, W), so permute to (T, H, W, C)
229
+ video_data = video_data.permute(0, 2, 3, 1)
230
+ individual_path = save_dir / f"{filename_prefix}_video_{idx:03d}.mp4"
231
+ tv.io.write_video(str(individual_path), video_data, fps=fps)
232
+ else:
233
+ # Create video grid
234
+ grid_a = dividable(samples.size(0))
235
+ samples_grid = rearrange(
236
+ normalized_samples, '(a b) t c h w -> t (a h) (b w) c',
237
+ a=grid_a
238
+ )
239
+
240
+ tv.io.write_video(
241
+ str(file_path), (samples_grid * 255).to(torch.uint8),
242
+ fps=fps, video_codec='libx264', options={'crf': '10', 'preset': 'slow'}
243
+ )
244
+
245
+ # Wandb logging for video
246
+ if wandb_should_log:
247
+ wandb.log({f"{filename_prefix}_video": wandb.Video(str(file_path))}, step=wandb_step)
248
+
249
+ else: # Image: (B, C, H, W)
250
+ if grid_arrangement == "individual":
251
+ # Save individual images
252
+ for idx in range(samples.size(0)):
253
+ image_path = save_dir / f"{filename_prefix}_{idx:03d}.jpg"
254
+ tv.utils.save_image(
255
+ normalized_samples[idx:idx+1],
256
+ str(image_path), normalize=False
257
+ )
258
+ else:
259
+ # Save as grid
260
+ tv.utils.save_image(
261
+ normalized_samples,
262
+ str(file_path), normalize=False, nrow=dividable(samples.size(0))
263
+ )
264
+
265
+ # Wandb logging for image
266
+ if wandb_should_log:
267
+ wandb.log({f"{filename_prefix}": wandb.Image(str(file_path))}, step=wandb_step)
268
+
269
+ print(f'Saved samples to {file_path}')
270
+
271
+
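A hedged usage sketch for save_samples_unified, assuming this module is importable as utils.common (the layout implied by the relative imports in utils/inference.py and utils/model_setup.py): a 4-D batch in [-1, 1] is written as images, a 5-D (B, T, C, H, W) batch as video.

import pathlib
import torch
from utils.common import save_samples_unified   # assumed module path

fake = torch.rand(4, 3, 64, 64) * 2 - 1          # 4 images in [-1, 1]
save_samples_unified(fake, pathlib.Path("outputs/demo"),
                     filename_prefix="demo", epoch_or_iter=0,
                     grid_arrangement="individual")   # one JPG per sample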
272
+ # ==== Data and Utility Functions ====
273
+
274
+ def get_data(args, dist):
275
+ """
276
+ Get data loader using dummy dataset for open source release.
277
+
278
+ Args:
279
+ args: Training arguments
280
+ dist: Distributed training context
281
+
282
+ Returns:
283
+ Data loader with dummy synthetic data
284
+ """
285
+ try:
286
+ from dataset import create_dummy_dataloader
287
+ except ImportError:
288
+ raise ImportError("dataset.py not found or missing create_dummy_dataloader function")
289
+
290
+ local_batch_size = args.batch_size // dist.world_size // getattr(args, "acc", 1)
291
+
292
+ # Determine multiple based on VAE type
293
+ if "Wan2.2" in args.vae:
294
+ multiple = 16
295
+ else:
296
+ multiple = 8
297
+
298
+ # Calculate number of samples per rank
299
+ total_samples = getattr(args, 'epoch_length', 50000) # Default to 50k samples
300
+ samples_per_rank = total_samples // dist.world_size if dist.world_size > 0 else total_samples
301
+
302
+ # Create primary dataloader
303
+ data_loader = create_dummy_dataloader(
304
+ dataset_name=args.dataset,
305
+ img_size=args.img_size,
306
+ vid_size=getattr(args, 'vid_size', None),
307
+ batch_size=local_batch_size,
308
+ use_mixed_aspect=getattr(args, 'mix_aspect', False),
309
+ multiple=multiple * args.patch_size,
310
+ num_samples=samples_per_rank,
311
+ infinite=False
312
+ )
313
+
314
+ # Create secondary dataloader if specified
315
+ if getattr(args, 'secondary_dataset', None) is not None:
316
+ secondary_samples = getattr(args, 'secondary_epoch_length', total_samples // 4)
317
+ secondary_samples_per_rank = secondary_samples // dist.world_size if dist.world_size > 0 else secondary_samples
318
+
319
+ data_loader.secondary_loader = create_dummy_dataloader(
320
+ dataset_name=args.secondary_dataset,
321
+ img_size=getattr(args, 'secondary_img_size', args.img_size),
322
+ vid_size=getattr(args, 'secondary_vid_size', None),
323
+ batch_size=getattr(args, 'secondary_batch_size', local_batch_size),
324
+ use_mixed_aspect=getattr(args, 'mix_aspect', False),
325
+ multiple=multiple * args.patch_size,
326
+ num_samples=secondary_samples_per_rank,
327
+ infinite=True # Secondary loader is typically infinite
328
+ )
329
+
330
+ return data_loader
331
+
332
+
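A worked example of the per-rank batch-size arithmetic in get_data above: with a global batch of 256, 8 ranks, and 4 gradient-accumulation steps (args.acc), each rank processes 8 samples per forward pass. The values below are illustrative.

batch_size, world_size, acc = 256, 8, 4
local_batch_size = batch_size // world_size // acc
assert local_batch_size == 8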
333
+ def read_tsv(filename: str):
334
+ """Simple TSV reader for compatibility."""
335
+ with open(filename, 'r', newline='') as tsvfile:
336
+ reader = csv.reader(tsvfile, delimiter='\t')
337
+ return [row for row in reader]
338
+
339
+
340
+ def set_random_seed(seed: int) -> None:
341
+ """Set random seed for reproducibility."""
342
+ random.seed(seed)
343
+ np.random.seed(seed)
344
+ torch.manual_seed(seed)
345
+ torch.cuda.manual_seed(seed)
346
+ torch.cuda.manual_seed_all(seed)
utils/inference.py ADDED
@@ -0,0 +1,277 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Inference utilities for STARFlow.
7
+ """
8
+
9
+ import torch
10
+ import datetime
11
+ from typing import List
12
+ from torchmetrics.image.fid import FrechetInceptionDistance, _compute_fid
13
+ from torchmetrics.image.inception import InceptionScore
14
+ from torchmetrics.multimodal.clip_score import CLIPScore
15
+ from torchmetrics.utilities.data import dim_zero_cat
16
+
17
+ # Import Distributed from training module
18
+ from .training import Distributed
19
+
20
+
21
+ # ==== Metrics ====
22
+
23
+ class FID(FrechetInceptionDistance):
24
+ def __init__(self, feature=2048, reset_real_features=True, normalize=False, input_img_size=..., **kwargs):
25
+ super().__init__(feature, reset_real_features, normalize, input_img_size, **kwargs)
26
+ self.reset_real_features = reset_real_features
27
+
28
+ def add_state(self, name, default, *args, **kwargs):
29
+ self.register_buffer(name, default)
30
+
31
+ def manual_compute(self, dist):
32
+ # manually gather the features
33
+ self.fake_features_num_samples = dist.reduce(self.fake_features_num_samples)
34
+ self.fake_features_sum = dist.reduce(self.fake_features_sum)
35
+ self.fake_features_cov_sum = dist.reduce(self.fake_features_cov_sum)
36
+
37
+ if self.reset_real_features:
38
+ self.real_features_num_samples = dist.reduce(self.real_features_num_samples)
39
+ self.real_features_sum = dist.reduce(self.real_features_sum)
40
+ self.real_features_cov_sum = dist.reduce(self.real_features_cov_sum)
41
+
42
+ print(f'Gathered {self.fake_features_num_samples} samples for FID computation')
43
+
44
+ # compute FID
45
+ mean_real = (self.real_features_sum / self.real_features_num_samples).unsqueeze(0)
46
+ mean_fake = (self.fake_features_sum / self.fake_features_num_samples).unsqueeze(0)
47
+ cov_real_num = self.real_features_cov_sum - self.real_features_num_samples * mean_real.t().mm(mean_real)
48
+ cov_real = cov_real_num / (self.real_features_num_samples - 1)
49
+ cov_fake_num = self.fake_features_cov_sum - self.fake_features_num_samples * mean_fake.t().mm(mean_fake)
50
+ cov_fake = cov_fake_num / (self.fake_features_num_samples - 1)
51
+
52
+ if dist.rank == 0:
53
+ fid_score = _compute_fid(mean_real.squeeze(0), cov_real, mean_fake.squeeze(0), cov_fake).to(
54
+ dtype=self.orig_dtype, device=self.real_features_sum.device)
55
+ print(f'FID: {fid_score.item()} DONE')
56
+ else:
57
+ fid_score = torch.tensor(0.0, dtype=self.orig_dtype, device=self.real_features_sum.device)
58
+ dist.barrier()
59
+
60
+ # reset the state
61
+ self.fake_features_num_samples *= 0
62
+ self.fake_features_sum *= 0
63
+ self.fake_features_cov_sum *= 0
64
+
65
+ if self.reset_real_features:
66
+ self.real_features_num_samples *= 0
67
+ self.real_features_sum *= 0
68
+ self.real_features_cov_sum *= 0
69
+
70
+ return fid_score
71
+
72
+
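A standalone sketch (not from the committed file) of why reducing the running sums above is enough: the feature sum and the sum of outer products are additive across ranks, so the global mean and unbiased covariance can be rebuilt after a single all-reduce.

import torch

x = torch.randn(100, 4)                 # all features, pretend they are split over 4 ranks
chunks = x.chunk(4)
n = sum(c.shape[0] for c in chunks)     # reduced sample count
s = sum(c.sum(0) for c in chunks)       # reduced feature sum
ss = sum(c.t().mm(c) for c in chunks)   # reduced sum of outer products
mean = (s / n).unsqueeze(0)
cov = (ss - n * mean.t().mm(mean)) / (n - 1)
assert torch.allclose(cov, torch.cov(x.t()), atol=1e-4)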
73
+ class IS(InceptionScore):
74
+ def __init__(self, **kwargs):
75
+ super().__init__(**kwargs)
76
+
77
+ def manual_compute(self, dist):
78
+ # manually gather the features
79
+ self.features = dim_zero_cat(self.features)
80
+ features = dist.gather_concat(self.features)
81
+ print(f'Gathered {features.shape[0]} samples for IS computation')
82
+
83
+ if dist.rank == 0:
84
+ idx = torch.randperm(features.shape[0])
85
+ features = features[idx]
86
+
87
+ # calculate probs and logits
88
+ prob = features.softmax(dim=1)
89
+ log_prob = features.log_softmax(dim=1)
90
+
91
+ # split into groups
92
+ prob = prob.chunk(self.splits, dim=0)
93
+ log_prob = log_prob.chunk(self.splits, dim=0)
94
+
95
+ # calculate score per split
96
+ mean_prob = [p.mean(dim=0, keepdim=True) for p in prob]
97
+ kl_ = [p * (log_p - m_p.log()) for p, log_p, m_p in zip(prob, log_prob, mean_prob)]
98
+ kl_ = [k.sum(dim=1).mean().exp() for k in kl_]
99
+ kl = torch.stack(kl_)
100
+
101
+ mean = kl.mean()
102
+ std = kl.std()
103
+
104
+ else:
105
+ mean = torch.tensor(0.0, device=self.features.device)
106
+ std = torch.tensor(0.0, device=self.features.device)
107
+
108
+ dist.barrier()
109
+
110
+ return mean, std
111
+
112
+
113
+ class CLIP(CLIPScore):
114
+ def __init__(self, **kwargs):
115
+ super().__init__(**kwargs)
116
+
117
+ def manual_compute(self, dist):
118
+ # manually gather the features
119
+ self.n_samples = dist.reduce(self.n_samples)
120
+ self.score = dist.reduce(self.score)
121
+
122
+ print(f'Gathered {self.n_samples} samples for CLIP computation')
123
+
124
+ # compute CLIP
125
+ clip_score = torch.max(self.score / self.n_samples, torch.zeros_like(self.score))
126
+ print(f'CLIP: {clip_score.item()} DONE')
127
+ # reset the state
128
+ self.n_samples *= 0
129
+ self.score *= 0
130
+ return clip_score
131
+
132
+
133
+ class Metrics:
134
+ def __init__(self):
135
+ self.metrics: dict[str, list[float]] = {}
136
+
137
+ def update(self, metrics: dict[str, torch.Tensor | float]):
138
+ for k, v in metrics.items():
139
+ if isinstance(v, torch.Tensor):
140
+ v = v.item()
141
+ if k in self.metrics:
142
+ self.metrics[k].append(v)
143
+ else:
144
+ self.metrics[k] = [v]
145
+
146
+ def compute(self, dist: Distributed | None) -> dict[str, float]:
147
+ out: dict[str, float] = {}
148
+ for k, v in self.metrics.items():
149
+ v = sum(v) / len(v)
150
+ if dist is not None:
151
+ v = dist.gather_concat(torch.tensor(v, device='cuda').view(1)).mean().item()
152
+ out[k] = v
153
+ return out
154
+
155
+ @staticmethod
156
+ def print(metrics: dict[str, float], epoch: int):
157
+ print(f'Epoch {epoch} Time {datetime.datetime.now()}')
158
+ print('\n'.join((f'\t{k:40s}: {v: .4g}' for k, v in sorted(metrics.items()))))
159
+
160
+
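A minimal usage sketch for the Metrics accumulator above, assuming utils.inference imports cleanly in the target environment (it pulls in torchmetrics and the FSDP2 APIs via .training); compute(dist=None) takes the single-process path with no all-gather.

from utils.inference import Metrics   # assumed module path

m = Metrics()
m.update({'loss': 0.9, 'bpd': 3.2})
m.update({'loss': 0.7, 'bpd': 3.0})
averaged = m.compute(None)             # averages: loss -> 0.8, bpd -> 3.1
Metrics.print(averaged, epoch=0)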
161
+ # ==== Denoising Functions (from starflow_utils.py) ====
162
+
163
+ def apply_denoising(model, x_chunk: torch.Tensor, y_batch,
164
+ text_encoder, tokenizer, args,
165
+ text_encoder_kwargs: dict, sigma_curr: float, sigma_next: float = 0) -> torch.Tensor:
166
+ """Apply denoising to a chunk of data."""
167
+ from .common import encode_text # Import here to avoid circular imports
168
+
169
+ noise_std_const = 0.3 # a constant used for noise levels.
170
+
171
+ # Handle both encoded tensors and raw captions
172
+ if isinstance(y_batch, torch.Tensor):
173
+ y_ = y_batch
174
+ elif y_batch is not None:
175
+ y_ = encode_text(text_encoder, tokenizer, y_batch, args.txt_size,
176
+ text_encoder.device, **text_encoder_kwargs)
177
+ else:
178
+ y_ = None
179
+
180
+ if getattr(args, 'disable_learnable_denoiser', False) or not hasattr(model, 'learnable_self_denoiser'):
181
+ return self_denoise(
182
+ model, x_chunk, y_,
183
+ noise_std=sigma_curr,
184
+ steps=1,
185
+ disable_learnable_denoiser=getattr(args, 'disable_learnable_denoiser', False)
186
+ )
187
+ else:
188
+ # Learnable denoiser
189
+ if sigma_curr is not None and isinstance(y_batch, (list, type(None))):
190
+ text_encoder_kwargs['noise_std'] = sigma_curr
191
+ denoiser_output = model(x_chunk, y_, denoiser=True)
192
+ return x_chunk - denoiser_output * noise_std_const * (sigma_curr - sigma_next) / sigma_curr
193
+
194
+
195
+ def self_denoise(model, samples, y, noise_std=0.1, lr=1, steps=1, disable_learnable_denoiser=False):
196
+ """Self-denoising function - same as in train.py"""
197
+ if steps == 0:
198
+ return samples
199
+
200
+ outputs = []
201
+ x = samples.clone()
202
+ lr = noise_std ** 2 * lr
203
+ with torch.enable_grad():
204
+ x.requires_grad = True
205
+ model.train()
206
+ z, _, _, logdets = model(x, y)
207
+ loss = model.get_loss(z, logdets)['loss'] * 65536
208
+ grad = float(samples.numel()) / 65536 * torch.autograd.grad(loss, [x])[0]
209
+ outputs += [(x - grad * lr).detach()]
210
+ x = torch.cat(outputs, -1)
211
+ return x
212
+
213
+
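Unpacking the constants in self_denoise above (a reading, assuming model.get_loss returns a per-dimension average NLL): the 65536 applied to the loss is divided back out of the gradient, so it only acts as loss scaling for the backward pass, and with lr = noise_std**2 the update is

    x_denoised = x - noise_std**2 * grad_x(total NLL) = x + noise_std**2 * grad_x(log p(x)),

i.e. a single Tweedie/score-style denoising step on the noisy sample.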
214
+ def process_denoising(samples: torch.Tensor, y: List[str], args,
215
+ model, text_encoder, tokenizer, text_encoder_kwargs: dict,
216
+ noise_std: float) -> torch.Tensor:
217
+ """Process samples through denoising if enabled."""
218
+ if not (args.finetuned_vae == 'none' and
219
+ getattr(args, 'vae_adapter', None) is None and
220
+ getattr(args, 'return_sequence', 0) == 0):
221
+ # Denoising not enabled or not applicable
222
+ return samples
223
+
224
+ torch.cuda.empty_cache()
225
+ assert isinstance(samples, torch.Tensor)
226
+ samples = samples.cpu()
227
+
228
+ # Use smaller batch size for training to avoid memory issues
229
+ b = samples.size(0)
230
+ db = min(getattr(args, 'denoising_batch_size', 1), b)
231
+ denoised_samples = []
232
+ is_video = samples.dim() == 5
233
+
234
+ for j in range((b + db - 1) // db): # ceil division so a trailing partial batch is also denoised
235
+ x_all = torch.clone(samples[j * db : (j + 1) * db]).detach().cuda()
236
+ y_batch = y[j * db : (j + 1) * db] if y is not None else None
237
+
238
+ if is_video:
239
+ # Chunk-wise denoising for videos
240
+ s_idx, overlap = 0, 0
241
+ steps = x_all.size(1) if getattr(args, 'local_attn_window', None) is None else args.local_attn_window
242
+
243
+ while s_idx < x_all.size(1):
244
+ x_chunk = x_all[:, s_idx : s_idx + steps].detach().clone()
245
+ x_denoised = apply_denoising(
246
+ model, x_chunk, y_batch, text_encoder, tokenizer,
247
+ args, text_encoder_kwargs, noise_std
248
+ )
249
+ x_all[:, s_idx + overlap: s_idx + steps] = x_denoised[:, overlap:]
250
+ overlap = steps - 1 if getattr(args, 'denoiser_window', None) is None else args.denoiser_window
251
+ s_idx += steps - overlap
252
+ else:
253
+ # Process entire batch for images
254
+ x_all = apply_denoising(
255
+ model, x_all, y_batch, text_encoder, tokenizer,
256
+ args, text_encoder_kwargs, noise_std
257
+ )
258
+
259
+ torch.cuda.empty_cache()
260
+ denoised_samples.append(x_all.detach().cpu())
261
+
262
+ return torch.cat(denoised_samples, dim=0).cuda()
263
+
264
+
265
+ def simple_denoising(model, samples: torch.Tensor, y_encoded,
266
+ text_encoder, tokenizer, args, noise_std: float) -> torch.Tensor:
267
+ """Simplified denoising for training - reuses apply_denoising without chunking."""
268
+ if args.finetuned_vae != 'none' and args.finetuned_vae is not None:
269
+ return samples
270
+
271
+ # Reuse apply_denoising - it now handles both encoded tensors and raw captions
272
+ text_encoder_kwargs = {}
273
+ return apply_denoising(
274
+ model, samples, y_encoded, text_encoder, tokenizer,
275
+ args, text_encoder_kwargs, noise_std, sigma_next=0
276
+ )
277
+
utils/model_setup.py ADDED
@@ -0,0 +1,405 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Model setup utilities for STARFlow.
7
+ Includes: transformer setup, VAE setup, text encoders.
8
+ """
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ import pathlib
14
+ import os
15
+ import numpy as np
16
+ from collections import OrderedDict
17
+ from typing import Optional, Tuple, Union
18
+ from einops import rearrange
19
+
20
+ from transformer_flow import pre_model_configs, Model
21
+ from diffusers.models import AutoencoderKL, AutoencoderKLWan
22
+ from diffusers import DiTPipeline
23
+ from misc.wan_vae2 import video_vae2 as AutoencoderKLWan2
24
+ from transformers import AutoTokenizer, AutoModel, AutoConfig, T5Tokenizer, T5EncoderModel
25
+
26
+
27
+ # ==== Model Setup Functions ====
28
+
29
+ def setup_transformer(args, dist, **other_kwargs):
30
+ """Setup transformer model with given arguments."""
31
+ common_kwargs = dict(
32
+ in_channels=args.channel_size,
33
+ img_size=args.img_size,
34
+ txt_size=args.txt_size,
35
+ sos=args.sos, # sos_token
36
+ cond_top_only=args.cond_top_only,
37
+ use_softplus=args.use_softplus,
38
+ use_pretrained_lm=args.use_pretrained_lm,
39
+ use_mm_attn=args.use_mm_attn,
40
+ use_final_norm=args.use_final_norm,
41
+ soft_clip=args.soft_clip,
42
+ seq_order=args.seq_order,
43
+ learnable_self_denoiser=args.learnable_self_denoiser,
44
+ conditional_denoiser=args.conditional_denoiser,
45
+ noise_embed_denoiser=args.noise_embed_denoiser,
46
+ temporal_causal=args.temporal_causal,
47
+ shallow_block_local=args.shallow_block_local,
48
+ denoiser_window=args.denoiser_window,
49
+ local_attn_window=args.local_attn_window,
50
+ top_block_channels=getattr(args, 'top_block_channels', None),
51
+ )
52
+ common_kwargs.update(other_kwargs)
53
+
54
+ if getattr(args, "model_type", None) is not None:
55
+ model = pre_model_configs[args.model_type](**common_kwargs)
56
+ else:
57
+ # generic model initialization
58
+ model = Model(
59
+ patch_size=args.patch_size,
60
+ channels=args.channels,
61
+ num_blocks=args.blocks if len(args.layers_per_block) == 1 else len(args.layers_per_block),
62
+ layers_per_block=args.layers_per_block,
63
+ rope=args.rope,
64
+ pt_seq_len=args.pt_seq_len,
65
+ head_dim=args.head_dim,
66
+ num_heads=args.num_heads,
67
+ num_kv_heads=args.num_kv_heads,
68
+ use_swiglu=args.use_swiglu,
69
+ use_bias=args.use_bias,
70
+ use_qk_norm=args.use_qk_norm,
71
+ use_post_norm=args.use_post_norm,
72
+ norm_type=args.norm_type,
73
+ **common_kwargs)
74
+
75
+ if args.use_pretrained_lm: # Note: pretrained model download removed
76
+ model_name = args.use_pretrained_lm
77
+ assert model_name in ['gemma3_4b', 'gemma2_2b', 'gemma3_1b'], f'{model_name} not supported'
78
+
79
+ # Note: Pretrained LM weights are no longer automatically downloaded
80
+ # Users should provide their own pretrained weights if needed
81
+ local_path = pathlib.Path(args.logdir) / model_name / 'gemma_meta_block.pth'
82
+ if local_path.exists():
83
+ model.blocks[-1].load_state_dict(torch.load(local_path, map_location='cpu'), strict=False)
84
+ print(f'Load top block with pretrained LLM weights from {model_name}')
85
+ else:
86
+ print(f"Warning: Pretrained LM weights for {model_name} not found at {local_path}")
87
+ print("Please provide pretrained weights manually or disable use_pretrained_lm")
88
+
89
+ return model
90
+
91
+
92
+ class VAE(nn.Module):
93
+ def __init__(self, model_name, dist, adapter=None):
94
+ super().__init__()
95
+ self.model_name = model_name
96
+ self.video_vae = False
97
+ self.dist = dist
98
+ model_name, extra = model_name.split(':') if ':' in model_name else (model_name, None)
99
+
100
+ if 'Wan-AI/Wan2.1' in model_name:
101
+ self.vae = AutoencoderKLWan.from_pretrained(model_name, subfolder="vae", torch_dtype=torch.bfloat16)
102
+ self.latents_std = self.vae.config.latents_std
103
+ self.latents_mean = self.vae.config.latents_mean
104
+ self.downsample_factor = 2 ** (len(self.vae.config.dim_mult) - 1)
105
+ self.temporal_downsample_factor = 2 ** sum(self.vae.config.temperal_downsample)
106
+ self.video_vae = True # this is a Video VAE
107
+
108
+ elif 'Wan-AI/Wan2.2' in model_name:
109
+ filename = "/tmp/Wan2.2_VAE.pth" # Use local temp path, download if not exists. WAN2.2 has no diffusers
110
+ if not os.path.exists(filename):
111
+ if dist.local_rank == 0:
112
+ print("Downloading Wan2.2 VAE weights...")
113
+ os.system(f"wget https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B/resolve/main/Wan2.2_VAE.pth -O {filename}")
114
+ dist.barrier() # Ensure only one process downloads
115
+
116
+ self.vae = AutoencoderKLWan2(pretrained_path=filename)
117
+ self.downsample_factor = 16
118
+ self.video_vae = True
119
+ self.latents_std = self.vae.std
120
+ self.latents_mean = self.vae.mean
121
+ self.temporal_downsample_factor = 4
122
+ self.temporal_scale = float(extra) if extra is not None else 1
123
+
124
+ else:
125
+ if 'sd-vae' in model_name or 'sdxl-vae' in model_name:
126
+ self.vae = AutoencoderKL.from_pretrained(model_name)
127
+ self.scaling_factor = self.vae.config.scaling_factor
128
+ else:
129
+ self.vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae", torch_dtype=torch.bfloat16)
130
+ self.scaling_factor = self.vae.config.scaling_factor
131
+ self.downsample_factor = 2 ** (len(self.vae.config.down_block_types) - 1)
132
+ self.temporal_downsample_factor = 1 # this is an Image VAE, no temporal downsample
133
+
134
+ # self.vae.load_state_dict(self.vae.state_dict(), strict=False) # what is this?
135
+ self.use_adapter = adapter is not None
136
+ if self.use_adapter: # adapter is dit #
137
+ self.dit_pipe = DiTPipeline.from_pretrained(adapter, torch_dtype=torch.bfloat16)
138
+
139
+ def to(self, device):
140
+ if self.use_adapter:
141
+ self.dit_pipe.to(device)
142
+ return super().to(device)
143
+
144
+ def _encode(self, x):
145
+ return self.vae.encode(x)
146
+
147
+ def _decode(self, z):
148
+ return self.vae.decode(z)
149
+
150
+ def encode(self, x):
151
+ if self.video_vae: # video VAE
152
+ if 'Wan-AI/Wan2.2' in self.model_name:
153
+ if x.dim() == 5:
154
+ z = rearrange(self.vae.sample(rearrange(x, 'b t c h w -> b c t h w'), self.vae.scale), 'b c t h w -> b t c h w')
155
+ if self.temporal_scale != 1:
156
+ z[:, 1:] = z[:, 1:] * self.temporal_scale # scale the temporal latent
157
+ else:
158
+ z = rearrange(self.vae.sample(rearrange(x, 'b c h w -> b c 1 h w'), self.vae.scale), 'b c 1 h w -> b c h w')
159
+ else:
160
+ if x.dim() == 5:
161
+ z = rearrange(self._encode(rearrange(x, 'b t c h w -> b c t h w')).latent_dist.sample(), 'b c t h w -> b t c h w')
162
+ else:
163
+ z = rearrange(self._encode(rearrange(x, 'b c h w -> b c 1 h w')).latent_dist.sample(), 'b c 1 h w -> b c h w')
164
+ shape = [1, 1, -1, 1, 1] if z.dim() == 5 else [1, -1, 1, 1]
165
+
166
+ scale, shift = torch.tensor(self.latents_std, device=x.device).view(*shape), torch.tensor(self.latents_mean, device=x.device).view(*shape)
167
+ z = (z - shift) / scale
168
+ else: # image VAE
169
+ if x.dim() == 5:
170
+ z = rearrange(self._encode(rearrange(x, 'b t c h w -> (b t) c h w')).latent_dist.sample(), '(b t) c h w -> b t c h w', t=x.shape[1])
171
+ else:
172
+ z = self._encode(x).latent_dist.sample()
173
+ z = z * self.scaling_factor
174
+ return z
175
+
176
+ def decode(self, z, total_steps=100, noise_std=0.3):
177
+ if self.use_adapter:
178
+ z = self.adapter_denoise(z, total_steps, noise_std)
179
+
180
+ if self.video_vae: # video VAE
181
+ if 'Wan-AI/Wan2.2' in self.model_name:
182
+ if z.dim() == 5:
183
+ if self.temporal_scale != 1:
184
+ z = z.clone()
185
+ z[:, 1:] = z[:, 1:] / self.temporal_scale
186
+ x = rearrange(self.vae.decode(rearrange(z, 'b t c h w -> b c t h w'), self.vae.scale), 'b c t h w -> b t c h w')
187
+ else:
188
+ x = rearrange(self.vae.decode(rearrange(z, 'b c h w -> b c 1 h w'), self.vae.scale), 'b c 1 h w -> b c h w')
189
+ else:
190
+ shape = [1, 1, -1, 1, 1] if z.dim() == 5 else [1, -1, 1, 1]
191
+ scale = torch.tensor(self.latents_std, device=z.device).view(*shape)
192
+ shift = torch.tensor(self.latents_mean, device=z.device).view(*shape)
193
+ z = z * scale + shift
194
+ if z.dim() == 5:
195
+ x = rearrange(self._decode(rearrange(z, 'b t c h w -> b c t h w')).sample, 'b c t h w -> b t c h w')
196
+ else:
197
+ x = rearrange(self._decode(rearrange(z, 'b c h w -> b c 1 h w')).sample, 'b c 1 h w -> b c h w')
198
+ else:
199
+ z = z / self.scaling_factor
200
+ if z.dim() == 5: # (b, t, c, h, w)
201
+ x = rearrange(self._decode(rearrange(z, 'b t c h w -> (b t) c h w')).sample, '(b t) c h w -> b t c h w', t=z.shape[1])
202
+ else:
203
+ x = self._decode(z).sample
204
+ return x
205
+
206
+ @torch.no_grad()
207
+ def adapter_denoise(self, z, total_steps=100, noise_std=0.3):
208
+ self.dit_pipe.scheduler.set_timesteps(total_steps)
209
+ timesteps = self.dit_pipe.scheduler.timesteps
210
+ one = torch.ones(z.shape[0], device=z.device)
211
+ target_alpha2 = 1 / (1 + noise_std ** 2)
212
+ target_t = (torch.abs(self.dit_pipe.scheduler.alphas_cumprod - target_alpha2)).argmin().item()
213
+ z = z * np.sqrt(target_alpha2) # normalize the latent
214
+ for it in range(len(timesteps)):
215
+ if timesteps[it] > target_t: continue
216
+ noise_pred = self.dit_pipe.transformer(z, one * timesteps[it], class_labels=one.long() * 1000).sample
217
+ model_output = torch.split(noise_pred, self.dit_pipe.transformer.config.in_channels, dim=1)[0]
218
+ z = self.dit_pipe.scheduler.step(model_output, timesteps[it], z).prev_sample
219
+ return z
220
+
221
+
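A standalone sketch (not from the committed file) of the per-channel latent normalization used by encode/decode above for the video VAEs: encode applies (z - mean) / std with the statistics broadcast over [1, 1, -1, 1, 1], and decode inverts it exactly.

import torch

z = torch.randn(2, 4, 16, 8, 8)                     # (b, t, c, h, w) latents, 16 channels
mean, std = torch.randn(16), torch.rand(16) + 0.5   # per-channel statistics
shape = [1, 1, -1, 1, 1]
z_norm = (z - mean.view(*shape)) / std.view(*shape)
z_back = z_norm * std.view(*shape) + mean.view(*shape)
assert torch.allclose(z, z_back, atol=1e-5)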
222
+ def setup_vae(args, dist, device='cuda'):
223
+ """Setup VAE model with given arguments."""
224
+ print(f'Loading VAE {args.vae}...')
225
+ # setup VAE
226
+ vae = VAE(args.vae, dist=dist, adapter=getattr(args, "vae_adapter", None)).to(device)
227
+
228
+ # (optional) load pretrained VAE
229
+ if getattr(args, "finetuned_vae", None) is not None and args.finetuned_vae != 'none':
230
+ vae_task_id = args.finetuned_vae
231
+ local_folder = args.logdir / 'vae'
232
+ local_folder.mkdir(parents=True, exist_ok=True)
233
+
234
+ # Try to load from local path first
235
+ if vae_task_id == "px82zaheuu":
236
+ local_path = local_folder / "pytorch_model.bin"
237
+ if local_path.exists():
238
+ finetuned_vae_state = torch.load(local_path, map_location="cpu", weights_only=False)
239
+ renamed_state = OrderedDict()
240
+ for key in finetuned_vae_state:
241
+ new_key = key.replace("encoder.0", "encoder").replace("encoder.1", "quant_conv").replace("decoder.0", "post_quant_conv").replace("decoder.1", "decoder")
242
+ renamed_state[new_key] = finetuned_vae_state[key]
243
+ vae.vae.load_state_dict(renamed_state)
244
+ print(f'Loaded finetuned VAE {vae_task_id}')
245
+ else:
246
+ print(f"Warning: Finetuned VAE weights for {vae_task_id} not found at {local_path}")
247
+ print("Please provide finetuned VAE weights manually or set finetuned_vae to 'none'")
248
+ else:
249
+ # Try to load general task weights
250
+ local_path = local_folder / f"{vae_task_id}.pth"
251
+ if local_path.exists():
252
+ vae.load_state_dict(torch.load(local_path, map_location='cpu', weights_only=False))
253
+ print(f'Loaded finetuned VAE {vae_task_id}')
254
+ else:
255
+ print(f"Warning: Finetuned VAE weights for {vae_task_id} not found at {local_path}")
256
+ print("Please provide finetuned VAE weights manually or set finetuned_vae to 'none'")
257
+
258
+ return vae
259
+
260
+
261
+ # ==== Text Encoder Classes and Setup ====
262
+
263
+ class LookupTableTokenizer:
264
+ """Simple lookup table tokenizer for label-based datasets."""
265
+
266
+ def __init__(self, vocab_file):
267
+ from .common import read_tsv
268
+ self.vocab = {l[0]: i for i, l in enumerate(read_tsv(f'configs/dataset/{vocab_file}'))}
269
+ self.empty_id = len(self.vocab)
270
+
271
+ def __len__(self):
272
+ return len(self.vocab)
273
+
274
+ def __call__(self, text):
275
+ return {'input_ids': torch.tensor([[self.vocab.get(t, self.empty_id)] for t in text], dtype=torch.long)}
276
+
277
+
278
+ class LabelEmbdder(nn.Module):
279
+ """Simple label embedder for classification-style conditioning."""
280
+
281
+ def __init__(self, num_classes):
282
+ super().__init__()
283
+ self.num_classes = num_classes
284
+ self.config = type('Config', (), {'hidden_size': num_classes + 1})()
285
+ self.Embedding = nn.Parameter(torch.eye(num_classes+1), requires_grad=False)
286
+
287
+ def forward(self, y):
288
+ return F.embedding(y, self.Embedding)
289
+
290
+
291
+ class TextEmbedder(nn.Module):
292
+ """Text embedder for large language models like Gemma."""
293
+
294
+ def __init__(self, config):
295
+ super().__init__()
296
+ if hasattr(config, "text_config"): # Gemma3
297
+ self.config = config.text_config
298
+ self.vocab_size = config.image_token_index
299
+ else:
300
+ self.config = config
301
+ self.vocab_size = config.vocab_size
302
+ self.text_token_embedder = nn.Embedding(
303
+ self.vocab_size, self.config.hidden_size)
304
+ self.text_token_embedder.weight.requires_grad = False
305
+ self.normalizer = float(self.config.hidden_size) ** 0.5
306
+
307
+ def forward(self, x):
308
+ x = self.text_token_embedder(x)
309
+ return (x * self.normalizer).to(x.dtype)
310
+
311
+ @torch.no_grad()
312
+ def sample(
313
+ self,
314
+ hidden_states: torch.Tensor,
315
+ temperatures: Union[float, None] = 1.0,
316
+ top_ps: float = 0.95,
317
+ top_ks: int = 64,
318
+ embedding_bias: Optional[torch.Tensor] = None,
319
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
320
+
321
+ device = hidden_states.device
322
+ batch_size = hidden_states.shape[0]
323
+ temperatures = None if not temperatures else torch.FloatTensor(
324
+ [temperatures] * batch_size).to(device)
325
+ top_ps = torch.FloatTensor([top_ps] * batch_size).to(device)
326
+ top_ks = torch.LongTensor([top_ks] * batch_size).to(device)
327
+
328
+ # Select the last element for each sequence.
329
+ hidden_states = hidden_states[:, -1]
330
+ embedding = self.text_token_embedder.weight
331
+ logits = torch.matmul(hidden_states, embedding.t())
332
+ if embedding_bias is not None:
333
+ logits += embedding_bias
334
+
335
+ if hasattr(self.config, 'final_logit_softcapping') and self.config.final_logit_softcapping is not None:
336
+ logits = logits / self.config.final_logit_softcapping
337
+ logits = torch.tanh(logits)
338
+ logits = logits * self.config.final_logit_softcapping
339
+
340
+ if temperatures is None:
341
+ return torch.argmax(logits, dim=-1).squeeze(dim=-1), logits
342
+
343
+ # Apply temperature scaling.
344
+ logits.div_(temperatures.unsqueeze(dim=1))
345
+
346
+ # Apply top-k and top-p filtering (simplified version)
347
+ probs = F.softmax(logits, dim=-1)
348
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(dim=-1)
349
+
350
+ return next_tokens, logits
351
+
352
+
353
+ def setup_encoder(args, dist, device='cuda'):
354
+ """Setup text encoder based on arguments."""
355
+ assert args.txt_size > 0, 'txt_size must be set'
356
+ print(f'Loading text encoder {args.text}...')
357
+
358
+ if args.text.endswith('.vocab'): # caption -> label
359
+ tokenizer = LookupTableTokenizer(args.text)
360
+ text_encoder = LabelEmbdder(len(tokenizer)).to(device)
361
+ block_name = 'Embedding'
362
+
363
+ elif args.text == 't5xxl':
364
+ tokenizer = T5Tokenizer.from_pretrained("THUDM/CogView3-Plus-3B", subfolder="tokenizer")
365
+ text_encoder = T5EncoderModel.from_pretrained("THUDM/CogView3-Plus-3B",
366
+ subfolder="text_encoder", torch_dtype=torch.bfloat16).to(device)
367
+ block_name = 'T5Block'
368
+
369
+ elif args.text == 't5xl' or args.text.startswith('google'):
370
+ tokenizer = AutoTokenizer.from_pretrained(args.text)
371
+ text_encoder = AutoModel.from_pretrained(args.text, add_cross_attention=False).encoder.to(device)
372
+ block_name = 'T5Block'
373
+
374
+ elif args.text == "gemma" or args.text.startswith("Alpha-VLLM"):
375
+ tokenizer = AutoTokenizer.from_pretrained(args.text, subfolder="tokenizer")
376
+ text_encoder = AutoModel.from_pretrained(args.text, subfolder="text_encoder", torch_dtype=torch.bfloat16).to(device)
377
+ block_name = 'GemmaDecoderLayer'
378
+
379
+ elif args.text in ["gemma3_4b", "gemma3_1b", "gemma2_2b"]: # NOTE: special text embedder
380
+ model_name = args.text
381
+ repo_name = {"gemma3_4b": "google/gemma-3-4b-it",
382
+ "gemma3_1b": "google/gemma-3-1b-it",
383
+ "gemma2_2b": "google/gemma-2-2b-it"}[model_name]
384
+ tokenizer = AutoTokenizer.from_pretrained(repo_name)
385
+ config = AutoConfig.from_pretrained(repo_name)
386
+
387
+ text_encoder = TextEmbedder(config).to(device)
388
+ block_name = "Embedding"
389
+
390
+ # Try to load embedding layer
391
+ local_path = pathlib.Path(args.logdir) / model_name
392
+ local_path.mkdir(parents=True, exist_ok=True)
393
+ local_path = local_path / 'gemma_text_embed.pth'
394
+ if local_path.exists():
395
+ text_encoder.load_state_dict(torch.load(local_path, map_location='cpu'))
396
+ print(f'Loaded text encoder weights for {model_name}')
397
+ else:
398
+ print(f"Warning: Text encoder weights for {model_name} not found at {local_path}")
399
+ print("Please provide text encoder weights manually or use a different text encoder")
400
+
401
+ else:
402
+ raise NotImplementedError(f'Unknown text encoder {args.text}')
403
+
404
+ text_encoder.base_block_name = block_name
405
+ return tokenizer, text_encoder
utils/training.py ADDED
@@ -0,0 +1,232 @@
1
+ #
2
+ # For licensing see accompanying LICENSE file.
3
+ # Copyright (C) 2025 Apple Inc. All Rights Reserved.
4
+ #
5
+ """
6
+ Training utilities for STARFlow.
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.distributed
13
+ import torch.distributed.checkpoint as dcp
14
+ from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy, CPUOffloadPolicy
15
+ from torch.distributed._tensor import DeviceMesh
16
+ from torch.distributed.device_mesh import init_device_mesh
17
+ import datetime
18
+ import math
19
+ import os
20
+ import random
21
+ import numpy as np
22
+ import contextlib
23
+ import typing as t
24
+ from typing import Any, Dict, List, Union, Optional
25
+ from collections import defaultdict, OrderedDict
26
+ from fnmatch import fnmatch
27
+
28
+
29
+ # ==== Learning Rate Schedule ====
30
+
31
+ class CosineLRSchedule(torch.nn.Module):
32
+ counter: torch.Tensor
33
+
34
+ def __init__(self, optimizer, warmup_steps: int, total_steps: int, min_lr: float, max_lr: float):
35
+ super().__init__()
36
+ self.register_buffer('counter', torch.zeros(()))
37
+ self.warmup_steps = warmup_steps
38
+ self.total_steps = total_steps
39
+ self.optimizer = optimizer
40
+ self.min_lr = min_lr
41
+ self.start_lr = min(min_lr, 1e-6)
42
+ self.max_lr = max_lr
43
+ self.set_lr(min_lr)
44
+
45
+ def set_lr(self, lr: float) -> float:
46
+ if self.min_lr <= lr <= self.max_lr:
47
+ for pg in self.optimizer.param_groups:
48
+ pg['lr'] = lr
49
+ return self.optimizer.param_groups[0]['lr'] # report the lr actually in effect on the optimizer
50
+
51
+ def step(self) -> float:
52
+ with torch.no_grad():
53
+ counter = self.counter.add_(1).item()
54
+ if self.counter <= self.warmup_steps:
55
+ new_lr = self.start_lr + counter / self.warmup_steps * (self.max_lr - self.start_lr)
56
+ return self.set_lr(new_lr)
57
+
58
+ t = (counter - self.warmup_steps) / (self.total_steps - self.warmup_steps)
59
+ new_lr = self.min_lr + 0.5 * (1 + math.cos(math.pi * t)) * (self.max_lr - self.min_lr)
60
+ return self.set_lr(new_lr)
61
+
62
+
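A minimal usage sketch for CosineLRSchedule, assuming utils.training imports cleanly (it requires a recent torch build with the FSDP2 composable APIs): the schedule warms up linearly to max_lr over warmup_steps, then follows a half-cosine down to min_lr, with step() called once per optimizer step.

import torch
from utils.training import CosineLRSchedule   # assumed module path

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
sched = CosineLRSchedule(opt, warmup_steps=10, total_steps=100, min_lr=1e-5, max_lr=1e-3)
lrs = []
for _ in range(100):
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])     # rises for 10 steps, then decays toward min_lr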
63
+ # ==== Distributed Training ====
64
+
65
+ class Distributed:
66
+ timeout: float = 72000
67
+
68
+ def __init__(self):
69
+ if os.environ.get('MASTER_PORT'): # When running with torchrun
70
+ self.rank = int(os.environ['RANK'])
71
+ self.local_rank = int(os.environ['LOCAL_RANK'])
72
+ self.world_size = int(os.environ['WORLD_SIZE'])
73
+ self.distributed = True
74
+ torch.distributed.init_process_group(
75
+ backend='nccl',
76
+ init_method='env://',
77
+ world_size=self.world_size,
78
+ timeout=datetime.timedelta(seconds=self.timeout),
79
+ rank=self.rank,
80
+ )
81
+ else: # When running with python for debugging
82
+ self.rank, self.local_rank, self.world_size = 0, 0, 1
83
+ self.distributed = False
84
+ torch.cuda.set_device(self.local_rank)
85
+ self.barrier()
86
+
87
+ def barrier(self) -> None:
88
+ if self.distributed:
89
+ torch.distributed.barrier()
90
+
91
+ def gather_concat(self, x: torch.Tensor) -> torch.Tensor:
92
+ if not self.distributed:
93
+ return x
94
+ x_list = [torch.empty_like(x) for _ in range(self.world_size)]
95
+ torch.distributed.all_gather(x_list, x)
96
+ return torch.cat(x_list)
97
+
98
+ def reduce(self, x):
99
+ if not self.distributed:
100
+ return x
101
+ torch.distributed.all_reduce(x, op=torch.distributed.ReduceOp.SUM)
102
+ return x
103
+
104
+ def __del__(self):
105
+ if self.distributed:
106
+ torch.distributed.destroy_process_group()
107
+
108
+
109
+ def get_local_rank() -> int:
110
+ if os.environ.get('MASTER_PORT'): # When running with torchrun
111
+ return int(os.environ['LOCAL_RANK'])
112
+ return 0
113
+
114
+
115
+ def get_device_mesh(dp_size: int, tp_size: int = 1) -> DeviceMesh:
116
+ """Create DeviceMesh based on tensor and data parallelism configuration."""
117
+ # by default, I will use TP=1 for simplicity
118
+ mesh_shape = (dp_size, tp_size)
119
+ names = ("dp", "tp")
120
+ return init_device_mesh("cuda", mesh_shape=mesh_shape, mesh_dim_names=names)
121
+
122
+
123
+ def wrap_matching_layers(
124
+ model: nn.Module,
125
+ layer_patterns: t.List[str],
126
+ wrapper_fn: t.Callable[[nn.Module], nn.Module],
127
+ ):
128
+ """
129
+ Recursively wraps submodules in the order they appear in layer_patterns.
130
+ For each pattern (in order), we do a pass over the model and wrap matches.
131
+ """
132
+ def _wrap_single_pattern(mod: nn.Module, pattern: str):
133
+ """
134
+ Recurse over mod, wrapping submodules that match `pattern`.
135
+ We do a post-order traversal so children get wrapped before the parent.
136
+ """
137
+ for child_name, child_module in list(mod.named_children()):
138
+ # Wrap grandchildren first.
139
+ _wrap_single_pattern(child_module, pattern)
140
+
141
+ # Check if the child's class name matches the pattern.
142
+ if fnmatch(child_module.__class__.__name__, pattern):
143
+ # Replace the child in the parent.
144
+ wrapped = wrapper_fn(child_module)
145
+ setattr(mod, child_name, wrapped)
146
+
147
+ # We do a pass for each pattern in order
148
+ for pattern in layer_patterns:
149
+ _wrap_single_pattern(model, pattern)
150
+
151
+
152
+ def parallelize_model(args, model: nn.Module, dist: Distributed, device='cuda', block_names=['AttentionBlock']) -> nn.Module:
153
+ if not getattr(args, "fsdp", False): # use standard DDP
154
+ model = model.to(device=device)
155
+ if dist.distributed:
156
+ print(f"Using DDP")
157
+ model_ddp = torch.nn.parallel.DistributedDataParallel(model, device_ids=[dist.local_rank])
158
+ else:
159
+ model_ddp = model # compatible with DDP
160
+ return model, model_ddp
161
+
162
+ # Instantiate mixed precision policy from config
163
+ mp_policy = MixedPrecisionPolicy(
164
+ param_dtype=torch.bfloat16,
165
+ reduce_dtype=torch.bfloat16,
166
+ output_dtype=torch.bfloat16,
167
+ cast_forward_inputs=True
168
+ )
169
+ print(f"Using FSDP2 with: {mp_policy}")
170
+
171
+ # Apply FSDP wrapping based on specified parallel dimensions
172
+ dp_mesh = get_device_mesh(dist.world_size)["dp"]
173
+
174
+ # Configure core FSDP parameters
175
+ fsdp_config = {"mp_policy": mp_policy, "mesh": dp_mesh, "reshard_after_forward": True}
176
+
177
+ # Wrap specified layer patterns with FSDP
178
+ wrap_matching_layers(model, block_names, lambda m: fully_shard(m, **fsdp_config))
179
+
180
+ # Then wrap full model (remaining modules are captured with this)
181
+ model = fully_shard(model, **fsdp_config)
182
+ model = model.to(device=device)
183
+ return model, model # for compatibility with DDP
184
+
185
+
186
+ def save_model(args, dist, model, model_ckpt_file):
187
+ states = model.state_dict()
188
+ if not getattr(args, "fsdp", False): # save DDP checkpoints
189
+ if dist.local_rank == 0:
190
+ torch.save(states, model_ckpt_file)
191
+ else: # save FSDP checkpoints
192
+ dcp.save(states, checkpoint_id=str(model_ckpt_file))
193
+
194
+
195
+ def save_optimizer(args, dist, optimizer, lr_schedule, opt_ckpt_file):
196
+ optim_states, lr_states = optimizer.state_dict(), lr_schedule.state_dict()
197
+ if not getattr(args, "fsdp", False): # save DDP checkpoints
198
+ if dist.local_rank == 0:
199
+ torch.save({"optimizer": optim_states, "lr_schedule": lr_states}, opt_ckpt_file)
200
+ else:
201
+ filename = str(opt_ckpt_file)
202
+ dcp.save(optim_states, checkpoint_id=f"{filename}/optimizer")
203
+ torch.save(lr_states, f"{filename}/lr_schedule.bin") # lr_schedule is not fsdp
204
+
205
+
206
+ @contextlib.contextmanager
207
+ def _fsdp2_no_sync(module, sync):
208
+ # v2 APIs
209
+ module.set_requires_gradient_sync(sync, recurse=True)
210
+ try:
211
+ yield
212
+ finally:
213
+ module.set_requires_gradient_sync(True, recurse=True)
214
+
215
+
216
+ def sync_ctx(model, sync=True):
217
+ if hasattr(model, 'set_requires_gradient_sync'):
218
+ return _fsdp2_no_sync(model, sync)
219
+ elif not sync and hasattr(model, 'no_sync'):
220
+ return model.no_sync()
221
+ return contextlib.nullcontext()
222
+
223
+
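A hedged sketch of using sync_ctx for gradient accumulation with either DDP or FSDP2: gradient synchronization is suppressed on every micro-batch except the last. model_ddp, optimizer, micro_batches, and compute_loss are placeholders, not names from this repository.

for i, batch in enumerate(micro_batches):
    last = (i == len(micro_batches) - 1)
    with sync_ctx(model_ddp, sync=last):   # DDP no_sync() / FSDP2 set_requires_gradient_sync
        loss = compute_loss(model_ddp, batch)
        loss.backward()
optimizer.step()
optimizer.zero_grad()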
224
+ # ==== Utility Functions ====
225
+
226
+ def set_random_seed(seed: int) -> None:
227
+ """Set random seed for reproducibility."""
228
+ random.seed(seed)
229
+ np.random.seed(seed)
230
+ torch.manual_seed(seed)
231
+ torch.cuda.manual_seed(seed)
232
+ torch.cuda.manual_seed_all(seed)