yujiwang0606 committed
Commit adecc3c · 1 Parent(s): 4d42c48
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +15 -0
  2. README.md +5 -5
  3. app.py +175 -0
  4. imaginaire/__init__.py +14 -0
  5. imaginaire/callbacks/__init__.py +14 -0
  6. imaginaire/callbacks/every_n.py +84 -0
  7. imaginaire/callbacks/manual_gc.py +49 -0
  8. imaginaire/config.py +410 -0
  9. imaginaire/lazy_config/__init__.py +73 -0
  10. imaginaire/lazy_config/file_io.py +24 -0
  11. imaginaire/lazy_config/instantiate.py +119 -0
  12. imaginaire/lazy_config/lazy.py +442 -0
  13. imaginaire/lazy_config/omegaconf_patch.py +65 -0
  14. imaginaire/lazy_config/registry.py +74 -0
  15. imaginaire/model.py +137 -0
  16. imaginaire/trainer.py +322 -0
  17. imaginaire/utils/__init__.py +14 -0
  18. imaginaire/utils/callback.py +518 -0
  19. imaginaire/utils/checkpointer.py +282 -0
  20. imaginaire/utils/config_helper.py +201 -0
  21. imaginaire/utils/device.py +39 -0
  22. imaginaire/utils/distributed.py +444 -0
  23. imaginaire/utils/easy_io/__init__.py +14 -0
  24. imaginaire/utils/easy_io/backends/__init__.py +28 -0
  25. imaginaire/utils/easy_io/backends/base_backend.py +60 -0
  26. imaginaire/utils/easy_io/backends/http_backend.py +91 -0
  27. imaginaire/utils/easy_io/backends/local_backend.py +551 -0
  28. imaginaire/utils/easy_io/backends/registry_utils.py +125 -0
  29. imaginaire/utils/easy_io/easy_io.py +1034 -0
  30. imaginaire/utils/easy_io/file_client.py +448 -0
  31. imaginaire/utils/easy_io/handlers/__init__.py +29 -0
  32. imaginaire/utils/easy_io/handlers/base.py +44 -0
  33. imaginaire/utils/easy_io/handlers/byte_handler.py +39 -0
  34. imaginaire/utils/easy_io/handlers/csv_handler.py +42 -0
  35. imaginaire/utils/easy_io/handlers/gzip_handler.py +33 -0
  36. imaginaire/utils/easy_io/handlers/imageio_video_handler.py +168 -0
  37. imaginaire/utils/easy_io/handlers/json_handler.py +49 -0
  38. imaginaire/utils/easy_io/handlers/jsonl_handler.py +80 -0
  39. imaginaire/utils/easy_io/handlers/np_handler.py +89 -0
  40. imaginaire/utils/easy_io/handlers/pandas_handler.py +31 -0
  41. imaginaire/utils/easy_io/handlers/pickle_handler.py +42 -0
  42. imaginaire/utils/easy_io/handlers/pil_handler.py +96 -0
  43. imaginaire/utils/easy_io/handlers/registry_utils.py +82 -0
  44. imaginaire/utils/easy_io/handlers/tarfile_handler.py +39 -0
  45. imaginaire/utils/easy_io/handlers/torch_handler.py +34 -0
  46. imaginaire/utils/easy_io/handlers/torchjit_handler.py +34 -0
  47. imaginaire/utils/easy_io/handlers/txt_handler.py +34 -0
  48. imaginaire/utils/easy_io/handlers/yaml_handler.py +38 -0
  49. imaginaire/utils/ema.py +315 -0
  50. imaginaire/utils/fused_adam.py +398 -0
.gitignore ADDED
@@ -0,0 +1,15 @@
+ # Python cache and build files
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+
+ # Virtual environments
+ .venv/
+ venv/
+ env/
+
+ # IDE and misc
+ .idea/
+ .vscode/
+ .DS_Store
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: RCM Wan 720p
- emoji: 🌍
- colorFrom: purple
- colorTo: pink
+ title: rCM-Wan 720p
+ emoji: 🐠
+ colorFrom: green
+ colorTo: gray
  sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
@@ -11,4 +11,4 @@ license: apache-2.0
  short_description: rCM model for Wan2.1
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This demo uses the unofficial rCM models for Wan from worstcoder/rcm-Wan.
app.py ADDED
@@ -0,0 +1,175 @@
+ import spaces
+ import gradio as gr
+ import time
+ import requests
+ from wan2pt1_t2v_rcm_infer import inference, prepare_models
+ from huggingface_hub import hf_hub_download
+ import random
+ from types import SimpleNamespace
+ import gc
+ import torch
+ from imaginaire.lazy_config import LazyCall as L, LazyDict, instantiate
+ from wan2pt1_t2v_rcm_infer import load_dit_model, WanModel
+
+
+ import flash_attn
+ print("flash_attn version: ", flash_attn.__version__)
+
+ WAN2PT1_1PT3B_T2V: LazyDict = L(WanModel)(
+     dim=1536,
+     eps=1e-06,
+     ffn_dim=8960,
+     freq_dim=256,
+     in_dim=16,
+     model_type="t2v",
+     num_heads=12,
+     num_layers=30,
+     out_dim=16,
+     text_len=512,
+ )
+
+ WAN2PT1_14B_T2V: LazyDict = L(WanModel)(
+     dim=5120,
+     eps=1e-06,
+     ffn_dim=13824,
+     freq_dim=256,
+     in_dim=16,
+     model_type="t2v",
+     num_heads=40,
+     num_layers=40,
+     out_dim=16,
+     text_len=512,
+ )
+
+ dit_configs = {"1.3B": WAN2PT1_1PT3B_T2V, "14B": WAN2PT1_14B_T2V}
+
+ dit_path_14B_720p = hf_hub_download(
+     repo_id="worstcoder/rcm-Wan",
+     filename="rCM_Wan2.1_T2V_14B_720p.pt",
+ )
+
+ vae_path = hf_hub_download(
+     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
+     filename="Wan2.1_VAE.pth"
+ )
+
+ text_encoder_path = hf_hub_download(
+     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
+     filename="models_t5_umt5-xxl-enc-bf16.pth"
+ )
+
+ net_14B_720p, tokenizer, t5_encoder = prepare_models(dit_path_14B_720p, vae_path, text_encoder_path)
+ print("Loaded models")
+ gc.collect()
+
+ def random_seed():
+     return random.randint(0, 2**32 - 1)
+
+ @spaces.GPU(duration=360)
+ def generate_videos(prompt, model_size, num_samples, aspect_ratio, sigma_max, num_steps, seed):
+     if seed is None:
+         seed = random.randint(0, 2**32 - 1)
+
+     if "480p" in model_size:
+         resolution = "480p"
+     else:
+         resolution = "720p"
+
+     args = SimpleNamespace(
+         prompt=prompt,
+         model_size=model_size,
+         num_steps=num_steps,
+         num_samples=num_samples,
+         sigma_max=sigma_max,
+         num_frames=77,
+         resolution=resolution,
+         aspect_ratio=aspect_ratio,
+         seed=seed,
+     )
+
+     with torch.no_grad():
+         video_list = inference(args, net_14B_720p, tokenizer, t5_encoder)
+
+     if aspect_ratio == "16:9":
+         return video_list, None
+     else:
+         return None, video_list
+
+ def update_num_samples(model_choice):
+     if model_choice == "rCM-Wan2.1-T2V-1.3B-480p":
+         options = [1, 2, 3, 4]
+     elif model_choice == "rCM-Wan2.1-T2V-14B-480p":
+         options = [1, 2, 3]
+     else:
+         options = [1, 2, 3]
+     return gr.Dropdown(choices=options, value=options[0], label="num_samples")
+
+ def update_sigma_max(model_choice):
+     if "480p" in model_choice:
+         options = [80, 120, 200, 400, 800, 1600]
+     else:
+         options = [120, 200, 400, 800, 1600]
+     return gr.Dropdown(choices=options, value=options[0], label="sigma_max")
+
+ with gr.Blocks() as demo:
+     gr.Markdown("## rCM model for Wan")
+
+     examples = [
+         ["A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about."],
+         ["A close-up shot captures a steaming hot pot brimming with vegetables and dumplings, set on a rustic wooden table. The camera focuses on the bubbling broth as a woman, dressed in a light, patterned blouse, reaches in with chopsticks to lift a tender leaf of cabbage from the simmering mixture. Steam rises around her as she leans back slightly, her warm smile reflecting satisfaction and joy. Her movements are smooth and deliberate, showcasing her comfort and familiarity with the dining process. The background includes a small bowl of dipping sauce and a clay pot, adding to the cozy, communal dining atmosphere."],
+         ["A dynamic time-lapse video showing the rapidly moving scenery from the window of a speeding train. The camera captures various elements such as lush green fields, towering trees, quaint countryside houses, and distant mountain ranges passing by quickly. The train window frames the view, adding a sense of speed and motion as the landscape rushes past. The camera remains static but emphasizes the fast-paced movement outside. The overall atmosphere is serene yet exhilarating, capturing the essence of travel and exploration. Medium shot focusing on the train window and the rushing scenery beyond."]
+     ]
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             with gr.Row():
+                 prompt = gr.Textbox(label="Text prompt", placeholder="Text prompt for videos")
+                 model_size = gr.Radio(
+                     ["rCM-Wan2.1-T2V-14B-720p"],
+                     value="rCM-Wan2.1-T2V-14B-720p",
+                     label="Model"
+                 )
+
+             with gr.Row():
+                 num_samples = gr.Dropdown([1, 2], value=1, label="num_samples")
+                 aspect_ratio = gr.Radio(["16:9", "9:16"], value="16:9", label="aspect_ratio")
+                 sigma_max = gr.Dropdown([120, 200, 400, 800, 1600], value=120, label="sigma_max")
+
+             with gr.Row():
+                 num_steps = gr.Slider(1, 4, value=4, step=1, label="num_steps")
+                 seed = gr.Number(label="seed", value=random_seed(), interactive=True)
+
+             with gr.Row():
+                 regenerate_btn = gr.Button("New Seed")
+                 run_btn = gr.Button("Generate Videos")
+
+             with gr.Row():
+                 gr.Examples(
+                     examples,
+                     inputs=[prompt],
+                     label="Example prompts"
+                 )
+
+         with gr.Column(scale=1):
+             video_16_9 = gr.Video(label="Videos 16:9", width=832)
+             video_9_16 = gr.Video(label="Videos 9:16", width=480, visible=False)
+
+     def show_video(aspect):
+         if aspect == "16:9":
+             return gr.update(visible=True), gr.update(visible=False, value=None)
+         else:
+             return gr.update(visible=False, value=None), gr.update(visible=True)
+
+     model_size.change(fn=update_num_samples, inputs=model_size, outputs=num_samples)
+     model_size.change(fn=update_sigma_max, inputs=model_size, outputs=sigma_max)
+
+     aspect_ratio.change(show_video, inputs=aspect_ratio, outputs=[video_16_9, video_9_16])
+     regenerate_btn.click(fn=random_seed, outputs=seed)
+
+     run_btn.click(
+         fn=generate_videos,
+         inputs=[prompt, model_size, num_samples, aspect_ratio, sigma_max, num_steps, seed],
+         outputs=[video_16_9, video_9_16],
+     )
+
+ demo.launch()
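
For reference, a minimal headless sketch of the same generation path the Gradio handler above wires up, assuming the models have been prepared as in app.py (the prompt and seed values are only illustrative):

    import torch
    from types import SimpleNamespace

    # Mirrors generate_videos() without the UI.
    args = SimpleNamespace(
        prompt="A red fox trotting through fresh snow at dawn",
        model_size="rCM-Wan2.1-T2V-14B-720p",
        num_steps=4,
        num_samples=1,
        sigma_max=120,
        num_frames=77,
        resolution="720p",
        aspect_ratio="16:9",
        seed=42,
    )
    with torch.no_grad():
        videos = inference(args, net_14B_720p, tokenizer, t5_encoder)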
imaginaire/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
imaginaire/callbacks/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
imaginaire/callbacks/every_n.py ADDED
@@ -0,0 +1,84 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from abc import abstractmethod
+
+ import torch
+
+ from imaginaire.model import ImaginaireModel
+ from imaginaire.trainer import ImaginaireTrainer
+ from imaginaire.utils import distributed, log
+ from imaginaire.utils.callback import Callback
+
+
+ class EveryN(Callback):
+     def __init__(
+         self,
+         every_n: int | None = None,
+         step_size: int = 1,
+         barrier_after_run: bool = True,
+         run_at_start: bool = False,
+     ) -> None:
+         """Constructor for `EveryN`.
+
+         Args:
+             every_n (int): Frequency with which the callback is run during training.
+             step_size (int): Size of the iteration step count. Default 1.
+             barrier_after_run (bool): Whether to place a distributed barrier after each execution. Default True, to avoid timeouts.
+             run_at_start (bool): Whether to run at the beginning of training. Default False.
+         """
+         self.every_n = every_n
+         if self.every_n == 0:
+             log.warning(
+                 f"every_n is set to 0. Callback {self.__class__.__name__} will be invoked only once at the beginning of training. Calls that would happen in on_training_step_end will be skipped."
+             )
+
+         self.step_size = step_size
+         self.barrier_after_run = barrier_after_run
+         self.run_at_start = run_at_start
+
+     def on_training_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         # every_n = 0 is a special case: every_n_impl is called only once at the beginning of training
+         if self.every_n != 0:
+             trainer = self.trainer
+             global_step = iteration // self.step_size
+             should_run = (iteration == 1 and self.run_at_start) or (global_step % self.every_n == 0)
+             if should_run:
+                 log.debug(f"Callback {self.__class__.__name__} fired on train_batch_end step {global_step}")
+                 self.every_n_impl(trainer, model, data_batch, output_batch, loss, iteration)
+                 log.debug(f"Callback {self.__class__.__name__} finished on train_batch_end step {global_step}")
+                 # add the necessary barrier to avoid timeouts
+                 if self.barrier_after_run:
+                     distributed.barrier()
+
+     @abstractmethod
+     def every_n_impl(
+         self,
+         trainer: ImaginaireTrainer,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int,
+     ) -> None: ...
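
To make the contract concrete, here is a minimal sketch of a concrete subclass (the class name and log message are illustrative, not part of this commit):

    from imaginaire.callbacks.every_n import EveryN
    from imaginaire.utils import log

    class LogLossEveryN(EveryN):
        """Illustrative subclass: log the training loss every N iterations."""

        def every_n_impl(self, trainer, model, data_batch, output_batch, loss, iteration):
            log.info(f"iteration {iteration}: loss = {loss.item():.4f}")

    loss_logger = LogLossEveryN(every_n=100)  # fires whenever iteration // step_size is a multiple of 100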
imaginaire/callbacks/manual_gc.py ADDED
@@ -0,0 +1,49 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import gc
+
+ from imaginaire.callbacks.every_n import EveryN
+ from imaginaire.utils import log
+
+
+ class ManualGarbageCollection(EveryN):
+     """
+     Disable automatic garbage collection and trigger it manually every N iterations.
+     This is very useful for large-scale training, where it reduces GPU sync time
+     and can yield up to a 50% speedup.
+
+     Note that this callback disables gc only in the main process; automatic gc
+     remains enabled in subprocesses.
+
+     We only start disabling gc after warm_up iterations, to avoid disabling it in
+     subprocesses (such as dataloader workers), which can cause OOM.
+     """
+
+     def __init__(self, *args, warm_up: int = 5, **kwargs):
+         kwargs["barrier_after_run"] = False
+         super().__init__(*args, **kwargs)
+
+         self.counter = 0
+         self.warm = warm_up
+
+     def every_n_impl(self, trainer, model, data_batch, output_batch, loss, iteration):
+         del trainer, model, data_batch, output_batch, loss
+         self.counter += 1
+         if self.counter < self.warm:
+             return
+         if self.counter == self.warm:
+             gc.disable()
+             log.critical("Garbage collection disabled")
+
+         gc.collect(1)  # collect only up to generation 1; cheaper than a full collection
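
A sketch of how this callback might be wired into the trainer's callback dict using the LazyCall style from imaginaire/config.py (the key name and frequency are illustrative assumptions):

    from imaginaire.callbacks.manual_gc import ManualGarbageCollection
    from imaginaire.lazy_config import LazyCall as L

    # Illustrative: trigger a manual generation-1 collection every 1000 iterations.
    callbacks = dict(
        manual_gc=L(ManualGarbageCollection)(every_n=1000, warm_up=5),
    )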
imaginaire/config.py ADDED
@@ -0,0 +1,410 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Training config system for Imaginaire4"""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Any, TypeVar
+
+ import attrs
+ import torch
+ import torch.utils.data
+
+ from imaginaire.model import ImaginaireModel
+
+ try:
+     from megatron.core import ModelParallelConfig
+
+     USE_MEGATRON = True
+ except ImportError:
+     USE_MEGATRON = False
+     print("Megatron-core is not installed.")
+
+ import builtins
+
+ from imaginaire.lazy_config import LazyCall as L
+ from imaginaire.lazy_config import LazyDict
+ from imaginaire.utils import callback, distributed
+ from imaginaire.utils.misc import Color
+
+ T = TypeVar("T")
+
+
+ def _is_attrs_instance(obj: object) -> bool:
+     """
+     Helper function to check if an object is an instance of an attrs-defined class.
+
+     Args:
+         obj: The object to check.
+
+     Returns:
+         bool: True if the object is an instance of an attrs-defined class, False otherwise.
+     """
+     return hasattr(obj, "__attrs_attrs__")
+
+
+ def make_freezable(cls: T) -> T:
+     """
+     A decorator that adds the capability to freeze instances of an attrs-defined class.
+
+     NOTE: This requires the wrapped attrs class to be defined with attrs.define(slots=False), because we need
+     to hack on a "_is_frozen" attribute.
+
+     This decorator enhances an attrs-defined class with the ability to be "frozen" at runtime.
+     Once an instance is frozen, its attributes cannot be changed. It also recursively freezes
+     any attrs-defined objects that are attributes of the class.
+
+     Usage:
+         @make_freezable
+         @attrs.define(slots=False)
+         class MyClass:
+             attribute1: int
+             attribute2: str
+
+         obj = MyClass(1, 'a')
+         obj.freeze()  # Freeze the instance
+         obj.attribute1 = 2  # Raises AttributeError
+
+     Args:
+         cls: The class to be decorated.
+
+     Returns:
+         The decorated class with added freezing capability.
+     """
+
+     if not hasattr(cls, "__dict__"):
+         raise TypeError(
+             "make_freezable cannot be used with classes that do not define __dict__. Make sure that the wrapped "
+             "class was defined with `@attrs.define(slots=False)`"
+         )
+
+     original_setattr = cls.__setattr__
+
+     def setattr_override(self, key, value) -> None:
+         """
+         Override __setattr__ to allow modifications during initialization
+         and prevent modifications once the instance is frozen.
+         """
+         if hasattr(self, "_is_frozen") and self._is_frozen and key != "_is_frozen":
+             raise AttributeError("Cannot modify frozen instance")
+         original_setattr(self, key, value)  # type: ignore
+
+     cls.__setattr__ = setattr_override  # type: ignore
+
+     def freeze(self: object) -> None:
+         """
+         Freeze the instance and all its attrs-defined attributes.
+         """
+         for _, value in attrs.asdict(self, recurse=False).items():
+             if _is_attrs_instance(value) and hasattr(value, "freeze"):
+                 value.freeze()
+         self._is_frozen = True  # type: ignore
+
+     cls.freeze = freeze  # type: ignore
+
+     return cls
+
+
+ def _pretty_print_attrs_instance(obj: object, indent: int = 0, use_color: bool = False) -> str:
+     """
+     Recursively pretty-prints attrs objects, optionally with color.
+     """
+
+     assert attrs.has(obj.__class__)
+
+     lines: list[str] = []
+     for attribute in attrs.fields(obj.__class__):
+         value = getattr(obj, attribute.name)
+         if attrs.has(value.__class__):
+             if use_color:
+                 lines.append(" " * indent + Color.cyan("* ") + Color.green(attribute.name) + ":")
+             else:
+                 lines.append(" " * indent + "* " + attribute.name + ":")
+             lines.append(_pretty_print_attrs_instance(value, indent + 1, use_color))
+         else:
+             if use_color:
+                 lines.append(
+                     " " * indent + Color.cyan("* ") + Color.green(attribute.name) + ": " + Color.yellow(value)
+                 )
+             else:
+                 lines.append(" " * indent + "* " + attribute.name + ": " + str(value))
+     return "\n".join(lines)
+
+
+ def pretty_print_overrides(overrides: list[str] | None = None, use_color: bool = False) -> str:
+     """
+     Pretty-prints overrides.
+     """
+
+     lines: list[str] = []
+     if use_color:
+         lines.append(Color.cyan("* ") + Color.green("overrides") + ": ")
+     else:
+         lines.append("* overrides: ")
+     for override in overrides or []:
+         if override == "--":
+             continue
+         if override.startswith("~"):
+             attribute_name = override[1:]
+             attribute_value = None
+         else:
+             attribute_name, attribute_value = override.split("=")
+         if use_color:
+             lines.append(" " + Color.cyan("* ") + Color.green(attribute_name) + ": " + Color.yellow(attribute_value))
+         else:
+             lines.append(" " + "* " + attribute_name + ": " + str(attribute_value))
+
+     return "\n".join(lines)
+
+
+ @make_freezable
+ @attrs.define(slots=False)  # slots=False is required for make_freezable. See the make_freezable notes for more info.
+ class ObjectStoreConfig:
+     # Whether the file I/O is from object store instead of local disk.
+     enabled: bool = False
+     # Path to the object store credentials file.
+     credentials: str = ""
+     # Object store bucket to read from / write to the objects.
+     bucket: str = ""
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class JobConfig:
+     # Project name.
+     project: str = ""
+     # Experiment name.
+     group: str = ""
+     # Run/job name.
+     name: str = ""
+
+     @property
+     def path(self) -> str:
+         return f"{self.project}/{self.group}/{self.name}"
+
+     @property
+     def path_local(self) -> str:
+         local_root = os.environ.get("IMAGINAIRE_OUTPUT_ROOT", "checkpoints")
+         return f"{local_root}/{self.path}"
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class EMAConfig:
+     # Enable tracking a set of exponential moving average (EMA) weights.
+     enabled: bool = False
+     # EMA decay rate.
+     beta: float = 0.9999
+     # Enable removing "_orig_mod-" from buffer names, which is added by torch.compile.
+     torch_compile_buffer_renaming: bool = False
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class PowerEMAConfig:
+     # Enable tracking a set of exponential moving average (EMA) weights.
+     enabled: bool = False
+     # EMA decay rate from the EDM2 paper.
+     s: float = 0.1
+     # Enable removing "_orig_mod-" from buffer names, which is added by torch.compile.
+     torch_compile_buffer_renaming: bool = False
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class DDPConfig:
+     # Traverse the computation graph to find parameters that don't receive gradients.
+     find_unused_parameters: bool = False
+     # Set to True if the computation graph does not change during the whole training loop.
+     static_graph: bool = True
+     # Set to True if we want to synchronize buffers. Set to False if the sync is going to be handled elsewhere.
+     broadcast_buffers: bool = True
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class CuDNNConfig:
+     # Set to True for better reproducibility of the results (only using deterministic cudnn functions).
+     deterministic: bool = False
+     # If set to True, cudnn will benchmark several algorithms and pick the fastest one.
+     benchmark: bool = True
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class JITConfig:
+     # Enable exporting a JIT-compiled model.
+     enabled: bool = False
+     # Input tensor shape, for example input.
+     input_shape: list[int] | None = None
+     # Device to compile onto.
+     device: str = "cuda"
+     # Data type to compile onto.
+     dtype: str = "bfloat16"
+     # Strict mode for PyTorch JIT.
+     strict: bool = True
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class CheckpointConfig:
+     # Possible checkpoint class.
+     type: dict | None = None
+     # For dcp, whether to use async mode.
+     dcp_async_mode_enabled: bool = False
+     # Save the checkpoint every N iterations.
+     save_iter: int = 999999999
+     # Path of model weights to resume the checkpoint from.
+     load_path: str = ""
+     # Whether to load the training states (optimizer/scheduler/grad-scaler) from the checkpoint path.
+     load_training_state: bool = False
+     # Whether to load only the scheduler state from the checkpoint path. If load_training_state is True, this will be ignored.
+     only_load_scheduler_state: bool = False
+     # Load state_dict into the models in strict mode.
+     strict_resume: bool = True
+     # Configs for JIT-compiling the EMA model.
+     jit: JITConfig = attrs.field(factory=JITConfig)
+     # Print detailed information during checkpoint saving/loading.
+     verbose: bool = True
+     # Keys not to resume from the checkpoint; choices: ["model", "optim", "scheduler", "trainer"].
+     keys_not_to_resume: list[str] = []  # noqa: RUF008
+     # Whether to use the local filesystem for broadcasting checkpoint data (used for the Tensor Parallel Checkpointer).
+     broadcast_via_filesystem: bool = False
+     load_ema_to_reg: bool = False
+     # In the dcp planner, skip the weight-shape check and load weights into the model even if the weight shape is different.
+     dcp_allow_mismatched_size: bool = False
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class NVTXConfig:
+     """Config for NVTX ranges used in the main training loop.
+
+     See tutorials/nanogpt for more details on how to integrate profiling into your model."""
+
+     # Enable the NVTX ranges.
+     enabled: bool = False
+     # Synchronize everything in each NVTX range.
+     cuda_synchronize: bool = False
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class Profiling:
+     enable_profiling: bool = False
+     enable_memory_snapshot: bool = False
+     profile_freq: int = 1
+     first_n_rank: int = 8  # -1 means all ranks; n means the first n ranks dump profiling info
+     record_shape: bool = True
+     profile_memory: bool = True
+     with_stack: bool = True
+     with_modules: bool = True
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class TrainerConfig:
+     from imaginaire.trainer import ImaginaireTrainer
+
+     type: builtins.type[ImaginaireTrainer] = ImaginaireTrainer
+     # Set the callback class.
+     # Defaults to the callbacks below.
+     callbacks: LazyDict[dict[str, callback.Callback]] = LazyDict(  # noqa: RUF009
+         dict(
+             ema=L(callback.EMAModelCallback)(),
+             progress_bar=L(callback.ProgressBarCallback)(),
+         )
+     )
+     # Distributed parallelism strategy.
+     distributed_parallelism: str = "ddp"
+     # Distributed data parallel configs.
+     ddp: DDPConfig = attrs.field(factory=DDPConfig)
+     # cuDNN configs.
+     cudnn: CuDNNConfig = attrs.field(factory=CuDNNConfig)
+     # Set the random seed.
+     seed: int = 0
+     # Gradient scaler arguments (for torch.amp.GradScaler).
+     grad_scaler_args: dict = attrs.field(factory=lambda: dict(enabled=False))
+     # Maximum number of iterations to train the model.
+     max_iter: int = 999999999
+     # Maximum number of iterations to validate the model. If None, validate on the entire dataset.
+     max_val_iter: int | None = None
+     # How often we log the training stats.
+     logging_iter: int = 100
+     # Whether we want to run the validation routines.
+     run_validation: bool = True
+     # How often we evaluate on the validation set.
+     validation_iter: int = 999999999
+     # Kill the process after N seconds since the last iteration (usually means a dead job).
+     timeout_period: int = 999999999
+     # Tensor memory organization format.
+     memory_format: torch.memory_format = torch.preserve_format
+     # Gradient accumulation (update step every N iterations).
+     grad_accum_iter: int = 1
+     # Profiling config.
+     profiling: Profiling = attrs.field(factory=Profiling)
+
+
+ @make_freezable
+ @attrs.define(slots=False)
+ class Config:
+     """Config for an imaginaire4 job.
+
+     See /README.md/Configuration System for more info.
+     """
+
+     # Model configs.
+     model: LazyDict[ImaginaireModel]
+     # Optimizer configs.
+     optimizer: LazyDict[torch.optim.Optimizer]
+     # Scheduler configs.
+     scheduler: LazyDict[torch.optim.lr_scheduler.LRScheduler]
+     # Training data configs.
+     dataloader_train: LazyDict[torch.utils.data.DataLoader]
+     # Validation data configs.
+     dataloader_val: LazyDict[torch.utils.data.DataLoader]
+
+     # Training job configs.
+     job: JobConfig = attrs.field(factory=JobConfig)
+
+     # Trainer configs.
+     trainer: TrainerConfig = attrs.field(factory=TrainerConfig)
+
+     if USE_MEGATRON:
+         # Megatron-Core configs.
+         model_parallel: ModelParallelConfig = attrs.field(factory=ModelParallelConfig)
+     else:
+         model_parallel: None = None
+
+     # Checkpointer configs.
+     checkpoint: CheckpointConfig = attrs.field(factory=CheckpointConfig)
+
+     def pretty_print(self, use_color: bool = False) -> str:
+         return _pretty_print_attrs_instance(self, 0, use_color)
+
+     def to_dict(self) -> dict[str, Any]:
+         return attrs.asdict(self)
+
+     def validate(self) -> None:
+         """Validate that the config has all required fields."""
+
+         # Broadcast job.name across all ranks to make sure it is consistent;
+         # otherwise, unaligned job names lead to unaligned paths for saving checkpoints.
+         job_name_tensor = torch.ByteTensor(bytearray(self.job.name, "utf-8")).cuda()
+         distributed.broadcast(job_name_tensor, 0)
+         self.job.name = job_name_tensor.cpu().numpy().tobytes().decode("utf-8")
+
+         assert self.job.project != ""
+         assert self.job.group != ""
+         assert self.job.name != ""
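
A small sketch of the freeze behaviour these config classes gain from make_freezable (the values here are illustrative):

    from imaginaire.config import EMAConfig

    ema = EMAConfig(enabled=True, beta=0.999)
    ema.freeze()  # recursively freezes any attrs-defined attributes too
    try:
        ema.beta = 0.99
    except AttributeError:
        print("frozen instances reject attribute writes")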
imaginaire/lazy_config/__init__.py ADDED
@@ -0,0 +1,73 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+
+ from omegaconf import OmegaConf
+
+ from imaginaire.lazy_config.instantiate import instantiate
+ from imaginaire.lazy_config.lazy import LazyCall, LazyConfig, LazyDict
+ from imaginaire.lazy_config.omegaconf_patch import to_object
+
+ OmegaConf.to_object = to_object
+
+ PLACEHOLDER = None
+
+ __all__ = ["PLACEHOLDER", "LazyCall", "LazyConfig", "LazyDict", "instantiate"]
+
+
+ DOC_BUILDING = os.getenv("_DOC_BUILDING", False)  # set in docs/conf.py
+
+
+ def fixup_module_metadata(module_name, namespace, keys=None):
+     """
+     Fix the __qualname__ of module members to be their exported api name, so
+     when they are referenced in docs, sphinx can find them. Reference:
+     https://github.com/python-trio/trio/blob/6754c74eacfad9cc5c92d5c24727a2f3b620624e/trio/_util.py#L216-L241
+     """
+     if not DOC_BUILDING:
+         return
+     seen_ids = set()
+
+     def fix_one(qualname, name, obj):
+         # avoid infinite recursion (relevant when using
+         # typing.Generic, for example)
+         if id(obj) in seen_ids:
+             return
+         seen_ids.add(id(obj))
+
+         mod = getattr(obj, "__module__", None)
+         if mod is not None and (mod.startswith(module_name) or mod.startswith("fvcore.")):
+             obj.__module__ = module_name
+             # Modules, unlike everything else in Python, put fully-qualified
+             # names into their __name__ attribute. We check for "." to avoid
+             # rewriting these.
+             if hasattr(obj, "__name__") and "." not in obj.__name__:
+                 obj.__name__ = name
+                 obj.__qualname__ = qualname
+             if isinstance(obj, type):
+                 for attr_name, attr_value in obj.__dict__.items():
+                     fix_one(qualname + "." + attr_name, attr_name, attr_value)
+
+     if keys is None:
+         keys = namespace.keys()
+     for objname in keys:
+         if not objname.startswith("_"):
+             obj = namespace[objname]
+             fix_one(objname, objname, obj)
+
+
+ fixup_module_metadata(__name__, globals(), __all__)
+ del fixup_module_metadata
imaginaire/lazy_config/file_io.py ADDED
@@ -0,0 +1,24 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler
+ from iopath.common.file_io import PathManager as PathManagerBase
+
+ __all__ = ["PathHandler", "PathManager"]
+
+
+ PathManager = PathManagerBase()
+ PathManager.register_handler(HTTPURLHandler())
+ PathManager.register_handler(OneDrivePathHandler())
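
The resulting PathManager dispatches on the path scheme. A hedged usage sketch (the URL is illustrative):

    from imaginaire.lazy_config.file_io import PathManager

    # Local paths go through the default handler; http(s) URLs are fetched
    # by the registered HTTPURLHandler, which caches the file locally.
    with PathManager.open("https://example.com/config.yaml", "r") as f:
        text = f.read()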
imaginaire/lazy_config/instantiate.py ADDED
@@ -0,0 +1,119 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import collections.abc as abc
+ import dataclasses
+ from typing import Any
+
+ import attrs
+
+ from imaginaire.lazy_config.registry import _convert_target_to_string, locate
+ from imaginaire.utils import log
+
+ __all__ = ["dump_dataclass", "instantiate"]
+
+
+ def is_dataclass_or_attrs(target):
+     return dataclasses.is_dataclass(target) or attrs.has(target)
+
+
+ def dump_dataclass(obj: Any):
+     """
+     Dump a dataclass recursively into a dict that can be later instantiated.
+
+     Args:
+         obj: a dataclass object
+
+     Returns:
+         dict
+     """
+     assert dataclasses.is_dataclass(obj) and not isinstance(obj, type), (
+         "dump_dataclass() requires an instance of a dataclass."
+     )
+     ret = {"_target_": _convert_target_to_string(type(obj))}
+     for f in dataclasses.fields(obj):
+         v = getattr(obj, f.name)
+         if dataclasses.is_dataclass(v):
+             v = dump_dataclass(v)
+         if isinstance(v, (list, tuple)):
+             v = [dump_dataclass(x) if dataclasses.is_dataclass(x) else x for x in v]
+         ret[f.name] = v
+     return ret
+
+
+ def instantiate(cfg, *args, **kwargs):
+     """
+     Recursively instantiate objects defined in dictionaries by
+     "_target_" and arguments.
+
+     Args:
+         cfg: a dict-like object with "_target_" that defines the caller, and
+             other keys that define the arguments
+         args: Optional positional parameters pass-through.
+         kwargs: Optional named parameters pass-through.
+
+     Returns:
+         object instantiated by cfg
+     """
+     from omegaconf import DictConfig, ListConfig, OmegaConf
+
+     if isinstance(cfg, ListConfig):
+         lst = [instantiate(x) for x in cfg]
+         return ListConfig(lst, flags={"allow_objects": True})
+     if isinstance(cfg, list):
+         # Specialize for list, because many classes take
+         # list[objects] as arguments, such as ResNet, DatasetMapper
+         return [instantiate(x) for x in cfg]
+
+     # If the input is a DictConfig backed by dataclasses (i.e. omegaconf's structured config),
+     # instantiate it to the actual dataclass.
+     if isinstance(cfg, DictConfig) and is_dataclass_or_attrs(cfg._metadata.object_type):
+         return OmegaConf.to_object(cfg)
+
+     if isinstance(cfg, abc.Mapping) and "_target_" in cfg:
+         # conceptually equivalent to hydra.utils.instantiate(cfg) with _convert_=all,
+         # but faster: https://github.com/facebookresearch/hydra/issues/1200
+         is_recursive = getattr(cfg, "_recursive_", True)
+         if is_recursive:
+             cfg = {k: instantiate(v) for k, v in cfg.items()}
+         else:
+             cfg = dict(cfg)
+         # pop the _recursive_ key to avoid passing it as a parameter
+         if "_recursive_" in cfg:
+             cfg.pop("_recursive_")
+         cls = cfg.pop("_target_")
+         cls = instantiate(cls)
+
+         if isinstance(cls, str):
+             cls_name = cls
+             cls = locate(cls_name)
+             assert cls is not None, cls_name
+         else:
+             try:
+                 cls_name = cls.__module__ + "." + cls.__qualname__
+             except Exception:
+                 # target could be anything, so the above could fail
+                 cls_name = str(cls)
+         assert callable(cls), f"_target_ {cls} does not define a callable object"
+         try:
+             # override config with kwargs
+             instantiate_kwargs = {}
+             instantiate_kwargs.update(cfg)
+             instantiate_kwargs.update(kwargs)
+             return cls(*args, **instantiate_kwargs)
+         except TypeError:
+             log.error(f"Error when instantiating {cls_name}!")
+             raise
+     return cfg  # return as-is if we don't know what to do
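
A brief sketch of the `_target_` convention this function consumes, paired with LazyCall from lazy.py (the layer and its arguments are illustrative):

    import torch.nn as nn
    from imaginaire.lazy_config import LazyCall as L, instantiate

    # Describe the call lazily; nothing is constructed yet.
    conv_cfg = L(nn.Conv2d)(in_channels=3, out_channels=16, kernel_size=3)
    conv_cfg.out_channels = 32    # the config stays editable until instantiation
    conv = instantiate(conv_cfg)  # now an actual nn.Conv2d(3, 32, 3) is built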
imaginaire/lazy_config/lazy.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import ast
17
+ import builtins
18
+ import collections.abc as abc
19
+ import importlib
20
+ import inspect
21
+ import logging
22
+ import os
23
+ import pickle
24
+ import uuid
25
+ from collections import OrderedDict
26
+ from contextlib import contextmanager
27
+ from copy import deepcopy
28
+ from dataclasses import is_dataclass
29
+ from typing import TYPE_CHECKING, Any, Generic, TypeAlias, TypeVar, cast
30
+
31
+ import attrs
32
+ import yaml
33
+ from omegaconf import DictConfig, ListConfig, OmegaConf
34
+
35
+ from imaginaire.utils import log
36
+
37
+ try:
38
+ import dill as dill_pickle
39
+ except ImportError:
40
+ dill_pickle = None
41
+
42
+ try:
43
+ import cloudpickle
44
+ except ImportError:
45
+ cloudpickle = None
46
+
47
+ from imaginaire.lazy_config.file_io import PathManager
48
+ from imaginaire.lazy_config.registry import _convert_target_to_string
49
+
50
+ __all__ = ["LazyCall", "LazyConfig", "LazyDict"]
51
+
52
+ T = TypeVar("T")
53
+
54
+
55
+ def sort_dict(d: dict[str, Any]) -> OrderedDict[str, Any]:
56
+ return OrderedDict(sorted(d.items(), key=lambda x: x[0]))
57
+
58
+
59
+ def dict_representer(dumper: yaml.Dumper, data: OrderedDict[str, Any]) -> yaml.nodes.MappingNode:
60
+ return dumper.represent_mapping("tag:yaml.org,2002:map", data.items())
61
+
62
+
63
+ def sort_recursive(obj: dict[str, Any] | list[Any] | Any) -> OrderedDict[str, Any] | list[Any] | Any:
64
+ if isinstance(obj, dict):
65
+ return sort_dict({k: sort_recursive(v) for k, v in obj.items()})
66
+ elif isinstance(obj, list):
67
+ return [sort_recursive(item) for item in obj]
68
+ return obj
69
+
70
+
71
+ yaml.add_representer(OrderedDict, dict_representer)
72
+
73
+ OmegaConf.register_new_resolver("add", lambda *vals: sum(vals))
74
+ OmegaConf.register_new_resolver("subtract", lambda *vals: vals[0] - sum(vals[1:]))
75
+
76
+
77
+ def get_default_params(cls_or_func):
78
+ if callable(cls_or_func):
79
+ # inspect signature for function
80
+ signature = inspect.signature(cls_or_func)
81
+ else:
82
+ # inspect signature for class
83
+ signature = inspect.signature(cls_or_func.__init__)
84
+ params = signature.parameters
85
+ default_params = {
86
+ name: param.default for name, param in params.items() if param.default is not inspect.Parameter.empty
87
+ }
88
+ return default_params
89
+
90
+
91
+ if TYPE_CHECKING:
92
+ # Have `LazyDict[T]` behave as `T`, so that attribute access works. Ideally, it
93
+ # would be a subclass of `T`, but this doesn't seem to be possible in the type
94
+ # system yet.
95
+ LazyDict: TypeAlias = T
96
+ else:
97
+ LazyDict = DictConfig
98
+
99
+
100
+ class LazyCall(Generic[T]):
101
+ """
102
+ Wrap a callable so that when it's called, the call will not be executed,
103
+ but returns a dict that describes the call.
104
+
105
+ LazyCall object has to be called with only keyword arguments. Positional
106
+ arguments are not yet supported.
107
+
108
+ Examples:
109
+ ::
110
+ from detectron2.config import instantiate, LazyCall
111
+
112
+ layer_cfg = LazyCall(nn.Conv2d)(in_channels=32, out_channels=32)
113
+ layer_cfg.out_channels = 64 # can edit it afterwards
114
+ layer = instantiate(layer_cfg)
115
+ """
116
+
117
+ def __init__(self, target: type[T]):
118
+ if not (callable(target) or isinstance(target, (str, abc.Mapping))):
119
+ raise TypeError(f"target of LazyCall must be a callable or defines a callable! Got {target}")
120
+ self._target = target
121
+
122
+ def __call__(self, **kwargs) -> LazyDict[T]:
123
+ if is_dataclass(self._target) or attrs.has(self._target):
124
+ # omegaconf object cannot hold dataclass type
125
+ # https://github.com/omry/omegaconf/issues/784
126
+ target = _convert_target_to_string(self._target)
127
+ else:
128
+ target = self._target
129
+ kwargs["_target_"] = target
130
+
131
+ _final_params = get_default_params(self._target)
132
+ _final_params.update(kwargs)
133
+
134
+ return cast(LazyDict[T], DictConfig(content=_final_params, flags={"allow_objects": True}))
135
+
136
+
137
+ def _visit_dict_config(cfg, func):
138
+ """
139
+ Apply func recursively to all DictConfig in cfg.
140
+ """
141
+ if isinstance(cfg, DictConfig):
142
+ func(cfg)
143
+ for v in cfg.values():
144
+ _visit_dict_config(v, func)
145
+ elif isinstance(cfg, ListConfig):
146
+ for v in cfg:
147
+ _visit_dict_config(v, func)
148
+
149
+
150
+ def _validate_py_syntax(filename):
151
+ # see also https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/config.py
152
+ with PathManager.open(filename, "r") as f:
153
+ content = f.read()
154
+ try:
155
+ ast.parse(content)
156
+ except SyntaxError as e:
157
+ raise SyntaxError(f"Config file {filename} has syntax error!") from e
158
+
159
+
160
+ def _cast_to_config(obj):
161
+ # if given a dict, return DictConfig instead
162
+ if isinstance(obj, dict):
163
+ return DictConfig(obj, flags={"allow_objects": True})
164
+ return obj
165
+
166
+
167
+ _CFG_PACKAGE_NAME = "detectron2._cfg_loader"
168
+ """
169
+ A namespace to put all imported config into.
170
+ """
171
+
172
+
173
+ def _random_package_name(filename):
174
+ # generate a random package name when loading config files
175
+ return _CFG_PACKAGE_NAME + str(uuid.uuid4())[:4] + "." + os.path.basename(filename)
176
+
177
+
178
+ @contextmanager
179
+ def _patch_import():
180
+ """
181
+ Enhance relative import statements in config files, so that they:
182
+ 1. locate files purely based on relative location, regardless of packages.
183
+ e.g. you can import file without having __init__
184
+ 2. do not cache modules globally; modifications of module states has no side effect
185
+ 3. support other storage system through PathManager, so config files can be in the cloud
186
+ 4. imported dict are turned into omegaconf.DictConfig automatically
187
+ """
188
+ old_import = builtins.__import__
189
+
190
+ def find_relative_file(original_file, relative_import_path, level):
191
+ # NOTE: "from . import x" is not handled. Because then it's unclear
192
+ # if such import should produce `x` as a python module or DictConfig.
193
+ # This can be discussed further if needed.
194
+ relative_import_err = """
195
+ Relative import of directories is not allowed within config files.
196
+ Within a config file, relative import can only import other config files.
197
+ """.replace("\n", " ")
198
+ if not len(relative_import_path):
199
+ raise ImportError(relative_import_err)
200
+
201
+ cur_file = os.path.dirname(original_file)
202
+ for _ in range(level - 1):
203
+ cur_file = os.path.dirname(cur_file)
204
+ cur_name = relative_import_path.lstrip(".")
205
+ for part in cur_name.split("."):
206
+ cur_file = os.path.join(cur_file, part)
207
+ if not cur_file.endswith(".py"):
208
+ cur_file += ".py"
209
+ if not PathManager.isfile(cur_file):
210
+ cur_file_no_suffix = cur_file[: -len(".py")]
211
+ if PathManager.isdir(cur_file_no_suffix):
212
+ raise ImportError(f"Cannot import from {cur_file_no_suffix}." + relative_import_err)
213
+ else:
214
+ raise ImportError(
215
+ f"Cannot import name {relative_import_path} from {original_file}: {cur_file} does not exist."
216
+ )
217
+ return cur_file
218
+
219
+ def new_import(name, globals=None, locals=None, fromlist=(), level=0):
220
+ if (
221
+ # Only deal with relative imports inside config files
222
+ level != 0 and globals is not None and (globals.get("__package__", "") or "").startswith(_CFG_PACKAGE_NAME)
223
+ ):
224
+ cur_file = find_relative_file(globals["__file__"], name, level)
225
+ _validate_py_syntax(cur_file)
226
+ spec = importlib.machinery.ModuleSpec(_random_package_name(cur_file), None, origin=cur_file)
227
+ module = importlib.util.module_from_spec(spec)
228
+ module.__file__ = cur_file
229
+ with PathManager.open(cur_file) as f:
230
+ content = f.read()
231
+ exec(compile(content, cur_file, "exec"), module.__dict__)
232
+ for name in fromlist: # turn imported dict into DictConfig automatically
233
+ val = _cast_to_config(module.__dict__[name])
234
+ module.__dict__[name] = val
235
+ return module
236
+ return old_import(name, globals, locals, fromlist=fromlist, level=level)
237
+
238
+ builtins.__import__ = new_import
239
+ yield new_import
240
+ builtins.__import__ = old_import
241
+
242
+
243
+ class LazyConfig:
244
+ """
245
+ Provide methods to save, load, and overrides an omegaconf config object
246
+ which may contain definition of lazily-constructed objects.
247
+ """
248
+
249
+ @staticmethod
250
+ def load_rel(filename: str, keys: None | str | tuple[str, ...] = None):
251
+ """
252
+ Similar to :meth:`load()`, but load path relative to the caller's
253
+ source file.
254
+
255
+ This has the same functionality as a relative import, except that this method
256
+ accepts filename as a string, so more characters are allowed in the filename.
257
+ """
258
+ caller_frame = inspect.stack()[1]
259
+ caller_fname = caller_frame[0].f_code.co_filename
260
+ assert caller_fname != "<string>", "load_rel Unable to find caller"
261
+ caller_dir = os.path.dirname(caller_fname)
262
+ filename = os.path.join(caller_dir, filename)
263
+ return LazyConfig.load(filename, keys)
264
+
265
+ @staticmethod
266
+ def load(filename: str, keys: None | str | tuple[str, ...] = None):
267
+ """
268
+ Load a config file.
269
+
270
+ Args:
271
+ filename: absolute path or relative path w.r.t. the current working directory
272
+ keys: keys to load and return. If not given, return all keys
273
+ (whose values are config objects) in a dict.
274
+ """
275
+ has_keys = keys is not None
276
+ filename = filename.replace("/./", "/") # redundant
277
+ if os.path.splitext(filename)[1] not in [".py", ".yaml", ".yml"]:
278
+ raise ValueError(f"Config file {filename} has to be a python or yaml file.")
279
+ if filename.endswith(".py"):
280
+ _validate_py_syntax(filename)
281
+
282
+ with _patch_import():
283
+ # Record the filename
284
+ module_namespace = {
285
+ "__file__": filename,
286
+ "__package__": _random_package_name(filename),
287
+ }
288
+ with PathManager.open(filename) as f:
289
+ content = f.read()
290
+ # Compile first with filename to:
291
+ # 1. make filename appears in stacktrace
292
+ # 2. make load_rel able to find its parent's (possibly remote) location
293
+ exec(compile(content, filename, "exec"), module_namespace)
294
+
295
+ ret = module_namespace
296
+ else:
297
+ with PathManager.open(filename) as f:
298
+ obj = yaml.unsafe_load(f)
299
+ ret = OmegaConf.create(obj, flags={"allow_objects": True})
300
+
301
+ if has_keys:
302
+ if isinstance(keys, str):
303
+ return _cast_to_config(ret[keys])
304
+ else:
305
+ return tuple(_cast_to_config(ret[a]) for a in keys)
306
+ else:
307
+ if filename.endswith(".py"):
308
+ # when not specified, only load those that are config objects
309
+ ret = DictConfig(
310
+ {
311
+ name: _cast_to_config(value)
312
+ for name, value in ret.items()
313
+ if isinstance(value, (DictConfig, ListConfig, dict)) and not name.startswith("_")
314
+ },
315
+ flags={"allow_objects": True},
316
+ )
317
+ return ret
318
+
319
+     @staticmethod
+     def save_pkl(cfg, filename: str) -> str:
+         """
+         Saves a Config object to a file using pickle serialization. This method is typically used
+         when the configuration object contains complex objects, such as lambdas, that are not supported by
+         simpler serialization methods like YAML. The function attempts to create a deep copy of the
+         configuration object before serialization to ensure that the original object remains unmodified.
+
+         Args:
+             cfg: A Config object to be serialized and saved.
+             filename: The path and name of the file where the configuration should be saved. The function
+                 assumes the file extension indicates a pickle format (e.g., .pkl).
+
+         Returns:
+             str: The filename to which the configuration was saved. This can be used to verify the file
+                 location or log the outcome.
+
+         Notes:
+             - The function logs a warning if the configuration is successfully saved using pickle.
+             - If saving fails, an error is logged with the exception details.
+         """
+         try:
+             cfg = deepcopy(cfg)
+         except Exception:
+             pass
+
+         try:
+             with PathManager.open(filename, "wb") as f:
+                 pickle.dump(cfg, f)
+             log.warning(f"Config is saved using pickle at {filename}.")
+         except Exception as e:
+             log.error(f"Failed to save config to {filename}: {e}. Trying dill or cloudpickle instead")
+             if dill_pickle:
+                 try:
+                     with PathManager.open(filename, "wb") as f:
+                         pickle.dump(dill_pickle.dumps(cfg, recurse=True), f)
+                     log.warning(f"Config is saved using dill at {filename}.")
+                 except Exception as e:
+                     log.error(f"Failed to save config to {filename}: {e}.")
+                     if cloudpickle:
+                         try:
+                             with PathManager.open(filename, "wb") as f:
+                                 pickle.dump(cloudpickle.dumps(cfg), f)
+                             log.warning(f"Config is saved using cloudpickle at {filename}.")
+                         except Exception as e:
+                             log.error(f"Failed to save config to {filename}: {e}.")
+                     else:
+                         log.error("cloudpickle is not available. Cannot save the config.")
+                         raise e
+
+         return filename
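Because the dill/cloudpickle fallbacks pickle a serialized bytes blob rather than the config object itself, reading such a file back is a two-step process. A minimal loading sketch (illustrative; assumes the dill fallback was the one that succeeded, and the file name is a placeholder):

    import pickle

    import dill

    with open("config.pkl", "rb") as f:
        blob = pickle.load(f)  # bytes produced by dill.dumps(cfg, recurse=True)
    cfg = dill.loads(blob)     # reconstruct the original config object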
+
+     @staticmethod
+     def save_yaml(cfg, filename: str) -> str:
+         """
+         Saves a Config object to a file using YAML serialization. This method is beneficial when the
+         configuration object's content needs to be human-readable and easily editable. YAML is suitable
+         for configurations that do not contain complex types like lambdas, which must be handled
+         differently. The function converts unserializable items to strings before saving to ensure
+         compatibility with YAML serialization.
+
+         Args:
+             cfg: A Config object to be serialized and saved. It handles both DictConfig and ListConfig types.
+             filename: The path and name of the file where the configuration should be saved. The function
+                 does not require a specific file extension but typically uses '.yaml'.
+
+         Returns:
+             str: The filename to which the configuration was saved. This can be used to verify the file
+                 location or log the outcome.
+
+         Notes:
+             - The function logs a warning if the configuration is successfully saved using YAML.
+             - If saving fails, an error is logged with the exception details.
+         """
+         logger = logging.getLogger(__name__)
+         try:
+             cfg = deepcopy(cfg)
+         except Exception:
+             pass
+
+         # Check whether an item is serializable to YAML.
+         def is_serializable(item):
+             try:
+                 OmegaConf.to_yaml(item)
+                 return True
+             except Exception:
+                 return False
+
+         # Convert unserializable items to strings.
+         def serialize_config(config):
+             if isinstance(config, DictConfig):
+                 for key, value in config.items():
+                     if isinstance(value, (DictConfig, ListConfig)):
+                         try:
+                             if "_target_" in value:
+                                 default_params = get_default_params(value["_target_"])
+                                 for default_key, default_v in default_params.items():
+                                     if default_key not in value:
+                                         value[default_key] = default_v
+                         except Exception as e:
+                             log.error(f"Failed to add default argument values: {e}")
+
+                         serialize_config(value)
+                     else:
+                         if not is_serializable(value) and value is not None:
+                             config[key] = str(value)
+             elif isinstance(config, ListConfig):
+                 for i, item in enumerate(config):
+                     if isinstance(item, (DictConfig, ListConfig)):
+                         serialize_config(item)
+                     else:
+                         if not is_serializable(item) and item is not None:
+                             config[i] = str(item)
+             else:
+                 raise NotImplementedError("Input config must be a DictConfig or ListConfig.")
+             return config
+
+         # Convert the Config object to a DictConfig object.
+         config_dict = attrs.asdict(cfg)
+         config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})
+
+         # Serialize the DictConfig object by converting non-serializable objects to strings.
+         config_omegaconf = serialize_config(config_omegaconf)
+
+         config_dict: dict[str, Any] = OmegaConf.to_container(config_omegaconf, resolve=True)
+         sorted_config: OrderedDict[str, Any] = sort_recursive(config_dict)
+         with open(filename, "w") as f:
+             yaml.dump(sorted_config, f, default_flow_style=False)
+         log.warning(f"Config is saved using omegaconf at {filename}.")
+         return filename
imaginaire/lazy_config/omegaconf_patch.py ADDED
@@ -0,0 +1,65 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any
+
+ from omegaconf import OmegaConf
+ from omegaconf.base import DictKeyType, SCMode
+ from omegaconf.dictconfig import DictConfig  # pragma: no cover
+
+
+ def to_object(cfg: Any) -> dict[DictKeyType, Any] | list[Any] | None | str | Any:
+     """
+     Converts an OmegaConf configuration object to a native Python container (dict or list), unless
+     the configuration is specifically created by LazyCall, in which case the original configuration
+     is returned directly.
+
+     This function serves as a modification of the original `to_object` method from OmegaConf,
+     preventing DictConfig objects created by LazyCall from being automatically converted to Python
+     dictionaries. This ensures that configurations meant to be lazily evaluated retain their intended
+     structure and behavior.
+
+     Differences from OmegaConf's original `to_object`:
+     - Adds a check at the beginning to return the configuration unchanged if it is created by LazyCall.
+
+     Reference:
+     - Original OmegaConf `to_object` method: https://github.com/omry/omegaconf/blob/master/omegaconf/omegaconf.py#L595
+
+     Args:
+         cfg (Any): The OmegaConf configuration object to convert.
+
+     Returns:
+         Union[Dict[DictKeyType, Any], List[Any], None, str, Any]: The converted Python container if
+         `cfg` is not a LazyCall created configuration, otherwise the unchanged `cfg`.
+
+     Examples:
+         >>> cfg = DictConfig({"key": "value", "_target_": "Model"})
+         >>> to_object(cfg)
+         DictConfig({"key": "value", "_target_": "Model"})
+
+         >>> cfg = DictConfig({"list": [1, 2, 3]})
+         >>> to_object(cfg)
+         {'list': [1, 2, 3]}
+     """
+     if isinstance(cfg, DictConfig) and "_target_" in cfg.keys():
+         return cfg
+
+     return OmegaConf.to_container(
+         cfg=cfg,
+         resolve=True,
+         throw_on_missing=True,
+         enum_to_str=False,
+         structured_config_mode=SCMode.INSTANTIATE,
+     )
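A behavior sketch (illustrative; the field names are arbitrary): unlike the stock OmegaConf.to_object, this variant leaves LazyCall-style configs untouched:

    from omegaconf import DictConfig

    lazy_cfg = DictConfig({"_target_": "torch.nn.Linear", "in_features": 8, "out_features": 1})
    plain_cfg = DictConfig({"lr": 0.1})

    assert to_object(lazy_cfg) is lazy_cfg       # kept as a DictConfig for lazy instantiation
    assert to_object(plain_cfg) == {"lr": 0.1}   # converted to a native dict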
imaginaire/lazy_config/registry.py ADDED
@@ -0,0 +1,74 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import pydoc
+ from typing import Any
+
+ from fvcore.common.registry import Registry  # for backward compatibility.
+
+ """
+ ``Registry`` and ``locate`` provide ways to map a string (typically found
+ in config files) to callable objects.
+ """
+
+ __all__ = ["Registry", "locate"]
+
+
+ def _convert_target_to_string(t: Any) -> str:
+     """
+     Inverse of ``locate()``.
+
+     Args:
+         t: any object with ``__module__`` and ``__qualname__``
+     """
+     module, qualname = t.__module__, t.__qualname__
+
+     # Compress the path to this object, e.g. ``module.submodule._impl.class``
+     # may become ``module.submodule.class``, if the latter also resolves to the same
+     # object. This simplifies the string, and also is less affected by moving the
+     # class implementation.
+     module_parts = module.split(".")
+     for k in range(1, len(module_parts)):
+         prefix = ".".join(module_parts[:k])
+         candidate = f"{prefix}.{qualname}"
+         try:
+             if locate(candidate) is t:
+                 return candidate
+         except ImportError:
+             pass
+     return f"{module}.{qualname}"
+
+
+ def locate(name: str) -> Any:
+     """
+     Locate and return an object ``x`` using an input string ``{x.__module__}.{x.__qualname__}``,
+     such as "module.submodule.class_name".
+
+     Raise Exception if it cannot be found.
+     """
+     obj = pydoc.locate(name)
+
+     # Some cases (e.g. torch.optim.sgd.SGD) are not handled correctly
+     # by pydoc.locate. Try a private function from hydra.
+     if obj is None:
+         try:
+             # from hydra.utils import get_method - will print many errors
+             from hydra.utils import _locate
+         except ImportError as e:
+             raise ImportError(f"Cannot dynamically locate object {name}!") from e
+         else:
+             obj = _locate(name)  # it raises if it fails
+
+     return obj
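A usage sketch (illustrative): locate resolves a dotted path to the object it names, and _convert_target_to_string is its inverse, possibly shortening the module path:

    from torch.optim import SGD

    assert locate("torch.optim.SGD") is SGD
    assert locate(_convert_target_to_string(SGD)) is SGD  # round trip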
imaginaire/model.py ADDED
@@ -0,0 +1,137 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any
+
+ import torch
+
+ from imaginaire.lazy_config import LazyDict, instantiate
+
+
+ class ImaginaireModel(torch.nn.Module):
+     """The base model class of Imaginaire. It is inherited from torch.nn.Module.
+
+     All models in Imaginaire should inherit ImaginaireModel. It should include the implementations for all the
+     computation graphs. All inheriting child classes should implement the following methods:
+     - training_step(): The training step of the model, including the loss computation.
+     - validation_step(): The validation step of the model, including the loss computation.
+     - forward(): The computation graph for model inference.
+     The following methods have default implementations in ImaginaireModel:
+     - init_optimizer_scheduler(): Creates the optimizer and scheduler for the model.
+     """
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def init_optimizer_scheduler(
+         self,
+         optimizer_config: LazyDict[torch.optim.Optimizer],
+         scheduler_config: LazyDict[torch.optim.lr_scheduler.LRScheduler],
+     ) -> tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LRScheduler]:
+         """Creates the optimizer and scheduler for the model.
+
+         Args:
+             optimizer_config (LazyDict): The config object for the optimizer.
+             scheduler_config (LazyDict): The config object for the scheduler.
+
+         Returns:
+             optimizer (torch.optim.Optimizer): The model optimizer.
+             scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+         """
+         optimizer_config.params = self.parameters()
+         optimizer = instantiate(optimizer_config)
+         scheduler_config.optimizer = optimizer
+         scheduler = instantiate(scheduler_config)
+         return optimizer, scheduler
+
+     def training_step(
+         self, data_batch: dict[str, torch.Tensor], iteration: int
+     ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+         """The training step of the model, including the loss computation.
+
+         Args:
+             data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
+             iteration (int): Current iteration number.
+
+         Returns:
+             output_batch (dict[str, torch.Tensor]): Auxiliary model output from the training batch.
+             loss (torch.Tensor): The total loss for backprop (weighted sum of various losses).
+         """
+         raise NotImplementedError
+
+     @torch.no_grad()
+     def validation_step(
+         self, data_batch: dict[str, torch.Tensor], iteration: int
+     ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+         """The validation step of the model, including the loss computation.
+
+         Args:
+             data_batch (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
+             iteration (int): Current iteration number.
+
+         Returns:
+             output_batch (dict[str, torch.Tensor]): Auxiliary model output from the validation batch.
+             loss (torch.Tensor): The total loss (weighted sum of various losses).
+         """
+         raise NotImplementedError
+
+     @torch.inference_mode()
+     def forward(self, *args: Any, **kwargs: Any) -> Any:
+         """The computation graph for model inference.
+
+         Args:
+             *args: Whatever you decide to pass into the forward method.
+             **kwargs: Keyword arguments are also possible.
+
+         Return:
+             Your model's output.
+         """
+         raise NotImplementedError
+
+     def on_model_init_start(self, set_barrier=False) -> None:
+         return
+
+     def on_model_init_end(self, set_barrier=False) -> None:
+         return
+
+     def on_train_start(self, memory_format: torch.memory_format = torch.preserve_format) -> None:
+         """The model preparation before the training is launched.
+
+         Args:
+             memory_format (torch.memory_format): Memory format of the model.
+         """
+         pass
+
+     def on_before_zero_grad(
+         self, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LRScheduler, iteration: int
+     ) -> None:
+         """Hook before zero_grad() is called.
+
+         Args:
+             optimizer (torch.optim.Optimizer): The model optimizer.
+             scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+             iteration (int): Current iteration number.
+         """
+         pass
+
+     def on_after_backward(self, iteration: int = 0) -> None:
+         """Hook after loss.backward() is called.
+
+         This method is called immediately after the backward pass, allowing for custom operations
+         or modifications to be performed on the gradients before the optimizer step.
+
+         Args:
+             iteration (int): Current iteration number.
+         """
+         pass
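A minimal subclass sketch (illustrative; the linear layer, MSE loss, and the "x"/"y" batch keys are placeholders, not part of this commit):

    import torch

    from imaginaire.model import ImaginaireModel


    class ToyRegressor(ImaginaireModel):
        def __init__(self) -> None:
            super().__init__()
            self.net = torch.nn.Linear(8, 1)

        def training_step(self, data_batch, iteration):
            pred = self.net(data_batch["x"])
            loss = torch.nn.functional.mse_loss(pred, data_batch["y"])
            return {"pred": pred}, loss  # (auxiliary outputs, total loss)

        @torch.no_grad()
        def validation_step(self, data_batch, iteration):
            return self.training_step(data_batch, iteration)

        @torch.inference_mode()
        def forward(self, x):
            return self.net(x)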
imaginaire/trainer.py ADDED
@@ -0,0 +1,322 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import functools
+ import inspect
+ import os
+ import signal
+
+ import torch
+ import torch.distributed as dist
+ import torch.utils.data
+
+ from imaginaire.utils.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling
+
+ try:
+     from megatron.core import parallel_state
+
+     USE_MEGATRON = True
+ except ImportError:
+     USE_MEGATRON = False
+     print("Megatron-core is not installed.")
+
+
+ from imaginaire.lazy_config import LazyConfig, instantiate
+ from imaginaire.model import ImaginaireModel
+ from imaginaire.utils import callback, distributed, log, misc
+ from imaginaire.utils.checkpointer import Checkpointer
+
+
+ class ImaginaireTrainer:
+     """The base trainer class of Imaginaire.
+
+     All trainers in Imaginaire should inherit ImaginaireTrainer. It contains the basic functionality for model training
+     (particularly suited for large-scale training), including data parallelism (DDP/FSDP), model weight averaging (EMA),
+     and mixed-precision training (fp16/bf16).
+
+     Attributes:
+         checkpointer (Checkpointer): checkpointer object to save/load model weights and optimizer states.
+         training_timer (misc.Timer): Timer object to time code blocks and functions.
+     """
+
+     def __init__(self, config):
+         """Constructor of the trainer.
+
+         Args:
+             config (Config): The config object for the Imaginaire codebase.
+         """
+         super().__init__()
+         self.config = config
+         # Set up the distributed computing environment.
+         with misc.timer("init_distributed"):
+             distributed.init()
+         # Set up parallel states.
+         if hasattr(config.model, "context_parallel_size"):
+             if config.model_parallel.context_parallel_size > 1:
+                 raise ValueError(
+                     "Both config.model.context_parallel_size and config.model_parallel.context_parallel_size are set. "
+                     "config.model.context_parallel_size is deprecated. Please only set config.model_parallel.context_parallel_size."
+                 )
+             else:
+                 log.critical(
+                     "Using deprecated config.model.context_parallel_size. Please use config.model_parallel.context_parallel_size instead."
+                 )
+                 config.model_parallel.context_parallel_size = config.model.context_parallel_size
+         if USE_MEGATRON:
+             if (
+                 "create_gloo_process_groups"
+                 in inspect.signature(parallel_state.initialize_model_parallel).parameters
+             ):
+                 parallel_state.initialize_model_parallel(
+                     pipeline_model_parallel_size=config.model_parallel.pipeline_model_parallel_size,
+                     tensor_model_parallel_size=config.model_parallel.tensor_model_parallel_size,
+                     context_parallel_size=config.model_parallel.context_parallel_size,
+                     create_gloo_process_groups=False,
+                 )
+             else:
+                 parallel_state.initialize_model_parallel(
+                     pipeline_model_parallel_size=config.model_parallel.pipeline_model_parallel_size,
+                     tensor_model_parallel_size=config.model_parallel.tensor_model_parallel_size,
+                     context_parallel_size=config.model_parallel.context_parallel_size,
+                 )
+             # `config.model_parallel.sequence_parallel` is a bool that indicates whether to use sequence parallelism.
+             # It is not part of the original `parallel_state` API, so we need to set it manually.
+             parallel_state.sequence_parallel = config.model_parallel.sequence_parallel
+             if parallel_state.sequence_parallel:
+                 os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
+         # Create the local job directory, save the config file, and pipe to a local log.
+         if distributed.is_rank0():
+             os.makedirs(config.job.path_local, exist_ok=True)
+             # Save the config as .pkl for reproducibility.
+             LazyConfig.save_pkl(config, f"{config.job.path_local}/config.pkl")
+             # Save the config as .yaml for reading or parsing experiment hyperparameters.
+             LazyConfig.save_yaml(config, f"{config.job.path_local}/config.yaml")
+         dist.barrier()
+         log.init_loguru_file(f"{config.job.path_local}/stdout.log")
+         if distributed.is_rank0():
+             # Print important environment variables and the effective config.
+             log.info("Config:\n" + config.pretty_print(use_color=True))
+             misc.print_environ_variables(["TORCH_HOME", "IMAGINAIRE_OUTPUT_ROOT"])
+         # Set the random seed. If multi-GPU, different ranks are set with different seeds.
+         misc.set_random_seed(seed=config.trainer.seed, by_rank=True)
+         # Initialize cuDNN.
+         torch.backends.cudnn.deterministic = config.trainer.cudnn.deterministic
+         torch.backends.cudnn.benchmark = config.trainer.cudnn.benchmark
+         # Floating-point precision settings.
+         torch.backends.cudnn.allow_tf32 = torch.backends.cuda.matmul.allow_tf32 = True
+         # Initialize the callback functions.
+         self.callbacks = callback.CallBackGroup(config=config, trainer=self)
+         # Initialize the model checkpointer.
+         if config.checkpoint.type is None:
+             self.checkpointer = Checkpointer(config.checkpoint, config.job, callbacks=self.callbacks)
+         else:
+             self.checkpointer: Checkpointer = instantiate(
+                 config.checkpoint.type, config.checkpoint, config.job, callbacks=self.callbacks
+             )
+         # Initialize the timer for speed benchmarking.
+         self.training_timer = misc.TrainingTimer()
+         # Send a TimeoutError if a training step takes over timeout_period seconds.
+         signal.signal(signal.SIGALRM, functools.partial(misc.timeout_handler, config.trainer.timeout_period))  # type: ignore
+
+     def train(
+         self,
+         model: ImaginaireModel,
+         dataloader_train: torch.utils.data.DataLoader,
+         dataloader_val: torch.utils.data.DataLoader,
+     ) -> None:
+         """The training function.
+
+         Args:
+             model (ImaginaireModel): The PyTorch model.
+             dataloader_train (torch.utils.data.DataLoader): The training data loader.
+             dataloader_val (torch.utils.data.DataLoader): The validation data loader.
+         """
+         # Leaving this for backward compatibility for now, but we can think about moving this to
+         # model.on_train_start for all models.
+         model = model.to("cuda", memory_format=self.config.trainer.memory_format)  # type: ignore
+         model.on_train_start(self.config.trainer.memory_format)
+
+         # Initialize the optimizer, scheduler, and grad_scaler.
+         self.callbacks.on_optimizer_init_start()
+         optimizer, scheduler = model.init_optimizer_scheduler(self.config.optimizer, self.config.scheduler)
+         grad_scaler = torch.amp.GradScaler("cuda", **self.config.trainer.grad_scaler_args)
+         self.callbacks.on_optimizer_init_end()
+         # Load the model checkpoint and get the starting iteration number.
+         iteration = self.checkpointer.load(model, optimizer, scheduler, grad_scaler)
+         grad_accum_iter = 0
+         log.critical(f"Distributed parallelism mode: {self.config.trainer.distributed_parallelism}")
+         if self.config.trainer.distributed_parallelism == "ddp":
+             # Create a DDP model wrapper.
+             model_ddp = distributed.parallel_model_wrapper(self.config.trainer.ddp, model)
+         elif self.config.trainer.distributed_parallelism == "fsdp":
+             model_ddp = model
+         else:
+             raise ValueError(f"Unknown distributed parallelism mode: {self.config.trainer.distributed_parallelism}")
+         log.info("Starting training...")
+         self.callbacks.on_train_start(model, iteration=iteration)
+         # Initial validation.
+         if self.config.trainer.run_validation and iteration == 0:
+             self.validate(model, dataloader_val, iteration=iteration)
+             log.info("Initial validation done.")
+         _end_training = False
+         with (
+             maybe_enable_profiling(self.config, global_step=iteration) as torch_profiler,
+             maybe_enable_memory_snapshot(self.config, global_step=iteration) as memory_profiler,
+         ):
+             while True:
+                 dataloader_train_iter = iter(dataloader_train)
+                 while True:
+                     self.callbacks.on_before_dataloading(iteration)
+                     try:
+                         with self.training_timer("dataloader_train"):
+                             data_batch = next(dataloader_train_iter)
+                     except StopIteration:
+                         break
+                     finally:
+                         self.callbacks.on_after_dataloading(iteration)
+                     # If max_iter is reached, exit the training loop.
+                     if iteration >= self.config.trainer.max_iter:
+                         _end_training = True
+                         break
+                     # Move all tensors in the data batch to the GPU device.
+                     data_batch = misc.to(data_batch, device="cuda")
+                     # The actual training step.
+                     self.callbacks.on_training_step_start(model, data_batch, iteration=iteration)
+                     self.callbacks.on_training_step_batch_start(model, data_batch, iteration=iteration)
+                     if not model.training:
+                         model_ddp.train()
+                     assert model_ddp.training, "model_ddp is not in training mode."
+                     assert model.training, "model is not in training mode."
+                     output_batch, loss, grad_accum_iter = self.training_step(
+                         model_ddp,
+                         optimizer,
+                         scheduler,
+                         grad_scaler,
+                         data_batch,
+                         iteration=iteration,
+                         grad_accum_iter=grad_accum_iter,
+                     )
+                     self.callbacks.on_training_step_batch_end(
+                         model, data_batch, output_batch, loss, iteration=iteration
+                     )
+                     # If the gradients are still being accumulated, continue to load the next training batch.
+                     if grad_accum_iter != 0:
+                         continue
+                     # Do the following when an actual optimizer (update) step has been made.
+                     iteration += 1
+                     # Save checkpoint.
+                     if iteration % self.config.checkpoint.save_iter == 0:
+                         self.checkpointer.save(model, optimizer, scheduler, grad_scaler, iteration=iteration)
+                     self.callbacks.on_training_step_end(model, data_batch, output_batch, loss, iteration=iteration)
+                     # Validation.
+                     if self.config.trainer.run_validation and iteration % self.config.trainer.validation_iter == 0:
+                         self.validate(model, dataloader_val, iteration=iteration)
+                     # This iteration is successful; reset the timeout signal.
+                     signal.alarm(self.config.trainer.timeout_period)
+                     if torch_profiler:
+                         torch_profiler.step()
+                     if memory_profiler:
+                         memory_profiler.step()
+                 if _end_training:
+                     break
+         log.success("Done with training.")
+         if iteration % self.config.checkpoint.save_iter != 0:
+             self.checkpointer.save(model, optimizer, scheduler, grad_scaler, iteration=iteration)
+         self.callbacks.on_train_end(model, iteration=iteration)
+         self.checkpointer.finalize()
+         distributed.barrier()
+         self.callbacks.on_app_end()
+
+     def training_step(
+         self,
+         model_ddp: torch.nn.Module | distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         grad_scaler: torch.amp.GradScaler,
+         data: dict[str, torch.Tensor],
+         iteration: int = 0,
+         grad_accum_iter: int = 0,
+     ) -> tuple[dict[str, torch.Tensor], torch.Tensor, int]:
+         """The training step.
+
+         Args:
+             model_ddp (torch.nn.Module | distributed.DistributedDataParallel): The model with a DDP wrapper,
+                 or the bare module, depending on whether distributed training is enabled.
+             optimizer (torch.optim.Optimizer): The model optimizer.
+             scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+             grad_scaler (torch.amp.GradScaler): The gradient scaler (for mixed precision training).
+             data (dict[str, torch.Tensor]): Data batch (dictionary of tensors).
+             iteration (int): Current iteration number.
+             grad_accum_iter (int): Number of gradient accumulation iterations.
+
+         Returns:
+             output (dict[str, torch.Tensor]): The model output from the training data batch (dictionary of tensors).
+             loss (torch.Tensor): The total loss of the training data batch.
+             grad_accum_iter (int): The updated gradient accumulation counter (0 right after an optimizer step).
+         """
+         # Only let DDP sync gradients at the last iteration of the gradient accumulation window.
+         with distributed.ddp_sync_grad(model_ddp, grad_accum_iter == self.config.trainer.grad_accum_iter - 1):
+             self.callbacks.on_before_forward(iteration=iteration)
+             with self.training_timer("forward"):
+                 output_batch, loss = model_ddp.training_step(data, iteration)
+             self.callbacks.on_after_forward(iteration=iteration)
+             self.callbacks.on_before_backward(model_ddp, loss, iteration=iteration)
+             with self.training_timer("backward"):
+                 loss_scaled = grad_scaler.scale(loss / self.config.trainer.grad_accum_iter)
+                 loss_scaled.backward()
+                 if self.config.trainer.distributed_parallelism == "ddp":
+                     model_ddp.module.on_after_backward()
+                 else:
+                     model_ddp.on_after_backward()
+             self.callbacks.on_after_backward(model_ddp, iteration=iteration)
+         grad_accum_iter += 1
+         if grad_accum_iter == self.config.trainer.grad_accum_iter:
+             with self.training_timer("optimizer_step"):
+                 self.callbacks.on_before_optimizer_step(
+                     model_ddp, optimizer, scheduler, grad_scaler, iteration=iteration
+                 )
+                 grad_scaler.step(optimizer)
+                 grad_scaler.update()
+                 scheduler.step()
+                 self.callbacks.on_before_zero_grad(model_ddp, optimizer, scheduler, iteration=iteration)
+                 if self.config.trainer.distributed_parallelism == "ddp":
+                     model_ddp.module.on_before_zero_grad(optimizer, scheduler, iteration=iteration)
+                 else:
+                     model_ddp.on_before_zero_grad(optimizer, scheduler, iteration=iteration)
+                 optimizer.zero_grad(set_to_none=True)
+             grad_accum_iter = 0
+         return output_batch, loss, grad_accum_iter
+
+     @torch.no_grad()
+     def validate(self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0) -> None:
+         """Validate on the full validation dataset.
+
+         Args:
+             model (ImaginaireModel): The PyTorch model.
+             dataloader_val (torch.utils.data.DataLoader): The validation data loader.
+             iteration (int): Current iteration number.
+         """
+         log.info(f"Validating at iteration {iteration}...")
+         self.callbacks.on_validation_start(model, dataloader_val, iteration=iteration)
+         model.eval()
+         # Evaluate on the full validation set.
+         with model.pipe.ema_scope(context="Validation", is_cpu=False):
+             for val_iter, data_batch in enumerate(dataloader_val):
+                 if self.config.trainer.max_val_iter is not None and val_iter >= self.config.trainer.max_val_iter:
+                     break
+                 data_batch = misc.to(data_batch, device="cuda")
+                 self.callbacks.on_validation_step_start(model, data_batch, iteration=iteration)
+                 output_batch, loss = model.validation_step(data_batch, iteration)
+                 self.callbacks.on_validation_step_end(model, data_batch, output_batch, loss, iteration=iteration)
+         self.callbacks.on_validation_end(model, iteration=iteration)
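For clarity, the gradient-accumulation arithmetic in training_step(): each micro-batch backpropagates loss / grad_accum_iter, so after grad_accum_iter micro-batches the accumulated gradient equals the gradient of the mean loss, and only then does the optimizer step. The same pattern in isolation (a sketch; model, optimizer, and batches are placeholders):

    grad_accum_iter = 4
    for step, batch in enumerate(batches):
        loss = model(batch)
        (loss / grad_accum_iter).backward()  # gradients sum across micro-batches
        if (step + 1) % grad_accum_iter == 0:
            optimizer.step()                 # one update per accumulation window
            optimizer.zero_grad(set_to_none=True)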
imaginaire/utils/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
imaginaire/utils/callback.py ADDED
@@ -0,0 +1,518 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import time
+ import warnings
+ from collections.abc import Callable
+ from typing import TYPE_CHECKING, Any
+
+ import omegaconf
+ import torch
+ import torch.utils.data
+ import tqdm
+
+ from imaginaire.lazy_config import instantiate
+ from imaginaire.utils import distributed, log
+ from imaginaire.utils.misc import get_local_tensor_if_DTensor
+
+ try:
+     from megatron.core import parallel_state
+ except ImportError:
+     parallel_state = None
+     print("Megatron-core is not installed.")
+
+
+ if TYPE_CHECKING:
+     from imaginaire.config import Config
+     from imaginaire.model import ImaginaireModel
+     from imaginaire.trainer import ImaginaireTrainer
+
+
+ class CallBackGroup:
+     """A class for hosting a collection of callback objects.
+
+     It is used to execute callback functions of multiple callback objects with the same method name.
+     When callbackgroup.func(args) is executed, internally it loops through the objects in self._callbacks and runs
+     self._callbacks[0].func(args), self._callbacks[1].func(args), etc. The method name and arguments should match.
+
+     Attributes:
+         _callbacks (list[Callback]): List of callback objects.
+     """
+
+     def __init__(self, config: Config, trainer: ImaginaireTrainer) -> None:
+         """Initializes the list of callback objects.
+
+         Args:
+             config (Config): The config object for the Imaginaire codebase.
+             trainer (ImaginaireTrainer): The main trainer.
+         """
+         self._callbacks = []
+         callback_configs = config.trainer.callbacks
+         if callback_configs:
+             if isinstance(callback_configs, (list, omegaconf.listconfig.ListConfig)):
+                 warnings.warn(
+                     "The 'config.trainer.callbacks' parameter should be a dict instead of a list. "
+                     "Please update your code.",
+                     DeprecationWarning,
+                     stacklevel=2,
+                 )
+                 callback_configs = {f"callback_{i}": v for i, v in enumerate(callback_configs)}
+             for callback_name, current_callback_cfg in callback_configs.items():
+                 if "_target_" not in current_callback_cfg:
+                     log.critical(
+                         f"Callback {callback_name} is missing the '_target_' field. Skipping {current_callback_cfg}"
+                     )
+                     continue
+                 log.critical(f"Instantiating callback {callback_name}: {current_callback_cfg}")
+                 _callback = instantiate(current_callback_cfg)
+                 assert isinstance(_callback, Callback), f"{current_callback_cfg} is not a valid callback."
+                 _callback.config = config
+                 _callback.trainer = trainer
+                 self._callbacks.append(_callback)
+
+     def __getattr__(self, method_name: str) -> Callable:
+         """Loops through the callback objects to call the corresponding callback function.
+
+         Args:
+             method_name (str): Callback method name.
+         """
+
+         def multi_callback_wrapper(*args, **kwargs) -> None:
+             for callback in self._callbacks:
+                 assert hasattr(callback, method_name)
+                 method = getattr(callback, method_name)
+                 assert callable(method)
+                 _ = method(*args, **kwargs)
+
+         return multi_callback_wrapper
+
+
+ class Callback:
+     """The base class for all callbacks.
+
+     All callbacks should inherit from this class and adhere to the established method names and signatures.
+     """
+
+     def __init__(self, config: Config | None = None, trainer: ImaginaireTrainer | None = None):
+         """Initializes a Callback object.
+
+         Args:
+             config (Optional[Config]): The configuration object for the Imaginaire codebase, if available.
+             trainer (Optional[ImaginaireTrainer]): The main trainer handling the training loop, if available.
+
+         Notes:
+             The config and trainer parameters are optional to maintain backward compatibility.
+             In future releases, these parameters will be removed. Upon using these parameters, a deprecation
+             warning will be issued.
+         """
+         if config is not None or trainer is not None:
+             warnings.warn(
+                 "The 'config' and 'trainer' parameters are deprecated and will be removed in a future release. "
+                 "Please update your code to create Callback instances without these parameters.",
+                 DeprecationWarning,
+                 stacklevel=2,
+             )
+         del config, trainer
+
+     def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         pass
+
+     def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
+         """
+         Called before the training step, for each batch. This is paired with on_training_step_end(), but note that
+         when using gradient accumulation, while on_training_step_end() is only called when the optimizer is updated,
+         this function is called for every batch.
+         Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
+         for every batch, albeit with the same iteration number.
+         """
+         pass
+
+     def on_training_step_batch_start(
+         self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
+     ) -> None:
+         """
+         Called before the training step, for each batch, similarly to on_training_step_start(). This function is paired with
+         on_training_step_batch_end(), and both functions are called for every batch even when using gradient accumulation.
+         Note that the iteration is only updated when the optimizer is updated, and therefore it may be the same for
+         multiple invocations.
+         """
+         pass
+
+     def on_before_forward(self, iteration: int = 0) -> None:
+         pass
+
+     def on_after_forward(self, iteration: int = 0) -> None:
+         pass
+
+     def on_before_backward(
+         self, model_ddp: distributed.DistributedDataParallel, loss: torch.Tensor, iteration: int = 0
+     ) -> None:
+         pass
+
+     def on_after_backward(self, model_ddp: distributed.DistributedDataParallel, iteration: int = 0) -> None:
+         pass
+
+     def on_before_dataloading(self, iteration: int = 0) -> None:
+         pass
+
+     def on_after_dataloading(self, iteration: int = 0) -> None:
+         pass
+
+     def on_optimizer_init_start(self) -> None:
+         pass
+
+     def on_optimizer_init_end(self) -> None:
+         pass
+
+     def on_before_optimizer_step(
+         self,
+         model_ddp: distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         grad_scaler: torch.amp.GradScaler,
+         iteration: int = 0,
+     ) -> None:
+         pass
+
+     def on_before_zero_grad(
+         self,
+         model_ddp: distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         iteration: int = 0,
+     ) -> None:
+         pass
+
+     def on_training_step_batch_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         """
+         Called at the end of a training step for every batch, even when using gradient accumulation.
+         This is paired with on_training_step_batch_start(). Note that the iteration is only updated when the
+         optimizer is updated, and therefore it may be the same for multiple batches.
+         """
+         pass
+
+     def on_training_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         """
+         Called at the end of a training step, but note that when using gradient accumulation, this is only called
+         when the optimizer is updated and the iteration incremented, whereas on_training_step_start is called every time.
+         Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
+         for every batch.
+         """
+         pass
+
+     def on_validation_start(
+         self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0
+     ) -> None:
+         pass
+
+     def on_validation_step_start(
+         self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
+     ) -> None:
+         pass
+
+     def on_validation_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         pass
+
+     def on_validation_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         pass
+
+     def on_load_checkpoint_start(self, model: ImaginaireModel) -> None:
+         pass
+
+     def on_load_checkpoint_end(
+         self, model: ImaginaireModel, iteration: int = 0, checkpoint_path: str | None = None
+     ) -> None:
+         pass
+
+     def on_load_checkpoint(self, model: ImaginaireModel, state_dict: dict[str, Any]) -> None:
+         pass
+
+     def on_save_checkpoint_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         """
+         Called when checkpoint saving is about to start.
+         """
+         pass
+
+     def on_save_checkpoint_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         """
+         Called when the synchronous part of checkpointing is finished. This function can be used
+         along with on_save_checkpoint_start() to measure the exposed (synchronous) checkpoint time.
+         Note that for asynchronous checkpoints, the checkpoint may still be ongoing, so this function
+         does not mean the checkpoint is finished in the asynchronous case; use on_save_checkpoint_success()
+         for that.
+         """
+         pass
+
+     def on_save_checkpoint_success(self, iteration: int = 0, elapsed_time: float = 0) -> None:
+         """
+         Called when checkpoint saving has fully finished and succeeded. Not called if the checkpoint failed.
+         For synchronous checkpoints, it is called at the same time as on_save_checkpoint_end(), but for asynchronous
+         checkpoints, it is called after the asynchronous part has also finished. For checkpointers with out-of-process
+         checkpointing, this function is called as soon as the notification is received from the checkpointer process,
+         which may not be immediately after the checkpoint has completed but later on. Therefore, if you need to measure
+         the full checkpoint duration for the asynchronous part, use the elapsed_time parameter; do not measure it directly,
+         as this would be a significant overestimate.
+         """
+         pass
+
+     def on_save_checkpoint(self, model: ImaginaireModel, state_dict: dict[str, Any]) -> None:
+         pass
+
+     def on_train_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         pass
+
+     def on_app_end(self) -> None:
+         pass
+
+
+ class EMAModelCallback(Callback):
+     """The callback class for tracking EMA model weights."""
+
+     def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         # Set up the EMA model weight tracker.
+         if model.config.ema.enabled:
+             assert hasattr(model, "ema"), "EMA should be initialized from ImaginaireModel"
+             # EMA model must be kept in FP32 precision.
+             model.ema = model.ema.to(dtype=torch.float32)
+         else:
+             assert not hasattr(model, "ema"), "There should be no EMA initialized."
+
+     def on_training_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         # Update the EMA model with the new regular weights.
+         if model.config.ema.enabled:
+             model.ema.update_average(model, iteration)
+
+
+ class ProgressBarCallback(Callback):
+     """The callback class for visualizing the training/validation progress bar in the console."""
+
+     @distributed.rank0_only
+     def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         self.train_pbar = tqdm.trange(self.config.trainer.max_iter, initial=iteration, desc="Training")
+
+     @distributed.rank0_only
+     def on_training_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         self.train_pbar.update()
+
+     @distributed.rank0_only
+     def on_validation_start(
+         self, model: ImaginaireModel, dataloader_val: torch.utils.data.DataLoader, iteration: int = 0
+     ) -> None:
+         if self.config.trainer.max_val_iter is not None:
+             num_iter = self.config.trainer.max_val_iter
+         else:
+             num_iter = len(dataloader_val)
+         assert num_iter is not None and num_iter > 0, f"Invalid number of validation iterations: {num_iter}"
+         self.val_pbar = tqdm.trange(num_iter, desc="Validating", position=1, leave=False)
+
+     @distributed.rank0_only
+     def on_validation_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         self.val_pbar.update()
+
+     @distributed.rank0_only
+     def on_validation_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         self.val_pbar.close()
+
+     @distributed.rank0_only
+     def on_train_end(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         self.trainer.checkpointer.finalize()
+         self.train_pbar.close()
+
+
+ class IterationLoggerCallback(Callback):
+     """The callback class for logging the per-iteration time and loss to the console."""
+
+     @distributed.rank0_only
+     def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         # self.train_pbar = tqdm.trange(self.config.trainer.max_iter, initial=iteration, desc="Training")
+         self.start_iteration_time = time.time()
+         self.elapsed_iteration_time = 0
+
+     @distributed.rank0_only
+     def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
+         self.start_iteration_time = time.time()
+
+     @distributed.rank0_only
+     def on_training_step_end(
+         self,
+         model: ImaginaireModel,
+         data_batch: dict[str, torch.Tensor],
+         output_batch: dict[str, torch.Tensor],
+         loss: torch.Tensor,
+         iteration: int = 0,
+     ) -> None:
+         self.elapsed_iteration_time += time.time() - self.start_iteration_time
+
+         if iteration % self.config.trainer.logging_iter == 0:
+             avg_time = self.elapsed_iteration_time / self.config.trainer.logging_iter
+             log.info(f"Iteration: {iteration}, average iter time: {avg_time:.2f}, total loss {loss.item():.4f}")
+
+             self.elapsed_iteration_time = 0
+
+
+ class LowPrecisionCallback(Callback):
+     """The callback class handling low-precision training.
+
+     A config field with a non-primitive type is difficult to override, so the callback gets the
+     precision from model.precision instead. It is also automatically disabled when using fp32.
+     """
+
+     def __init__(self, config: Config, trainer: ImaginaireTrainer, update_iter: int):
+         self.update_iter = update_iter
+
+     def on_train_start(self, model: ImaginaireModel, iteration: int = 0) -> None:
+         assert model.precision in [
+             torch.bfloat16,
+             torch.float16,
+             torch.half,
+         ], "LowPrecisionCallback must use a low precision dtype."
+         self.precision_type = model.precision
+
+     def on_training_step_start(self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0) -> None:
+         for k, v in data.items():
+             if isinstance(v, torch.Tensor) and torch.is_floating_point(data[k]):
+                 data[k] = v.to(dtype=self.precision_type)
+
+     def on_validation_step_start(
+         self, model: ImaginaireModel, data: dict[str, torch.Tensor], iteration: int = 0
+     ) -> None:
+         for k, v in data.items():
+             if isinstance(v, torch.Tensor) and torch.is_floating_point(data[k]):
+                 data[k] = v.to(dtype=self.precision_type)
+
+     def on_before_zero_grad(
+         self,
+         model_ddp: distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         iteration: int = 0,
+     ) -> None:
+         if iteration % self.update_iter == 0:
+             if getattr(optimizer, "master_weights", False):
+                 params, master_params = [], []
+                 for group, group_master in zip(optimizer.param_groups, optimizer.param_groups_master, strict=False):
+                     for p, p_master in zip(group["params"], group_master["params"], strict=False):
+                         params.append(get_local_tensor_if_DTensor(p.data))
+                         master_params.append(p_master.data)
+                 torch._foreach_copy_(params, master_params)
+
+
+ class NVTXCallback(Callback):
+     """The callback for creating NVTX ranges"""
+
+     def __init__(
+         self,
+         synchronize: bool = False,
+         config: Config | None = None,
+         trainer: ImaginaireTrainer | None = None,
+     ):
+         super().__init__(config, trainer)
+         self.synchronize = synchronize
+
+     def on_before_forward(self, iteration: int = 0) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_push("forward")
+
+     def on_after_forward(self, iteration: int = 0) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_pop()
+
+     def on_before_backward(
+         self, model_ddp: distributed.DistributedDataParallel, loss: torch.Tensor, iteration: int = 0
+     ) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_push("backward")
+
+     def on_after_backward(self, model_ddp: distributed.DistributedDataParallel, iteration: int = 0) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_pop()
+
+     def on_before_optimizer_step(
+         self,
+         model_ddp: distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         grad_scaler: torch.amp.GradScaler,
+         iteration: int = 0,
+     ) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_push("optimizer_step")
+
+     def on_before_zero_grad(
+         self,
+         model_ddp: distributed.DistributedDataParallel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         iteration: int = 0,
+     ) -> None:
+         if self.synchronize:
+             torch.cuda.synchronize()
+         torch.cuda.nvtx.range_pop()
+
+     def on_before_dataloading(self, iteration: int = 0) -> None:
+         torch.cuda.nvtx.range_push("dataloading")
+
+     def on_after_dataloading(self, iteration: int = 0) -> None:
+         torch.cuda.nvtx.range_pop()
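A configuration sketch (illustrative; the entry names and the synchronize flag value are arbitrary) showing how callbacks from this module could be registered through config.trainer.callbacks, which CallBackGroup instantiates via each entry's _target_:

    config.trainer.callbacks = {
        "iter_logger": {"_target_": "imaginaire.utils.callback.IterationLoggerCallback"},
        "nvtx": {"_target_": "imaginaire.utils.callback.NVTXCallback", "synchronize": True},
    }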
imaginaire/utils/checkpointer.py ADDED
@@ -0,0 +1,282 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import os
+ import threading
+ from typing import TYPE_CHECKING, NamedTuple
+
+ import torch
+ import torch.distributed as dist
+ from torch import nn
+
+ from imaginaire.model import ImaginaireModel
+ from imaginaire.utils import callback, distributed, log, misc
+ from imaginaire.utils.parallelism import ModelWrapper
+
+ if TYPE_CHECKING:
+     from imaginaire.config import CheckpointConfig, JobConfig
+
+
+ class Checkpointer:
+     """The checkpointer class. Supports checkpoint saving/loading to local disk."""
+
+     def __init__(self, config_checkpoint: CheckpointConfig, config_job: JobConfig, callbacks: callback.CallBackGroup):
+         """Constructor of the checkpointer.
+
+         Args:
+             config_checkpoint (CheckpointConfig): The config object for the checkpointer.
+             config_job (JobConfig): The config object for the job.
+             callbacks (callback.CallBackGroup): The callback group to notify about checkpoint events.
+         """
+         # Set the callback functions.
+         self.callbacks = callbacks
+         self.checkpoint_dir_local = f"{config_job.path_local}/checkpoints"
+         self.strict_resume = config_checkpoint.strict_resume
+         self.load_path = config_checkpoint.load_path or None
+         self.load_training_state = config_checkpoint.load_training_state
+         self.only_load_scheduler_state = config_checkpoint.only_load_scheduler_state
+         self.save_thread = None
+
+     def save(
+         self,
+         model: ImaginaireModel,
+         optimizer: torch.optim.Optimizer,
+         scheduler: torch.optim.lr_scheduler.LRScheduler,
+         grad_scaler: torch.amp.GradScaler,
+         iteration: int,
+     ) -> None:
+         """Save network weights, optimizer parameters, and scheduler parameters to a checkpoint.
+
+         Args:
+             model (ImaginaireModel): The PyTorch model.
+             optimizer (torch.optim.Optimizer): The model optimizer.
+             scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
+             grad_scaler (torch.amp.GradScaler): The gradient scaler (for mixed precision training).
+             iteration (int): Current iteration number.
+         """
+         self.callbacks.on_save_checkpoint_start(model, iteration)
+
+         checkpoint_file = f"iter_{iteration:09}.pt"
+
+         if distributed.get_rank() == 0:
+             state_dict = dict(
+                 model=model.state_dict(),
+                 optimizer=optimizer.state_dict(),
+                 scheduler=scheduler.state_dict(),
+                 grad_scaler=grad_scaler.state_dict(),
+                 iteration=iteration,
+             )
+             state_dict = misc.to(state_dict, device="cpu")
+             self.callbacks.on_save_checkpoint(model, state_dict=state_dict)
+             # Wait for the previous saver thread to end.
+             if self.save_thread:
+                 self.save_thread.join()
+             # Run the checkpoint saver in a separate thread.
+             self.save_thread = threading.Thread(
+                 target=self._save_worker_local,
+                 daemon=False,
+                 args=(state_dict, checkpoint_file, distributed.get_rank()),
+             )
+             self.save_thread.start()
+
+         # Note: Checkpoints are saved on a separate thread, so this callback is not accurate.
+         # Please check logs from on_save_checkpoint_success() for better accuracy.
+         self.callbacks.on_save_checkpoint_end(model=None, iteration=iteration)
+
+     @misc.timer("checkpoint saving (local)")
+     def _save_worker_local(self, state_dict: dict[str, torch.Tensor], checkpoint_file: str, rank: int = 0) -> None:
+         """Worker to save checkpoint to local disk, spawned with a child thread (runs in parallel with the training).
+
+         Args:
+             state_dict (dict[str, torch.Tensor]): The state dict of the model/optimizer/scheduler.
+             checkpoint_file (str): The file name of the model checkpoint.
+             rank (int): GPU device (default: 0).
+         """
+         checkpoint_path = os.path.join(self.checkpoint_dir_local, checkpoint_file)
+         os.makedirs(self.checkpoint_dir_local, exist_ok=True)
+         try:
+             torch.save(state_dict, checkpoint_path)
+             if rank == 0:
+                 self._write_latest_checkpoint_file(checkpoint_file)
+             log.success(f"Saved checkpoint (local): {checkpoint_path}")
+             iteration = int(checkpoint_file.replace("iter_", "").replace(".pt", ""))
+             self.callbacks.on_save_checkpoint_success(iteration=iteration)
+         except Exception as e:
+             log.exception(f"Checkpoint failed to save (local): {e}")
+
+ @misc.timer("checkpoint loading")
120
+ def load(
121
+ self,
122
+ model: ImaginaireModel,
123
+ optimizer: torch.optim.Optimizer | None = None,
124
+ scheduler: torch.optim.lr_scheduler.LRScheduler | None = None,
125
+ grad_scaler: torch.amp.GradScaler | None = None,
126
+ ) -> int:
127
+ """Load network weights and optimizer states from a checkpoint in a single process.
128
+
129
+ The priority of the checkpoint loading logic is:
130
+ 1. Attempt to resume training if possible by looking for latest_checkpoint.txt in the checkpoint directory.
131
+ 2. If no latest checkpoint is found, load the model weights specified by config_checkpoint.load_path.
132
+ - This is typically used for inference mode.
133
+ - If config_checkpoint.load_training_state is True, then also load the optimizer and scheduler states.
134
+ 3. If none of the above, randomly initialize the model parameters and train from scratch.
135
+
136
+ Args:
137
+ model (ImaginaireModel): The PyTorch model.
138
+ optimizer (torch.optim.Optimizer | None): The model optimizer (default: None).
139
+ scheduler (torch.optim.lr_scheduler.LRScheduler | None): The optimization scheduler (default: None).
140
+ grad_scaler (torch.amp.GradScaler | None): The gradient scaler (for mixed precision training).
141
+
142
+ Returns:
143
+ iteration (int): the iteration number to start/resume from.
144
+ """
145
+ self.callbacks.on_load_checkpoint_start(model)
146
+
147
+ latest_checkpoint_file = self._read_latest_checkpoint_file()
148
+ if latest_checkpoint_file is not None:
149
+ # 1. Resume training from latest_checkpoint.txt in the checkpoint directory.
150
+ checkpoint_dir = self.checkpoint_dir_local
151
+ checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint_file)
152
+ resume = True
153
+ only_resume_scheduler = True
154
+ else:
155
+ if self.load_path:
156
+ # 2. Load the model weights specified by config_checkpoint.load_path.
157
+ checkpoint_path = self.load_path
158
+ resume = self.load_training_state
159
+ only_resume_scheduler = self.only_load_scheduler_state
160
+ else:
161
+ # 3. Randomly initialize the model parameters and train from scratch.
162
+ checkpoint_path = None
163
+ resume = False
164
+ only_resume_scheduler = False
165
+ # Load checkpoint.
166
+ if checkpoint_path is not None:
167
+ self._check_checkpoint_exists(checkpoint_path)
168
+ log.info(f"Loading checkpoint (local): {checkpoint_path}")
169
+ state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
170
+ log.success(f"Complete loading checkpoint (local): {checkpoint_path}")
171
+ self.callbacks.on_load_checkpoint(model, state_dict=state_dict)
172
+ # Load the state dicts.
173
+ log.info("- Loading the model...")
174
+ model.load_state_dict(state_dict["model"], strict=self.strict_resume)
175
+ if resume or only_resume_scheduler:
176
+ iteration = state_dict["iteration"]
177
+ assert scheduler
178
+ log.info("- Loading the scheduler...")
179
+ scheduler.load_state_dict(state_dict["scheduler"])
180
+ scheduler.last_epoch = iteration
181
+ else:
182
+ iteration = 0
183
+ if resume:
184
+ assert optimizer
185
+ log.info("- Loading the optimizer...")
186
+ optimizer.load_state_dict(state_dict["optimizer"])
187
+ log.info("- Loading the gradient scaler...")
188
+ grad_scaler.load_state_dict(state_dict["grad_scaler"])
189
+ log.success(f"Done with loading the checkpoint (iteration {iteration}).")
190
+ else:
191
+ log.success("Done with loading the checkpoint.")
192
+ else:
193
+ # Checkpoint not found and not specified. We will train everything from scratch.
194
+ iteration = 0
195
+ log.info("Training from scratch.")
196
+ torch.cuda.empty_cache()
197
+
198
+ self.callbacks.on_load_checkpoint_end(model, iteration=iteration, checkpoint_path=checkpoint_path)
199
+
200
+ return iteration
201
+
202
+ def _read_latest_checkpoint_file(self) -> str | None:
203
+ """Get the file name of the latest saved checkpoint. If it doesn't exist, return None.
204
+
205
+ Returns:
206
+ checkpoint_file (str | None): file name of the latest saved checkpoint.
207
+ """
208
+ checkpoint_file = None
209
+ latest_path = os.path.join(self.checkpoint_dir_local, "latest_checkpoint.txt")
210
+ if os.path.isfile(latest_path):
211
+ with open(latest_path) as file:
+ checkpoint_file = file.read().strip()
212
+ return checkpoint_file
213
+
214
+ def _write_latest_checkpoint_file(self, checkpoint_file: str) -> None:
215
+ """Track the file name of the latest saved checkpoint.
216
+
217
+ Args:
218
+ checkpoint_file (str): file name of the latest saved checkpoint.
219
+ """
220
+ content = f"{checkpoint_file}\n"
221
+ latest_path = os.path.join(self.checkpoint_dir_local, "latest_checkpoint.txt")
222
+ with open(latest_path, "w") as file:
223
+ file.write(content)
224
+
225
+ def _check_checkpoint_exists(self, checkpoint_path: str) -> None:
226
+ """If the file checkpoint_path does not exist, raise an error.
227
+
228
+ Args:
229
+ checkpoint_path (str): full path to the checkpoint.
230
+ """
231
+ if not os.path.exists(checkpoint_path):
232
+ raise FileNotFoundError(f"File not found (local): {checkpoint_path}")
233
+
234
+ def finalize(self) -> None:
235
+ """Finalize the checkpointer."""
236
+ if self.save_thread:
237
+ self.save_thread.join()
238
+
239
+
240
+ class _IncompatibleKeys(
241
+ NamedTuple(
242
+ "IncompatibleKeys",
243
+ [
244
+ ("missing_keys", list[str]),
245
+ ("unexpected_keys", list[str]),
246
+ ("incorrect_shapes", list[tuple[str, tuple[int], tuple[int]]]),
247
+ ],
248
+ )
249
+ ):
250
+ pass
251
+
252
+
253
+ def load_checkpoint(
254
+ model_parts: list[nn.Module],
255
+ ckpt_dir,
256
+ model_ckpt_key_map: dict[str, str] = {}, # noqa: B006
257
+ ):
258
+ log.info(f"Loading checkpoint from {ckpt_dir}.")
259
+
260
+ _model_wrapper = ModelWrapper(model_parts)
261
+ state_dict = _model_wrapper.state_dict()
262
+ # remove _extra_state
263
+ state_dict = {k: v for k, v in state_dict.items() if not k.endswith("._extra_state")}
264
+
265
+ # remap keys if needed
266
+ if model_ckpt_key_map:
267
+ for model_key, checkpoint_key in model_ckpt_key_map.items():
268
+ state_dict[checkpoint_key] = state_dict.pop(model_key)
269
+ log.info(f"Re-mapping {model_key} to {checkpoint_key}")
270
+
271
+ fs_storage_reader = dist.checkpoint.FileSystemReader(ckpt_dir)
272
+ dist.checkpoint.load(state_dict=state_dict, storage_reader=fs_storage_reader)
273
+
274
+ # inverse the remapping if needed
275
+ if model_ckpt_key_map:
276
+ for model_key, checkpoint_key in model_ckpt_key_map.items():
277
+ state_dict[model_key] = state_dict.pop(checkpoint_key)
278
+ log.info(f"Inverse re-mapping {checkpoint_key} to {model_key}")
279
+
280
+ _model_wrapper.load_state_dict(state_dict)
281
+
282
+ log.info(f"Finished loading checkpoint from {ckpt_dir}.")
imaginaire/utils/config_helper.py ADDED
@@ -0,0 +1,201 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import importlib
17
+ import os
18
+ import pkgutil
19
+ import sys
20
+ from dataclasses import fields as dataclass_fields
21
+ from dataclasses import is_dataclass
22
+ from typing import Any
23
+
24
+ import attr
25
+ import attrs
26
+ from hydra import compose, initialize
27
+ from hydra.core.config_store import ConfigStore
28
+ from hydra.core.global_hydra import GlobalHydra
29
+ from omegaconf import DictConfig, OmegaConf
30
+
31
+ from imaginaire.config import Config
32
+ from imaginaire.utils import log
33
+
34
+
35
+ def is_attrs_or_dataclass(obj) -> bool:
36
+ """
37
+ Check if the object is an instance of an attrs class or a dataclass.
38
+
39
+ Args:
40
+ obj: The object to check.
41
+
42
+ Returns:
43
+ bool: True if the object is an instance of an attrs class or a dataclass, False otherwise.
44
+ """
45
+ return is_dataclass(obj) or attr.has(type(obj))
46
+
47
+
48
+ def get_fields(obj):
49
+ """
50
+ Get the fields of an attrs class or a dataclass.
51
+
52
+ Args:
53
+ obj: The object to get fields from. Must be an instance of an attrs class or a dataclass.
54
+
55
+ Returns:
56
+ list: A list of field names.
57
+
58
+ Raises:
59
+ ValueError: If the object is neither an attrs class nor a dataclass.
60
+ """
61
+ if is_dataclass(obj):
62
+ return [field.name for field in dataclass_fields(obj)]
63
+ elif attr.has(type(obj)):
64
+ return [field.name for field in attr.fields(type(obj))]
65
+ else:
66
+ raise ValueError("The object is neither an attrs class nor a dataclass.")
67
+
68
+
69
+ def override(config: Config, overrides: list[str] | None = None) -> Config:
70
+ """
71
+ :param config: the instance of class `Config` (usually from `make_config`)
72
+ :param overrides: list of overrides for config
73
+ :return: the composed instance of class `Config`
74
+ """
75
+ # Store the class of the config for reconstruction after overriding.
76
+ # config_class = type(config)
77
+
78
+ # Convert Config object to a DictConfig object
79
+ config_dict = attrs.asdict(config)
80
+ config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})
81
+ # Enforce "--" separator between the script arguments and overriding configs.
82
+ if overrides:
83
+ if overrides[0] != "--":
84
+ raise ValueError('Hydra config overrides must be separated with a "--" token.')
85
+ overrides = overrides[1:]
86
+ # Use Hydra to handle overrides
87
+ cs = ConfigStore.instance()
88
+ cs.store(name="config", node=config_omegaconf)
89
+ if not GlobalHydra().is_initialized():
90
+ with initialize(version_base=None):
91
+ config_omegaconf = compose(config_name="config", overrides=overrides)
92
+ OmegaConf.resolve(config_omegaconf)
93
+ else:
94
+ config_omegaconf = compose(config_name="config", overrides=overrides)
95
+ OmegaConf.resolve(config_omegaconf)
96
+
97
+ def config_from_dict(ref_instance: Any, kwargs: Any) -> Any:
98
+ """
99
+ Construct an instance of the same type as ref_instance from the provided dictionary, primitive, or unstructured data
100
+
101
+ Args:
102
+ ref_instance: The reference instance to determine the type and fields when needed
103
+ kwargs: A dictionary of keyword arguments for constructing the new instance, or primitive/unstructured data to pass through
104
+
105
+ Returns:
106
+ Any: A new instance of the same type as ref_instance constructed from kwargs, or the primitive/unstructured data itself
107
+
108
+ Raises:
109
+ AssertionError: If the fields do not match or if extra keys are found.
110
+ Exception: If there is an error constructing the new instance.
111
+ """
112
+ is_type = is_attrs_or_dataclass(ref_instance)
113
+ if not is_type:
114
+ return kwargs
115
+ else:
116
+ ref_fields = set(get_fields(ref_instance))
117
+ assert isinstance(kwargs, (dict, DictConfig)), (
118
+ "kwargs must be a dictionary or a DictConfig"
119
+ )
120
+ keys = set(kwargs.keys())
121
+
122
+ # ref_fields must be equal to, or a superset of, keys
123
+ extra_keys = keys - ref_fields
124
+ assert ref_fields == keys or keys.issubset(ref_fields), (
125
+ f"Fields mismatch: {ref_fields} != {keys}. Extra keys found: {extra_keys} \n \t when constructing {type(ref_instance)} with {keys}"
126
+ )
127
+
128
+ resolved_kwargs: dict[str, Any] = {}
129
+ for f in keys:
130
+ resolved_kwargs[f] = config_from_dict(getattr(ref_instance, f), kwargs[f])
131
+ try:
132
+ new_instance = type(ref_instance)(**resolved_kwargs)
133
+ except Exception as e:
134
+ log.error(f"Error when constructing {type(ref_instance)} with {resolved_kwargs}")
135
+ log.error(e)
136
+ raise e
137
+ return new_instance
138
+
139
+ config = config_from_dict(config, config_omegaconf)
140
+
141
+ return config
142
+
143
+
144
+ def get_config_module(config_file: str) -> str:
145
+ if not config_file.endswith(".py"):
146
+ log.error("Config file cannot be specified as module.")
147
+ log.error("Please provide the path to the Python config file (relative to the Imaginaire4 root).")
148
+ assert os.path.isfile(config_file), f"Imaginaire4 config file ({config_file}) not found."
149
+ # Convert to importable module format.
150
+ config_module = config_file.replace("/", ".").replace(".py", "")
151
+ return config_module
152
+
153
+
154
+ def import_all_modules_from_package(package_path: str, reload: bool = False, skip_underscore: bool = True) -> None:
155
+ """
156
+ Import all modules from the specified package path recursively.
157
+
158
+ This function is typically used in conjunction with Hydra to ensure that all modules
159
+ within a specified package are imported, which is necessary for registering configurations.
160
+
161
+ Example usage:
162
+ ```python
163
+ import_all_modules_from_package("projects.cosmos.diffusion.v1.config.experiment", reload=True, skip_underscore=False)
164
+ ```
165
+
166
+ Args:
167
+ package_path (str): The dotted path to the package from which to import all modules.
168
+ reload (bool): Flag to determine whether to reload modules if they're already imported.
169
+ skip_underscore (bool): If True, skips importing modules that start with an underscore.
170
+ """
171
+ log.critical(f"{'Reloading' if reload else 'Importing'} all modules from package {package_path}")
172
+ package = importlib.import_module(package_path)
173
+ package_directory = package.__path__
174
+
175
+ def import_modules_recursively(directory: str, prefix: str) -> None:
176
+ """
177
+ Recursively imports or reloads all modules in the given directory.
178
+
179
+ Args:
180
+ directory (str): The file system path to the current package directory.
181
+ prefix (str): The module prefix (e.g., 'projects.cosmos.diffusion.v1.config').
182
+ """
183
+ for _, module_name, is_pkg in pkgutil.iter_modules([directory]):
184
+ if skip_underscore and module_name.startswith("_"):
185
+ log.debug(f"Skipping module {module_name} as it starts with an underscore")
186
+ continue
187
+
188
+ full_module_name = f"{prefix}.{module_name}"
189
+ log.debug(f"{'Reloading' if reload else 'Importing'} module {full_module_name}")
190
+
191
+ if full_module_name in sys.modules and reload:
192
+ importlib.reload(sys.modules[full_module_name])
193
+ else:
194
+ importlib.import_module(full_module_name)
195
+
196
+ if is_pkg:
197
+ sub_package_directory = os.path.join(directory, module_name)
198
+ import_modules_recursively(sub_package_directory, full_module_name)
199
+
200
+ for directory in package_directory:
201
+ import_modules_recursively(directory, package_path)
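The `override()` helper above flattens an attrs config into a `DictConfig`, lets Hydra apply dotted-path overrides, and then rebuilds the typed config via `config_from_dict`. A toy round trip of the same idea; `TrainerConfig`/`OptimizerConfig` are hypothetical stand-ins, and a plain dotlist merge substitutes for the Hydra compose step:

```python
import attrs
from omegaconf import OmegaConf


@attrs.define
class OptimizerConfig:
    lr: float = 1e-4


@attrs.define
class TrainerConfig:
    max_iter: int = 1000
    optimizer: OptimizerConfig = attrs.field(factory=OptimizerConfig)


config = TrainerConfig()
# attrs -> untyped DictConfig, apply overrides, then back to the typed class.
conf = OmegaConf.create(attrs.asdict(config))
conf.merge_with_dotlist(["optimizer.lr=0.0003", "max_iter=5000"])
rebuilt = TrainerConfig(
    max_iter=conf.max_iter,
    optimizer=OptimizerConfig(**conf.optimizer),
)
print(rebuilt)  # TrainerConfig(max_iter=5000, optimizer=OptimizerConfig(lr=0.0003))
```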
imaginaire/utils/device.py ADDED
@@ -0,0 +1,39 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ import os
18
+
19
+ import pynvml
20
+
21
+
22
+ class Device:
23
+ _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) # type: ignore
24
+
25
+ def __init__(self, device_idx: int):
26
+ super().__init__()
27
+ self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
28
+
29
+ def get_name(self) -> str:
30
+ return pynvml.nvmlDeviceGetName(self.handle)
31
+
32
+ def get_cpu_affinity(self) -> list[int]:
33
+ affinity_string = ""
34
+ for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
35
+ # assume nvml returns list of 64 bit ints
36
+ affinity_string = f"{j:064b}" + affinity_string
37
+ affinity_list = [int(x) for x in affinity_string]
38
+ affinity_list.reverse() # so core 0 is in 0th element of list
39
+ return [i for i, e in enumerate(affinity_list) if e != 0]
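`get_cpu_affinity()` unpacks NVML's array of 64-bit affinity masks into a list of CPU core indices. The bit manipulation can be checked without a GPU by feeding in a fake mask, as in this sketch:

```python
# Pretend NVML returned a single 64-bit word with cores 0, 1, and 3 set.
masks = [0b1011]

affinity_string = ""
for word in masks:
    affinity_string = f"{word:064b}" + affinity_string
affinity_list = [int(bit) for bit in affinity_string]
affinity_list.reverse()  # bit 0 (core 0) ends up at index 0
cores = [i for i, set_bit in enumerate(affinity_list) if set_bit]
assert cores == [0, 1, 3]
```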
imaginaire/utils/distributed.py ADDED
@@ -0,0 +1,444 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ import collections
19
+ import collections.abc
20
+ import ctypes
21
+ import functools
22
+ import os
23
+ from collections.abc import Callable, Container
24
+ from contextlib import contextmanager
25
+ from datetime import timedelta
26
+ from typing import TYPE_CHECKING, Any
27
+
28
+ import pynvml
29
+ import torch
30
+ import torch.distributed as dist
31
+ from torch.distributed import get_process_group_ranks
32
+
33
+ from imaginaire.utils.device import Device
34
+
35
+ if dist.is_available():
36
+ from torch.distributed.distributed_c10d import _get_default_group
37
+ from torch.distributed.utils import _sync_module_states, _verify_param_shape_across_processes
38
+
39
+ from imaginaire.utils import log
40
+
41
+ if TYPE_CHECKING:
42
+ from imaginaire.config import DDPConfig
43
+
44
+ try:
45
+ from megatron.core import parallel_state
46
+ except ImportError:
47
+ print("Megatron-core is not installed.")
48
+
49
+
50
+ def init() -> int | None:
51
+ """Initialize distributed training."""
52
+ if dist.is_initialized():
53
+ return torch.cuda.current_device()
54
+
55
+ # Set GPU affinity.
56
+ pynvml.nvmlInit()
57
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
58
+ try:
59
+ device = Device(local_rank)
60
+ os.sched_setaffinity(0, device.get_cpu_affinity())
61
+ except (OSError, pynvml.NVMLError) as e:
62
+ log.warning(f"Failed to set device affinity: {e}")
63
+ # Set up NCCL communication.
64
+ os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "0"
65
+ os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
66
+ if dist.is_available():
67
+ torch.cuda.set_device(local_rank)
68
+ # Get the timeout value from environment variable
69
+ timeout_seconds = os.getenv("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", 1800)
70
+ # Convert the timeout to an integer (if it isn't already) and then to a timedelta
71
+ timeout_timedelta = timedelta(seconds=int(timeout_seconds))
72
+ dist.init_process_group(backend="nccl", init_method="env://", timeout=timeout_timedelta)
73
+ log.info(
74
+ f"Initialized distributed training with local rank {local_rank} with timeout {timeout_seconds}",
75
+ rank0_only=False,
76
+ )
77
+ # Increase the L2 fetch granularity for faster speed.
78
+ _libcudart = ctypes.CDLL("libcudart.so")
79
+ # Set device limit on the current device.
80
+ p_value = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
81
+ _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
82
+ _libcudart.cudaDeviceGetLimit(p_value, ctypes.c_int(0x05))
83
+ log.info(f"Training with {get_world_size()} GPUs.")
84
+
85
+
86
+ def get_rank(group: dist.ProcessGroup | None = None) -> int:
87
+ """Get the rank (GPU device) of the worker.
88
+
89
+ Returns:
90
+ rank (int): The rank of the worker.
91
+ """
92
+ rank = 0
93
+ if dist.is_available() and dist.is_initialized():
94
+ rank = dist.get_rank(group)
95
+ return rank
96
+
97
+
98
+ def get_world_size(group: dist.ProcessGroup | None = None) -> int:
99
+ """Get world size. How many GPUs are available in this job.
100
+
101
+ Returns:
102
+ world_size (int): The total number of GPUs available in this job.
103
+ """
104
+ world_size = 1
105
+ if dist.is_available() and dist.is_initialized():
106
+ world_size = dist.get_world_size(group)
107
+ return world_size
108
+
109
+
110
+ def is_rank0() -> bool:
111
+ """Check if current process is the master GPU.
112
+
113
+ Returns:
114
+ (bool): True if this function is called from the master GPU, else False.
115
+ """
116
+ return get_rank() == 0
117
+
118
+
119
+ def is_local_rank0() -> bool:
120
+ """Check if current process is the local master GPU in the current node.
121
+
122
+ Returns:
123
+ (bool): True if this function is called from the local master GPU, else False.
124
+ """
125
+ return torch.cuda.current_device() == 0
126
+
127
+
128
+ def rank0_only(func: Callable) -> Callable:
129
+ """Apply this function only to the master GPU.
130
+
131
+ Example usage:
132
+ @rank0_only
133
+ def func(x):
134
+ return x + 3
135
+
136
+ Args:
137
+ func (Callable): a function.
138
+
139
+ Returns:
140
+ (Callable): A function wrapper executing the function only on the master GPU.
141
+ """
142
+
143
+ @functools.wraps(func)
144
+ def wrapper(*args, **kwargs):
145
+ if is_rank0():
146
+ return func(*args, **kwargs)
147
+ else:
148
+ return None
149
+
150
+ return wrapper
151
+
152
+
153
+ def barrier() -> None:
154
+ """Barrier for all GPUs."""
155
+ if dist.is_available() and dist.is_initialized():
156
+ dist.barrier()
157
+
158
+
159
+ def rank0_first(func: Callable) -> Callable:
160
+ """run the function on rank 0 first, then on other ranks."""
161
+
162
+ @functools.wraps(func)
163
+ def wrapper(*args, **kwargs):
164
+ if is_rank0():
165
+ result = func(*args, **kwargs)
166
+ barrier()
167
+ if not is_rank0():
168
+ result = func(*args, **kwargs)
169
+ return result
170
+
171
+ return wrapper
172
+
173
+
174
+ def parallel_model_wrapper(config_ddp: DDPConfig, model: torch.nn.Module) -> torch.nn.Module | DistributedDataParallel:
175
+ """Wraps the model to enable data parallalism for training across multiple GPU devices.
176
+
177
+ Args:
178
+ config_ddp (DDPConfig): The data parallel config.
179
+ model (torch.nn.Module): The PyTorch module.
180
+
181
+ Returns:
182
+ model (torch.nn.Module | DistributedDataParallel): The data parallel model wrapper
183
+ if distributed environment is available, otherwise return the original model.
184
+ """
185
+ if dist.is_available() and dist.is_initialized():
186
+ local_rank = int(os.getenv("LOCAL_RANK", 0))
187
+ try:
188
+ ddp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
189
+ except Exception as e:
190
+ log.info(e)
191
+ log.info("parallel_state not initialized, treating all GPUs equally for DDP")
192
+ ddp_group = None
193
+
194
+ model = DistributedDataParallel(
195
+ model,
196
+ device_ids=[local_rank],
197
+ output_device=local_rank,
198
+ find_unused_parameters=config_ddp.find_unused_parameters,
199
+ static_graph=config_ddp.static_graph,
200
+ broadcast_buffers=config_ddp.broadcast_buffers,
201
+ process_group=ddp_group,
202
+ )
203
+ return model
204
+
205
+
206
+ class DistributedDataParallel(torch.nn.parallel.DistributedDataParallel):
207
+ """This extends torch.nn.parallel.DistributedDataParallel with .training_step().
208
+
209
+ This borrows the concept of `forward-redirection` from Pytorch lightning. It wraps an ImaginaireModel such that
210
+ model.training_step() would be executed when calling self.training_step(), while preserving the behavior of calling
211
+ model() for Pytorch modules. Internally, this is a double rerouting mechanism (training_step -> forward ->
212
+ training_step), allowing us to preserve the function names and signatures.
213
+ """
214
+
215
+ def __init__(self, model: torch.nn.Module, *args, **kwargs):
216
+ super().__init__(model, *args, **kwargs)
217
+ self.show_sync_grad_static_graph_warning = True
218
+
219
+ def training_step(self, *args, **kwargs) -> Any:
220
+ # Cache the original model.forward() method.
221
+ original_forward = self.module.forward
222
+
223
+ def wrapped_training_step(*_args, **_kwargs):
224
+ # Unpatch immediately before calling training_step() because itself may want to call the real forward.
225
+ self.module.forward = original_forward
226
+ # The actual .training_step().
227
+ return self.module.training_step(*_args, **_kwargs)
228
+
229
+ # Patch the original_module's forward so we can redirect the arguments back to the real method.
230
+ self.module.forward = wrapped_training_step
231
+ # Call self, which implicitly calls self.forward() --> model.forward(), which is now model.training_step().
232
+ # Without calling self.forward() or model.forward() explicitly, implicit hooks are also executed.
233
+ return self(*args, **kwargs)
234
+
235
+
236
+ @contextmanager
237
+ def ddp_sync_grad(model, enabled):
238
+ r"""
239
+ Context manager to enable/disable gradient synchronizations across DDP processes for DDP model.
240
+ Modified from:
241
+ https://pytorch.org/docs/stable/_modules/torch/nn/parallel/distributed.html#DistributedDataParallel.no_sync
242
+ Note that this is incompatible with static_graph=True and will be a no-op if static_graph=True.
243
+
244
+ Within this context, gradients will be accumulated on module
245
+ variables, which will later be synchronized in the first
246
+ forward-backward pass exiting the context.
247
+
248
+ .. warning::
249
+ The forward pass should be included inside the context manager, or
250
+ else gradients will still be synchronized.
251
+ """
252
+ assert isinstance(model, torch.nn.Module)
253
+ if isinstance(model, DistributedDataParallel):
254
+ old_require_backward_grad_sync = model.require_backward_grad_sync
255
+ if model.static_graph and model.require_backward_grad_sync != enabled:
256
+ if model.show_sync_grad_static_graph_warning:
257
+ log.warning("DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced.")
258
+ model.show_sync_grad_static_graph_warning = False
259
+ else:
260
+ model.require_backward_grad_sync = enabled
261
+ try:
262
+ yield
263
+ finally:
264
+ if isinstance(model, DistributedDataParallel):
265
+ model.require_backward_grad_sync = old_require_backward_grad_sync
266
+
267
+
268
+ def collate_batches(data_batches: list[dict[str, torch.Tensor]]) -> torch.Tensor | dict[str, torch.Tensor]:
269
+ """Aggregate the list of data batches from all devices and process the results.
270
+
271
+ This is used for gathering validation data batches with imaginaire.utils.dataloader.DistributedEvalSampler.
272
+ It will return the data/output of the entire validation set in its original index order. The sizes of data_batches
273
+ in different ranks may differ by 1 (if dataset size is not evenly divisible), in which case a dummy sample will be
274
+ created before calling dis.all_gather().
275
+
276
+ Args:
277
+ data_batches (list[dict[str, torch.Tensor]]): List of tensors or (hierarchical) dictionary where
278
+ leaf entries are tensors.
279
+
280
+ Returns:
281
+ data_gather (torch.Tensor | dict[str, torch.Tensor]): tensors or (hierarchical) dictionary where
282
+ leaf entries are concatenated tensors.
283
+ """
284
+ if isinstance(data_batches[0], torch.Tensor):
285
+ # Concatenate the local data batches.
286
+ data_concat = torch.cat(data_batches, dim=0) # type: ignore
287
+ # Get the largest number of local samples from all ranks to determine whether to dummy-pad on this rank.
288
+ max_num_local_samples = torch.tensor(len(data_concat), device="cuda")
289
+ dist.all_reduce(max_num_local_samples, op=dist.ReduceOp.MAX)
290
+ if len(data_concat) < max_num_local_samples:
291
+ assert len(data_concat) + 1 == max_num_local_samples
292
+ dummy = torch.empty_like(data_concat[:1])
293
+ data_concat = torch.cat([data_concat, dummy], dim=0)
294
+ dummy_count = torch.tensor(1, device="cuda")
295
+ else:
296
+ dummy_count = torch.tensor(0, device="cuda")
297
+ # Get all concatenated batches from all ranks and concatenate again.
298
+ dist.all_reduce(dummy_count, op=dist.ReduceOp.SUM)
299
+ data_concat = all_gather_tensor(data_concat.contiguous())
300
+ data_collate = torch.stack(data_concat, dim=1).flatten(start_dim=0, end_dim=1)
301
+ # Remove the dummy samples.
302
+ if dummy_count > 0:
303
+ data_collate = data_collate[:-dummy_count]
304
+ elif isinstance(data_batches[0], collections.abc.Mapping):
305
+ data_collate = dict()
306
+ for key in data_batches[0].keys():
307
+ data_collate[key] = collate_batches([data[key] for data in data_batches]) # type: ignore
308
+ else:
309
+ raise TypeError
310
+ return data_collate
311
+
312
+
313
+ @torch.no_grad()
314
+ def all_gather_tensor(tensor: torch.Tensor) -> list[torch.Tensor]:
315
+ """Gather the corresponding tensor from all GPU devices to a list.
316
+
317
+ Args:
318
+ tensor (torch.Tensor): Pytorch tensor.
319
+
320
+ Returns:
321
+ tensor_list (list[torch.Tensor]): A list of Pytorch tensors gathered from all GPU devices.
322
+ """
323
+ tensor_list = [torch.zeros_like(tensor) for _ in range(get_world_size())]
324
+ dist.all_gather(tensor_list, tensor)
325
+ return tensor_list
326
+
327
+
328
+ def broadcast(tensor, src, group=None, async_op=False):
329
+ world_size = get_world_size()
330
+ if world_size < 2:
331
+ return tensor
332
+ dist.broadcast(tensor, src=src, group=group, async_op=async_op)
333
+
334
+
335
+ def dist_reduce_tensor(tensor, rank=0, reduce="mean"):
336
+ r"""Reduce to rank 0"""
337
+ world_size = get_world_size()
338
+ if world_size < 2:
339
+ return tensor
340
+ with torch.no_grad():
341
+ dist.reduce(tensor, dst=rank)
342
+ if get_rank() == rank:
343
+ if reduce == "mean":
344
+ tensor /= world_size
345
+ elif reduce == "sum":
346
+ pass
347
+ else:
348
+ raise NotImplementedError
349
+ return tensor
350
+
351
+
352
+ def sync_model_states(
353
+ model: torch.nn.Module,
354
+ process_group: dist.ProcessGroup | None = None,
355
+ src: int = 0,
356
+ params_and_buffers_to_ignore: Container[str] | None = None,
357
+ broadcast_buffers: bool = True,
358
+ ):
359
+ """
360
+ Modified from the DDP source code.
361
+ Synchronizes the parameters and buffers of a model across different processes in a distributed setting.
362
+
363
+ This function ensures that all processes in the specified process group have the same initial parameters and
364
+ buffers from the source rank, typically rank 0. It is useful when different processes start with different model
365
+ states and a synchronization is required to ensure consistency across all ranks.
366
+
367
+ Args:
368
+ model (nn.Module): The model whose parameters and buffers are to be synchronized.
369
+ process_group (dist.ProcessGroup, optional): The process group for communication. If None,
370
+ the default group is used. Defaults to None.
371
+ src (int, optional): The source rank from which parameters and buffers will be broadcasted.
372
+ Defaults to 0.
373
+ params_and_buffers_to_ignore (Optional[Container[str]], optional): A container of parameter and buffer
374
+ names to exclude from synchronization. Defaults to None, which means all parameters and buffers are
375
+ included.
376
+ broadcast_buffers (bool, optional): Whether to broadcast buffers or not. Defaults to True.
377
+
378
+ Side Effects:
379
+ This function modifies the state of the model in-place to synchronize it with the source rank's model state.
380
+
381
+ Raises:
382
+ RuntimeError: If the shapes of parameters across processes do not match, a runtime error will be raised.
383
+
384
+ Examples:
385
+ >>> # downloading duplicated model weights from s3 in each rank and save network bandwidth
386
+ >>> # useful and save our time when model weights are huge
387
+ >>> if dist.get_rank == 0:
388
+ >>> model.load_state_dict(network_bound_weights_download_fn(s3_weights_path))
389
+ >>> dist.barrir()
390
+ >>> sync_model_states(model) # sync rank0 weights to other ranks
391
+ """
392
+ if not dist.is_available() or not dist.is_initialized():
393
+ return
394
+ if process_group is None:
395
+ process_group = _get_default_group()
396
+ if not params_and_buffers_to_ignore:
397
+ params_and_buffers_to_ignore = set()
398
+
399
+ log.info(
400
+ f"Synchronizing model states from rank {src} to all ranks in process group {get_process_group_ranks(process_group)}."
401
+ )
402
+
403
+ # Build tuple of (module, parameter) for all parameters that require grads.
404
+ modules_and_parameters = [
405
+ (module, parameter)
406
+ for module_name, module in model.named_modules()
407
+ for parameter in [
408
+ param
409
+ # Note that we access module.named_parameters instead of
410
+ # parameters(module). parameters(module) is only needed in the
411
+ # single-process multi device case, where it accesses replicated
412
+ # parameters through _former_parameters.
413
+ for param_name, param in module.named_parameters(recurse=False)
414
+ if f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
415
+ # if param.requires_grad
416
+ # and f"{module_name}.{param_name}" not in params_and_buffers_to_ignore
417
+ ]
418
+ ]
419
+
420
+ # Deduplicate any parameters that might be shared across child modules.
421
+ memo = set()
422
+ modules_and_parameters = [
423
+ # "p not in memo" is the deduplication check.
424
+ # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
425
+ (m, p)
426
+ for m, p in modules_and_parameters
427
+ if p not in memo and not memo.add(p) # type: ignore[func-returns-value]
428
+ ]
429
+
430
+ # Build list of parameters.
431
+ parameters = [parameter for _, parameter in modules_and_parameters]
432
+ if len(parameters) == 0:
433
+ return
434
+
435
+ _verify_param_shape_across_processes(process_group, parameters)
436
+
437
+ _sync_module_states(
438
+ module=model,
439
+ process_group=process_group,
440
+ broadcast_bucket_size=(250 * 1024 * 1024),
441
+ src=src,
442
+ params_and_buffers_to_ignore=params_and_buffers_to_ignore,
443
+ broadcast_buffers=broadcast_buffers,
444
+ )
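A typical use of `ddp_sync_grad` is gradient accumulation: skip the all-reduce on every micro-batch except the last one in each window. A hedged sketch follows; with a plain (non-DDP) module the context manager is a no-op, so it also runs single-process:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from imaginaire.utils.distributed import ddp_sync_grad

model = torch.nn.Linear(8, 1)  # stand-in for a DDP-wrapped model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
data = DataLoader(TensorDataset(torch.randn(16, 8), torch.randn(16, 1)), batch_size=4)

accum_steps = 2
for step, (x, y) in enumerate(data):
    last_micro_batch = (step + 1) % accum_steps == 0
    # Gradients are only synchronized across ranks on the last micro-batch.
    with ddp_sync_grad(model, enabled=last_micro_batch):
        loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
        loss.backward()
    if last_micro_batch:
        optimizer.step()
        optimizer.zero_grad()
```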
imaginaire/utils/easy_io/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
imaginaire/utils/easy_io/backends/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend
17
+ from imaginaire.utils.easy_io.backends.http_backend import HTTPBackend
18
+ from imaginaire.utils.easy_io.backends.local_backend import LocalBackend
19
+ from imaginaire.utils.easy_io.backends.registry_utils import backends, prefix_to_backends, register_backend
20
+
21
+ __all__ = [
22
+ "BaseStorageBackend",
23
+ "HTTPBackend",
24
+ "LocalBackend",
25
+ "backends",
26
+ "prefix_to_backends",
27
+ "register_backend",
28
+ ]
imaginaire/utils/easy_io/backends/base_backend.py ADDED
@@ -0,0 +1,60 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import os.path as osp
18
+ from abc import ABCMeta, abstractmethod
19
+
20
+
21
+ def mkdir_or_exist(dir_name, mode=0o777):
22
+ if dir_name == "":
23
+ return
24
+ dir_name = osp.expanduser(dir_name)
25
+ os.makedirs(dir_name, mode=mode, exist_ok=True)
26
+
27
+
28
+ def has_method(obj, method):
29
+ return hasattr(obj, method) and callable(getattr(obj, method))
30
+
31
+
32
+ class BaseStorageBackend(metaclass=ABCMeta):
33
+ """Abstract class of storage backends.
34
+
35
+ All backends need to implement two APIs: :meth:`get()` and
36
+ :meth:`get_text()`.
37
+
38
+ - :meth:`get()` reads the file as a byte stream.
39
+ - :meth:`get_text()` reads the file as texts.
40
+ """
41
+
42
+ # a flag to indicate whether the backend can create a symlink for a file
43
+ # This attribute will be deprecated in the future.
44
+ _allow_symlink = False
45
+
46
+ @property
47
+ def allow_symlink(self):
48
+ return self._allow_symlink
49
+
50
+ @property
51
+ def name(self):
52
+ return self.__class__.__name__
53
+
54
+ @abstractmethod
55
+ def get(self, filepath):
56
+ pass
57
+
58
+ @abstractmethod
59
+ def get_text(self, filepath):
60
+ pass
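Since `get()` and `get_text()` are the only abstract methods, a concrete backend can be very small. A sketch of a hypothetical in-memory backend (not part of the library) that also exercises the inherited `name` property:

```python
from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend


class InMemoryBackend(BaseStorageBackend):
    """Toy backend that serves canned bytes instead of touching storage."""

    def __init__(self, store: dict[str, bytes] | None = None):
        self.store = store or {}

    def get(self, filepath) -> bytes:
        return self.store[str(filepath)]

    def get_text(self, filepath, encoding: str = "utf-8") -> str:
        return self.get(filepath).decode(encoding)


backend = InMemoryBackend({"a.txt": b"hello world"})
assert backend.name == "InMemoryBackend"
assert backend.get_text("a.txt") == "hello world"
```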
imaginaire/utils/easy_io/backends/http_backend.py ADDED
@@ -0,0 +1,91 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import tempfile
18
+ from collections.abc import Generator
19
+ from contextlib import contextmanager
20
+ from pathlib import Path
21
+ from urllib.request import urlopen
22
+
23
+ from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend
24
+
25
+
26
+ class HTTPBackend(BaseStorageBackend):
27
+ """HTTP and HTTPS storage bachend."""
28
+
29
+ def get(self, filepath: str) -> bytes:
30
+ """Read bytes from a given ``filepath``.
31
+
32
+ Args:
33
+ filepath (str): Path to read data.
34
+
35
+ Returns:
36
+ bytes: Expected bytes object.
37
+
38
+ Examples:
39
+ >>> backend = HTTPBackend()
40
+ >>> backend.get('http://path/of/file')
41
+ b'hello world'
42
+ """
43
+ return urlopen(filepath).read()
44
+
45
+ def get_text(self, filepath, encoding="utf-8") -> str:
46
+ """Read text from a given ``filepath``.
47
+
48
+ Args:
49
+ filepath (str): Path to read data.
50
+ encoding (str): The encoding format used to open the ``filepath``.
51
+ Defaults to 'utf-8'.
52
+
53
+ Returns:
54
+ str: Expected text reading from ``filepath``.
55
+
56
+ Examples:
57
+ >>> backend = HTTPBackend()
58
+ >>> backend.get_text('http://path/of/file')
59
+ 'hello world'
60
+ """
61
+ return urlopen(filepath).read().decode(encoding)
62
+
63
+ @contextmanager
64
+ def get_local_path(self, filepath: str) -> Generator[str | Path, None, None]:
65
+ """Download a file from ``filepath`` to a local temporary directory,
66
+ and return the temporary path.
67
+
68
+ ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
69
+ can be called with a ``with`` statement, and when exiting from the
70
+ ``with`` statement, the temporary path will be released.
71
+
72
+ Args:
73
+ filepath (str): Download a file from ``filepath``.
74
+
75
+ Yields:
76
+ Iterable[str]: Only yield one temporary path.
77
+
78
+ Examples:
79
+ >>> backend = HTTPBackend()
80
+ >>> # After exiting from the ``with`` clause,
81
+ >>> # the path will be removed
82
+ >>> with backend.get_local_path('http://path/of/file') as path:
83
+ ... # do something here
84
+ """
85
+ try:
86
+ f = tempfile.NamedTemporaryFile(delete=False)
87
+ f.write(self.get(filepath))
88
+ f.close()
89
+ yield f.name
90
+ finally:
91
+ os.remove(f.name)
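Usage is symmetric with the other backends: `get()` returns raw bytes, while `get_local_path()` materializes them as a temporary file that is deleted on exit from the `with` block. A short sketch with a placeholder URL (requires network access):

```python
from imaginaire.utils.easy_io.backends.http_backend import HTTPBackend

backend = HTTPBackend()
with backend.get_local_path("http://example.com/") as path:
    print(path)  # temporary file path; removed once the with block exits
```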
imaginaire/utils/easy_io/backends/local_backend.py ADDED
@@ -0,0 +1,551 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import io
17
+ import os
18
+ import os.path as osp
19
+ import shutil
20
+ from collections.abc import Generator, Iterator
21
+ from contextlib import contextmanager
22
+ from pathlib import Path
23
+
24
+ from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend, mkdir_or_exist
25
+
26
+
27
+ class LocalBackend(BaseStorageBackend):
28
+ """Raw local storage backend."""
29
+
30
+ _allow_symlink = True
31
+
32
+ def get(self, filepath: str | Path) -> bytes:
33
+ """Read bytes from a given ``filepath`` with 'rb' mode.
34
+
35
+ Args:
36
+ filepath (str or Path): Path to read data.
37
+
38
+ Returns:
39
+ bytes: Expected bytes object.
40
+
41
+ Examples:
42
+ >>> backend = LocalBackend()
43
+ >>> filepath = '/path/of/file'
44
+ >>> backend.get(filepath)
45
+ b'hello world'
46
+ """
47
+ with open(filepath, "rb") as f:
48
+ value = f.read()
49
+ return value
50
+
51
+ def get_text(self, filepath: str | Path, encoding: str = "utf-8") -> str:
52
+ """Read text from a given ``filepath`` with 'r' mode.
53
+
54
+ Args:
55
+ filepath (str or Path): Path to read data.
56
+ encoding (str): The encoding format used to open the ``filepath``.
57
+ Defaults to 'utf-8'.
58
+
59
+ Returns:
60
+ str: Expected text reading from ``filepath``.
61
+
62
+ Examples:
63
+ >>> backend = LocalBackend()
64
+ >>> filepath = '/path/of/file'
65
+ >>> backend.get_text(filepath)
66
+ 'hello world'
67
+ """
68
+ with open(filepath, encoding=encoding) as f:
69
+ text = f.read()
70
+ return text
71
+
72
+ def put(self, obj: bytes | io.BytesIO, filepath: str | Path) -> None:
73
+ """Write bytes to a given ``filepath`` with 'wb' mode.
74
+
75
+ Note:
76
+ ``put`` will create a directory if the directory of
77
+ ``filepath`` does not exist.
78
+
79
+ Args:
80
+ obj (bytes): Data to be written.
81
+ filepath (str or Path): Path to write data.
82
+
83
+ Examples:
84
+ >>> backend = LocalBackend()
85
+ >>> filepath = '/path/of/file'
86
+ >>> backend.put(b'hello world', filepath)
87
+ """
88
+ mkdir_or_exist(osp.dirname(filepath))
89
+ if isinstance(obj, io.BytesIO):
90
+ obj.seek(0)
91
+ obj = obj.getvalue()
92
+ with open(filepath, "wb") as f:
93
+ f.write(obj)
94
+
95
+ def put_text(self, obj: str, filepath: str | Path, encoding: str = "utf-8") -> None:
96
+ """Write text to a given ``filepath`` with 'w' mode.
97
+
98
+ Note:
99
+ ``put_text`` will create a directory if the directory of
100
+ ``filepath`` does not exist.
101
+
102
+ Args:
103
+ obj (str): Data to be written.
104
+ filepath (str or Path): Path to write data.
105
+ encoding (str): The encoding format used to open the ``filepath``.
106
+ Defaults to 'utf-8'.
107
+
108
+ Examples:
109
+ >>> backend = LocalBackend()
110
+ >>> filepath = '/path/of/file'
111
+ >>> backend.put_text('hello world', filepath)
112
+ """
113
+ mkdir_or_exist(osp.dirname(filepath))
114
+ with open(filepath, "w", encoding=encoding) as f:
115
+ f.write(obj)
116
+
117
+ def exists(self, filepath: str | Path) -> bool:
118
+ """Check whether a file path exists.
119
+
120
+ Args:
121
+ filepath (str or Path): Path to be checked whether exists.
122
+
123
+ Returns:
124
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
125
+
126
+ Examples:
127
+ >>> backend = LocalBackend()
128
+ >>> filepath = '/path/of/file'
129
+ >>> backend.exists(filepath)
130
+ True
131
+ """
132
+ return osp.exists(filepath)
133
+
134
+ def isdir(self, filepath: str | Path) -> bool:
135
+ """Check whether a file path is a directory.
136
+
137
+ Args:
138
+ filepath (str or Path): Path to be checked whether it is a
139
+ directory.
140
+
141
+ Returns:
142
+ bool: Return ``True`` if ``filepath`` points to a directory,
143
+ ``False`` otherwise.
144
+
145
+ Examples:
146
+ >>> backend = LocalBackend()
147
+ >>> filepath = '/path/of/dir'
148
+ >>> backend.isdir(filepath)
149
+ True
150
+ """
151
+ return osp.isdir(filepath)
152
+
153
+ def isfile(self, filepath: str | Path) -> bool:
154
+ """Check whether a file path is a file.
155
+
156
+ Args:
157
+ filepath (str or Path): Path to be checked whether it is a file.
158
+
159
+ Returns:
160
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
161
+ otherwise.
162
+
163
+ Examples:
164
+ >>> backend = LocalBackend()
165
+ >>> filepath = '/path/of/file'
166
+ >>> backend.isfile(filepath)
167
+ True
168
+ """
169
+ return osp.isfile(filepath)
170
+
171
+ def join_path(self, filepath: str | Path, *filepaths: str | Path) -> str:
172
+ r"""Concatenate all file paths.
173
+
174
+ Join one or more filepath components intelligently. The return value
175
+ is the concatenation of filepath and any members of \*filepaths.
176
+
177
+ Args:
178
+ filepath (str or Path): Path to be concatenated.
179
+
180
+ Returns:
181
+ str: The result of concatenation.
182
+
183
+ Examples:
184
+ >>> backend = LocalBackend()
185
+ >>> filepath1 = '/path/of/dir1'
186
+ >>> filepath2 = 'dir2'
187
+ >>> filepath3 = 'path/of/file'
188
+ >>> backend.join_path(filepath1, filepath2, filepath3)
189
+ '/path/of/dir/dir2/path/of/file'
190
+ """
191
+ return osp.join(filepath, *filepaths)
192
+
193
+ @contextmanager
194
+ def get_local_path(self, filepath: str) -> Generator[str, None, None]:
195
+ """Download data from filepath to local path with a context manager.
196
+
197
+ If filepath exists on the local host, it just returns filepath.
198
+ If filepath doesn't exist on the local host, it will download the data
199
+ to a local path and return that path; the path will be removed
200
+ after exiting the ``with`` statement.
201
+
202
+ Args:
203
+ filepath (str): Path to read data from.
204
+
205
+ Yields:
206
+ str: Local path.
207
+
208
+ Examples:
209
+ >>> with backend.get_local_path('http://example.com/abc.jpg') as path:
210
+ ... # do something here
211
+ """
212
+ yield filepath
213
+
214
+ def copyfile(
215
+ self,
216
+ src: str | Path,
217
+ dst: str | Path,
218
+ ) -> str:
219
+ """Copy a file src to dst and return the destination file.
220
+
221
+ src and dst should have the same prefix. If dst specifies a directory,
222
+ the file will be copied into dst using the base filename from src. If
223
+ dst specifies a file that already exists, it will be replaced.
224
+
225
+ Args:
226
+ src (str or Path): A file to be copied.
227
+ dst (str or Path): Copy file to dst.
228
+
229
+ Returns:
230
+ str: The destination file.
231
+
232
+ Raises:
233
+ SameFileError: If src and dst are the same file, a SameFileError
234
+ will be raised.
235
+
236
+ Examples:
237
+ >>> backend = LocalBackend()
238
+ >>> # dst is a file
239
+ >>> src = '/path/of/file'
240
+ >>> dst = '/path1/of/file1'
241
+ >>> # src will be copied to '/path1/of/file1'
242
+ >>> backend.copyfile(src, dst)
243
+ '/path1/of/file1'
244
+
245
+ >>> # dst is a directory
246
+ >>> dst = '/path1/of/dir'
247
+ >>> # src will be copied to '/path1/of/dir/file'
248
+ >>> backend.copyfile(src, dst)
249
+ '/path1/of/dir/file'
250
+ """
251
+ return shutil.copy(src, dst)
252
+
253
+ def copytree(
254
+ self,
255
+ src: str | Path,
256
+ dst: str | Path,
257
+ ) -> str:
258
+ """Recursively copy an entire directory tree rooted at src to a
259
+ directory named dst and return the destination directory.
260
+
261
+ src and dst should have the same prefix and dst must not already exist.
262
+
263
+ Args:
264
+ src (str or Path): A directory to be copied.
265
+ dst (str or Path): Copy directory to dst.
266
+
267
+ Returns:
268
+ str: The destination directory.
269
+
270
+ Raises:
271
+ FileExistsError: If dst had already existed, a FileExistsError will
272
+ be raised.
273
+
274
+ Examples:
275
+ >>> backend = LocalBackend()
276
+ >>> src = '/path/of/dir1'
277
+ >>> dst = '/path/of/dir2'
278
+ >>> backend.copytree(src, dst)
279
+ '/path/of/dir2'
280
+ """
281
+ return shutil.copytree(src, dst)
282
+
283
+ def copyfile_from_local(
284
+ self,
285
+ src: str | Path,
286
+ dst: str | Path,
287
+ ) -> str:
288
+ """Copy a local file src to dst and return the destination file. Same
289
+ as :meth:`copyfile`.
290
+
291
+ Args:
292
+ src (str or Path): A local file to be copied.
293
+ dst (str or Path): Copy file to dst.
294
+
295
+ Returns:
296
+ str: If dst specifies a directory, the file will be copied into dst
297
+ using the base filename from src.
298
+
299
+ Raises:
300
+ SameFileError: If src and dst are the same file, a SameFileError
301
+ will be raised.
302
+
303
+ Examples:
304
+ >>> backend = LocalBackend()
305
+ >>> # dst is a file
306
+ >>> src = '/path/of/file'
307
+ >>> dst = '/path1/of/file1'
308
+ >>> # src will be copied to '/path1/of/file1'
309
+ >>> backend.copyfile_from_local(src, dst)
310
+ '/path1/of/file1'
311
+
312
+ >>> # dst is a directory
313
+ >>> dst = '/path1/of/dir'
314
+ >>> # src will be copied to '/path1/of/dir/file'
315
+ >>> backend.copyfile_from_local(src, dst)
316
+ '/path1/of/dir/file'
317
+ """
318
+ return self.copyfile(src, dst)
319
+
320
+ def copytree_from_local(
321
+ self,
322
+ src: str | Path,
323
+ dst: str | Path,
324
+ ) -> str:
325
+ """Recursively copy an entire directory tree rooted at src to a
326
+ directory named dst and return the destination directory. Same as
327
+ :meth:`copytree`.
328
+
329
+ Args:
330
+ src (str or Path): A local directory to be copied.
331
+ dst (str or Path): Copy directory to dst.
332
+
333
+ Returns:
334
+ str: The destination directory.
335
+
336
+ Examples:
337
+ >>> backend = LocalBackend()
338
+ >>> src = '/path/of/dir1'
339
+ >>> dst = '/path/of/dir2'
340
+ >>> backend.copytree_from_local(src, dst)
341
+ '/path/of/dir2'
342
+ """
343
+ return self.copytree(src, dst)
344
+
345
+ def copyfile_to_local(
346
+ self,
347
+ src: str | Path,
348
+ dst: str | Path,
349
+ dst_type: str | None = None,
350
+ ) -> str:
351
+ """Copy the file src to local dst and return the destination file. Same
352
+ as :meth:`copyfile`.
353
+
354
+ If dst specifies a directory, the file will be copied into dst using
355
+ the base filename from src. If dst specifies a file that already
356
+ exists, it will be replaced.
357
+
358
+ Args:
359
+ src (str or Path): A file to be copied.
360
+ dst (str or Path): Copy file to local dst.
+ dst_type (str, optional): Ignored by this backend; kept for interface compatibility. Defaults to None.
361
+
362
+ Returns:
363
+ str: If dst specifies a directory, the file will be copied into dst
364
+ using the base filename from src.
365
+
366
+ Examples:
367
+ >>> backend = LocalBackend()
368
+ >>> # dst is a file
369
+ >>> src = '/path/of/file'
370
+ >>> dst = '/path1/of/file1'
371
+ >>> # src will be copied to '/path1/of/file1'
372
+ >>> backend.copyfile_to_local(src, dst)
373
+ '/path1/of/file1'
374
+
375
+ >>> # dst is a directory
376
+ >>> dst = '/path1/of/dir'
377
+ >>> # src will be copied to '/path1/of/dir/file'
378
+ >>> backend.copyfile_to_local(src, dst)
379
+ '/path1/of/dir/file'
380
+ """
381
+ return self.copyfile(src, dst)
382
+
383
+ def copytree_to_local(
384
+ self,
385
+ src: str | Path,
386
+ dst: str | Path,
387
+ ) -> str:
388
+ """Recursively copy an entire directory tree rooted at src to a local
389
+ directory named dst and return the destination directory.
390
+
391
+ Args:
392
+ src (str or Path): A directory to be copied.
393
+ dst (str or Path): Copy directory to local dst.
396
+
397
+ Returns:
398
+ str: The destination directory.
399
+
400
+ Examples:
401
+ >>> backend = LocalBackend()
402
+ >>> src = '/path/of/dir1'
403
+ >>> dst = '/path/of/dir2'
404
+ >>> backend.copytree_from_local(src, dst)
405
+ '/path/of/dir2'
406
+ """
407
+ return self.copytree(src, dst)
408
+
409
+ def remove(self, filepath: str | Path) -> None:
410
+ """Remove a file.
411
+
412
+ Args:
413
+ filepath (str or Path): Path to be removed.
414
+
415
+ Raises:
416
+ IsADirectoryError: If filepath is a directory, an IsADirectoryError
417
+ will be raised.
418
+ FileNotFoundError: If filepath does not exist, a FileNotFoundError
419
+ will be raised.
420
+
421
+ Examples:
422
+ >>> backend = LocalBackend()
423
+ >>> filepath = '/path/of/file'
424
+ >>> backend.remove(filepath)
425
+ """
426
+ if not self.exists(filepath):
427
+ raise FileNotFoundError(f"filepath {filepath} does not exist")
428
+
429
+ if self.isdir(filepath):
430
+ raise IsADirectoryError("filepath should be a file")
431
+
432
+ os.remove(filepath)
433
+
434
+ def rmtree(self, dir_path: str | Path) -> None:
435
+ """Recursively delete a directory tree.
436
+
437
+ Args:
438
+ dir_path (str or Path): A directory to be removed.
439
+
440
+ Examples:
441
+ >>> dir_path = '/path/of/dir'
442
+ >>> backend.rmtree(dir_path)
443
+ """
444
+ shutil.rmtree(dir_path)
445
+
446
+ def copy_if_symlink_fails(
447
+ self,
448
+ src: str | Path,
449
+ dst: str | Path,
450
+ ) -> bool:
451
+ """Create a symbolic link pointing to src named dst.
452
+
453
+ If creating a symbolic link pointing to src fails, src is copied
454
+ directly to dst instead.
455
+
456
+ Args:
457
+ src (str or Path): Create a symbolic link pointing to src.
458
+ dst (str or Path): Create a symbolic link named dst.
459
+
460
+ Returns:
461
+ bool: Return True if a symbolic link pointing to src is created
462
+ successfully. Otherwise, return False.
463
+
464
+ Examples:
465
+ >>> backend = LocalBackend()
466
+ >>> src = '/path/of/file'
467
+ >>> dst = '/path1/of/file1'
468
+ >>> backend.copy_if_symlink_fails(src, dst)
469
+ True
470
+ >>> src = '/path/of/dir'
471
+ >>> dst = '/path1/of/dir1'
472
+ >>> backend.copy_if_symlink_fails(src, dst)
473
+ True
474
+ """
475
+ try:
476
+ os.symlink(src, dst)
477
+ return True
478
+ except Exception:
479
+ if self.isfile(src):
480
+ self.copyfile(src, dst)
481
+ else:
482
+ self.copytree(src, dst)
483
+ return False
484
+
485
+ def list_dir_or_file(
486
+ self,
487
+ dir_path: str | Path,
488
+ list_dir: bool = True,
489
+ list_file: bool = True,
490
+ suffix: str | tuple[str] | None = None,
491
+ recursive: bool = False,
492
+ ) -> Iterator[str]:
493
+ """Scan a directory to find the interested directories or files in
494
+ arbitrary order.
495
+
496
+ Note:
497
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
498
+
499
+ Args:
500
+ dir_path (str or Path): Path of the directory.
501
+ list_dir (bool): List the directories. Defaults to True.
502
+ list_file (bool): List the path of files. Defaults to True.
503
+ suffix (str or tuple[str], optional): File suffix that we are
504
+ interested in. Defaults to None.
505
+ recursive (bool): If set to True, recursively scan the directory.
506
+ Defaults to False.
507
+
508
+ Yields:
509
+ Iterable[str]: A relative path to ``dir_path``.
510
+
511
+ Examples:
512
+ >>> backend = LocalBackend()
513
+ >>> dir_path = '/path/of/dir'
514
+ >>> # list those files and directories in current directory
515
+ >>> for file_path in backend.list_dir_or_file(dir_path):
516
+ ... print(file_path)
517
+ >>> # only list files
518
+ >>> for file_path in backend.list_dir_or_file(dir_path, list_dir=False):
519
+ ... print(file_path)
520
+ >>> # only list directories
521
+ >>> for file_path in backend.list_dir_or_file(dir_path, list_file=False):
522
+ ... print(file_path)
523
+ >>> # only list files ending with specified suffixes
524
+ >>> for file_path in backend.list_dir_or_file(dir_path, suffix='.txt'):
525
+ ... print(file_path)
526
+ >>> # list all files and directory recursively
527
+ >>> for file_path in backend.list_dir_or_file(dir_path, recursive=True):
528
+ ... print(file_path)
529
+ """
530
+ if list_dir and suffix is not None:
531
+ raise TypeError("`suffix` should be None when `list_dir` is True")
532
+
533
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
534
+ raise TypeError("`suffix` must be a string or tuple of strings")
535
+
536
+ root = dir_path
537
+
538
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive):
539
+ for entry in os.scandir(dir_path):
540
+ if not entry.name.startswith(".") and entry.is_file():
541
+ rel_path = osp.relpath(entry.path, root)
542
+ if (suffix is None or rel_path.endswith(suffix)) and list_file:
543
+ yield rel_path
544
+ elif osp.isdir(entry.path):
545
+ if list_dir:
546
+ rel_dir = osp.relpath(entry.path, root)
547
+ yield rel_dir
548
+ if recursive:
549
+ yield from _list_dir_or_file(entry.path, list_dir, list_file, suffix, recursive)
550
+
551
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive)
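
A quick usage sketch of the LocalBackend methods above; the temporary directory and file names are placeholders invented for this example, not part of the library:

# Minimal, hedged sketch exercising LocalBackend.copyfile, list_dir_or_file
# and remove as defined above; all paths are illustrative placeholders.
import tempfile
from pathlib import Path

from imaginaire.utils.easy_io.backends.local_backend import LocalBackend

backend = LocalBackend()
with tempfile.TemporaryDirectory() as tmp:
    src = Path(tmp) / "a.txt"
    src.write_text("hello")
    dst = backend.copyfile(src, Path(tmp) / "b.txt")  # returns the destination path
    print(sorted(backend.list_dir_or_file(tmp, list_dir=False, suffix=".txt")))
    # -> ['a.txt', 'b.txt']
    backend.remove(dst)  # removes files only; a directory raises IsADirectoryError
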
imaginaire/utils/easy_io/backends/registry_utils.py ADDED
@@ -0,0 +1,125 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+
18
+ from imaginaire.utils.easy_io.backends.base_backend import BaseStorageBackend
19
+ from imaginaire.utils.easy_io.backends.http_backend import HTTPBackend
20
+ from imaginaire.utils.easy_io.backends.local_backend import LocalBackend
21
+
22
+ backends: dict = {}
23
+ prefix_to_backends: dict = {}
24
+
25
+
26
+ def _register_backend(
27
+ name: str,
28
+ backend: type[BaseStorageBackend],
29
+ force: bool = False,
30
+ prefixes: str | list | tuple | None = None,
31
+ ):
32
+ """Register a backend.
33
+
34
+ Args:
35
+ name (str): The name of the registered backend.
36
+ backend (BaseStorageBackend): The backend class to be registered,
37
+ which must be a subclass of :class:`BaseStorageBackend`.
38
+ force (bool): Whether to override the backend if the name has already
39
+ been registered. Defaults to False.
40
+ prefixes (str or list[str] or tuple[str], optional): The prefix
41
+ of the registered storage backend. Defaults to None.
42
+ """
43
+ global backends, prefix_to_backends
44
+
45
+ if not isinstance(name, str):
46
+ raise TypeError(f"the backend name should be a string, but got {type(name)}")
47
+
48
+ if not inspect.isclass(backend):
49
+ raise TypeError(f"backend should be a class, but got {type(backend)}")
50
+ if not issubclass(backend, BaseStorageBackend):
51
+ raise TypeError(f"backend {backend} is not a subclass of BaseStorageBackend")
52
+
53
+ if name in backends and not force:
54
+ raise ValueError(
55
+ f'{name} is already registered as a storage backend, add "force=True" if you want to override it'
56
+ )
57
+ backends[name] = backend
58
+
59
+ if prefixes is not None:
60
+ if isinstance(prefixes, str):
61
+ prefixes = [prefixes]
62
+ else:
63
+ assert isinstance(prefixes, (list, tuple))
64
+
65
+ for prefix in prefixes:
66
+ if prefix in prefix_to_backends and not force:
67
+ raise ValueError(
68
+ f'{prefix} is already registered as a storage backend, add "force=True" if you want to override it'
69
+ )
70
+
71
+ prefix_to_backends[prefix] = backend
72
+
73
+
74
+ def register_backend(
75
+ name: str,
76
+ backend: type[BaseStorageBackend] | None = None,
77
+ force: bool = False,
78
+ prefixes: str | list | tuple | None = None,
79
+ ):
80
+ """Register a backend.
81
+
82
+ Args:
83
+ name (str): The name of the registered backend.
84
+ backend (class, optional): The backend class to be registered,
85
+ which must be a subclass of :class:`BaseStorageBackend`.
86
+ When this method is used as a decorator, backend is None.
87
+ Defaults to None.
88
+ force (bool): Whether to override the backend if the name has already
89
+ been registered. Defaults to False.
90
+ prefixes (str or list[str] or tuple[str], optional): The prefix
91
+ of the registered storage backend. Defaults to None.
92
+
93
+ This method can be used as a normal method or a decorator.
94
+
95
+ Examples:
96
+
97
+ >>> class NewBackend(BaseStorageBackend):
98
+ ... def get(self, filepath):
99
+ ... return filepath
100
+ ...
101
+ ... def get_text(self, filepath):
102
+ ... return filepath
103
+ >>> register_backend('new', NewBackend)
104
+
105
+ >>> @register_backend('new')
106
+ ... class NewBackend(BaseStorageBackend):
107
+ ... def get(self, filepath):
108
+ ... return filepath
109
+ ...
110
+ ... def get_text(self, filepath):
111
+ ... return filepath
112
+ """
113
+ if backend is not None:
114
+ _register_backend(name, backend, force=force, prefixes=prefixes)
115
+ return
116
+
117
+ def _register(backend_cls):
118
+ _register_backend(name, backend_cls, force=force, prefixes=prefixes)
119
+ return backend_cls
120
+
121
+ return _register
122
+
123
+
124
+ register_backend("local", LocalBackend, prefixes="")
125
+ register_backend("http", HTTPBackend, prefixes=["http", "https"])
imaginaire/utils/easy_io/easy_io.py ADDED
@@ -0,0 +1,1034 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import json
17
+ import warnings
18
+ from collections.abc import Generator, Iterator
19
+ from contextlib import contextmanager
20
+ from io import BytesIO, StringIO
21
+ from pathlib import Path
22
+ from typing import IO, Any
23
+
24
+ from imaginaire.utils.easy_io.backends import backends, prefix_to_backends
25
+ from imaginaire.utils.easy_io.file_client import FileClient
26
+ from imaginaire.utils.easy_io.handlers import file_handlers
27
+
28
+ backend_instances: dict = {}
29
+
30
+
31
+ def is_filepath(filepath):
32
+ return isinstance(filepath, (str, Path))
33
+
34
+
35
+ def _parse_uri_prefix(uri: str | Path) -> str:
36
+ """Parse the prefix of uri.
37
+
38
+ Args:
39
+ uri (str or Path): Uri to be parsed that contains the file prefix.
40
+
41
+ Examples:
42
+ >>> _parse_uri_prefix('/home/path/of/your/file')
43
+ ''
44
+ >>> _parse_uri_prefix('http://path/of/your/file')
45
+ 'http'
46
+
47
+ Returns:
48
+ str: Return the prefix of uri if the uri contains '://'. Otherwise,
49
+ return ''.
50
+ """
51
+ assert is_filepath(uri)
52
+ uri = str(uri)
53
+ # if uri does not contains '://', the uri will be handled by
54
+ # LocalBackend by default
55
+ if "://" not in uri:
56
+ return ""
57
+ else:
58
+ prefix, _ = uri.split("://")
59
+ if ":" in prefix:
60
+ _, prefix = prefix.split(":")
61
+ return prefix
62
+
63
+
64
+ def _get_file_backend(prefix: str, backend_args: dict):
65
+ """Return a file backend based on the prefix or backend_args.
66
+
67
+ Args:
68
+ prefix (str): Prefix of uri.
69
+ backend_args (dict): Arguments to instantiate the corresponding
70
+ backend.
71
+ """
72
+ # backend name has a higher priority
73
+ if "backend" in backend_args:
74
+ # backend_args should not be modified
75
+ backend_args_bak = backend_args.copy()
76
+ backend_name = backend_args_bak.pop("backend")
77
+ backend = backends[backend_name](**backend_args_bak)
78
+ else:
79
+ backend = prefix_to_backends[prefix](**backend_args)
80
+ return backend
81
+
82
+
83
+ def get_file_backend(
84
+ uri: str | Path | None = None,
85
+ *,
86
+ backend_args: dict | None = None,
87
+ enable_singleton: bool = False,
88
+ backend_key: str | None = None,
89
+ ):
90
+ """Return a file backend based on the prefix of uri or backend_args.
91
+
92
+ Args:
93
+ uri (str or Path): Uri to be parsed that contains the file prefix.
94
+ backend_args (dict, optional): Arguments to instantiate the
95
+ corresponding backend. Defaults to None.
96
+ enable_singleton (bool): Whether to enable the singleton pattern.
97
+ If it is True, the backend created will be reused if the
98
+ signature is same with the previous one. Defaults to False.
99
+ backend_key (str, optional): The key to register the backend. Defaults to None.
100
+
101
+ Returns:
102
+ BaseStorageBackend: Instantiated Backend object.
103
+
104
+ Examples:
105
+ >>> # get file backend based on the prefix of uri
106
+ >>> uri = 'http://path/of/your/file'
107
+ >>> backend = get_file_backend(uri)
108
+ >>> # get file backend based on the backend_args
109
+ >>> backend = get_file_backend(backend_args={'backend': 'http'})
110
+ >>> # backend name has a higher priority if 'backend' in backend_args
111
+ >>> backend = get_file_backend(uri, backend_args={'backend': 'http'})
112
+ """
113
+ global backend_instances
114
+ if backend_key is not None:
115
+ if backend_key in backend_instances:
116
+ return backend_instances[backend_key]
117
+
118
+ if backend_args is None:
119
+ backend_args = {}
120
+
121
+ if uri is None and "backend" not in backend_args and backend_key is None:
122
+ raise ValueError('uri should not be None when "backend" does not exist in backend_args and backend_key is None')
123
+
124
+ if uri is not None:
125
+ prefix = _parse_uri_prefix(uri)
126
+ else:
127
+ prefix = ""
128
+
129
+ if enable_singleton:
130
+ unique_key = f"{prefix}:{json.dumps(backend_args)}"
131
+ if unique_key in backend_instances:
132
+ return backend_instances[unique_key]
133
+
134
+ backend = _get_file_backend(prefix, backend_args)
135
+ backend_instances[unique_key] = backend
136
+ if backend_key is not None:
137
+ backend_instances[backend_key] = backend
138
+ return backend
139
+ else:
140
+ backend = _get_file_backend(prefix, backend_args)
141
+ return backend
142
+
143
+
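
The singleton semantics documented above compose as follows; a minimal sketch assuming a fresh interpreter (so the module-level backend_instances cache starts empty):

# Hedged sketch of get_file_backend's singleton and backend_key caching.
from imaginaire.utils.easy_io.easy_io import get_file_backend

b1 = get_file_backend("/tmp/x", enable_singleton=True, backend_key="local")
assert get_file_backend(backend_key="local") is b1  # fetched by its key
b2 = get_file_backend("/tmp/y", enable_singleton=True)
assert b2 is b1  # same prefix ('') and backend_args -> cached instance
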
144
+ def get(
145
+ filepath: str | Path,
146
+ backend_args: dict | None = None,
147
+ backend_key: str | None = None,
148
+ ) -> bytes:
149
+ """Read bytes from a given ``filepath`` with 'rb' mode.
150
+
151
+ Args:
152
+ filepath (str or Path): Path to read data.
153
+ backend_args (dict, optional): Arguments to instantiate the
154
+ corresponding backend. Defaults to None.
155
+ backend_key (str, optional): The key to get the backend from register.
156
+
157
+ Returns:
158
+ bytes: Expected bytes object.
159
+
160
+ Examples:
161
+ >>> filepath = '/path/of/file'
162
+ >>> get(filepath)
163
+ b'hello world'
164
+ """
165
+ backend = get_file_backend(
166
+ filepath,
167
+ backend_args=backend_args,
168
+ enable_singleton=True,
169
+ backend_key=backend_key,
170
+ )
171
+ return backend.get(filepath)
172
+
173
+
174
+ def get_text(
175
+ filepath: str | Path,
176
+ encoding="utf-8",
177
+ backend_args: dict | None = None,
178
+ backend_key: str | None = None,
179
+ ) -> str:
180
+ """Read text from a given ``filepath`` with 'r' mode.
181
+
182
+ Args:
183
+ filepath (str or Path): Path to read data.
184
+ encoding (str): The encoding format used to open the ``filepath``.
185
+ Defaults to 'utf-8'.
186
+ backend_args (dict, optional): Arguments to instantiate the
187
+ corresponding backend. Defaults to None.
188
+ backend_key (str, optional): The key to get the backend from register.
189
+
190
+ Returns:
191
+ str: Expected text reading from ``filepath``.
192
+
193
+ Examples:
194
+ >>> filepath = '/path/of/file'
195
+ >>> get_text(filepath)
196
+ 'hello world'
197
+ """
198
+ backend = get_file_backend(
199
+ filepath,
200
+ backend_args=backend_args,
201
+ enable_singleton=True,
202
+ backend_key=backend_key,
203
+ )
204
+ return backend.get_text(filepath, encoding)
205
+
206
+
207
+ def put(
208
+ obj: bytes,
209
+ filepath: str | Path,
210
+ backend_args: dict | None = None,
211
+ backend_key: str | None = None,
212
+ ) -> None:
213
+ """Write bytes to a given ``filepath`` with 'wb' mode.
214
+
215
+ Note:
216
+ ``put`` should create a directory if the directory of
217
+ ``filepath`` does not exist.
218
+
219
+ Args:
220
+ obj (bytes): Data to be written.
221
+ filepath (str or Path): Path to write data.
222
+ backend_args (dict, optional): Arguments to instantiate the
223
+ corresponding backend. Defaults to None.
224
+ backend_key (str, optional): The key to get the backend from register.
225
+
226
+ Examples:
227
+ >>> filepath = '/path/of/file'
228
+ >>> put(b'hello world', filepath)
229
+ """
230
+ backend = get_file_backend(
231
+ filepath,
232
+ backend_args=backend_args,
233
+ enable_singleton=True,
234
+ backend_key=backend_key,
235
+ )
236
+ backend.put(obj, filepath)
237
+
238
+
239
+ def put_text(
240
+ obj: str,
241
+ filepath: str | Path,
242
+ backend_args: dict | None = None,
243
+ backend_key: str | None = None,
244
+ ) -> None:
245
+ """Write text to a given ``filepath`` with 'w' mode.
246
+
247
+ Note:
248
+ ``put_text`` should create a directory if the directory of
249
+ ``filepath`` does not exist.
250
+
251
+ Args:
252
+ obj (str): Data to be written.
253
+ filepath (str or Path): Path to write data.
254
+ encoding (str, optional): The encoding format used to open the
255
+ ``filepath``. Defaults to 'utf-8'.
256
+ backend_args (dict, optional): Arguments to instantiate the
257
+ corresponding backend. Defaults to None.
258
+ backend_key (str, optional): The key to get the backend from register.
259
+
260
+ Examples:
261
+ >>> filepath = '/path/of/file'
262
+ >>> put_text('hello world', filepath)
263
+ """
264
+ backend = get_file_backend(
265
+ filepath,
266
+ backend_args=backend_args,
267
+ enable_singleton=True,
268
+ backend_key=backend_key,
269
+ )
270
+ backend.put_text(obj, filepath)
271
+
272
+
273
+ def exists(
274
+ filepath: str | Path,
275
+ backend_args: dict | None = None,
276
+ backend_key: str | None = None,
277
+ ) -> bool:
278
+ """Check whether a file path exists.
279
+
280
+ Args:
281
+ filepath (str or Path): Path to be checked whether exists.
282
+ backend_args (dict, optional): Arguments to instantiate the
283
+ corresponding backend. Defaults to None.
284
+ backend_key (str, optional): The key to get the backend from register.
285
+
286
+ Returns:
287
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
288
+
289
+ Examples:
290
+ >>> filepath = '/path/of/file'
291
+ >>> exists(filepath)
292
+ True
293
+ """
294
+ backend = get_file_backend(
295
+ filepath,
296
+ backend_args=backend_args,
297
+ enable_singleton=True,
298
+ backend_key=backend_key,
299
+ )
300
+ return backend.exists(filepath)
301
+
302
+
303
+ def isdir(
304
+ filepath: str | Path,
305
+ backend_args: dict | None = None,
306
+ backend_key: str | None = None,
307
+ ) -> bool:
308
+ """Check whether a file path is a directory.
309
+
310
+ Args:
311
+ filepath (str or Path): Path to be checked whether it is a
312
+ directory.
313
+ backend_args (dict, optional): Arguments to instantiate the
314
+ corresponding backend. Defaults to None.
315
+ backend_key (str, optional): The key to get the backend from register.
316
+
317
+ Returns:
318
+ bool: Return ``True`` if ``filepath`` points to a directory,
319
+ ``False`` otherwise.
320
+
321
+ Examples:
322
+ >>> filepath = '/path/of/dir'
323
+ >>> isdir(filepath)
324
+ True
325
+ """
326
+ backend = get_file_backend(
327
+ filepath,
328
+ backend_args=backend_args,
329
+ enable_singleton=True,
330
+ backend_key=backend_key,
331
+ )
332
+ return backend.isdir(filepath)
333
+
334
+
335
+ def isfile(
336
+ filepath: str | Path,
337
+ backend_args: dict | None = None,
338
+ backend_key: str | None = None,
339
+ ) -> bool:
340
+ """Check whether a file path is a file.
341
+
342
+ Args:
343
+ filepath (str or Path): Path to be checked whether it is a file.
344
+ backend_args (dict, optional): Arguments to instantiate the
345
+ corresponding backend. Defaults to None.
346
+ backend_key (str, optional): The key to get the backend from register.
347
+
348
+ Returns:
349
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
350
+ otherwise.
351
+
352
+ Examples:
353
+ >>> filepath = '/path/of/file'
354
+ >>> isfile(filepath)
355
+ True
356
+ """
357
+ backend = get_file_backend(
358
+ filepath,
359
+ backend_args=backend_args,
360
+ enable_singleton=True,
361
+ backend_key=backend_key,
362
+ )
363
+ return backend.isfile(filepath)
364
+
365
+
366
+ def join_path(
367
+ filepath: str | Path,
368
+ *filepaths: str | Path,
369
+ backend_args: dict | None = None,
370
+ backend_key: str | None = None,
371
+ ) -> str | Path:
372
+ r"""Concatenate all file paths.
373
+
374
+ Join one or more filepath components intelligently. The return value
375
+ is the concatenation of filepath and any members of \*filepaths.
376
+
377
+ Args:
378
+ filepath (str or Path): Path to be concatenated.
379
+ *filepaths (str or Path): Other paths to be concatenated.
380
+ backend_args (dict, optional): Arguments to instantiate the
381
+ corresponding backend. Defaults to None.
382
+ backend_key (str, optional): The key to get the backend from register.
383
+
384
+ Returns:
385
+ str: The result of concatenation.
386
+
387
+ Examples:
388
+ >>> filepath1 = '/path/of/dir1'
389
+ >>> filepath2 = 'dir2'
390
+ >>> filepath3 = 'path/of/file'
391
+ >>> join_path(filepath1, filepath2, filepath3)
392
+ '/path/of/dir/dir2/path/of/file'
393
+ """
394
+ backend = get_file_backend(
395
+ filepath,
396
+ backend_args=backend_args,
397
+ enable_singleton=True,
398
+ backend_key=backend_key,
399
+ )
400
+ return backend.join_path(filepath, *filepaths)
401
+
402
+
403
+ @contextmanager
404
+ def get_local_path(
405
+ filepath: str | Path,
406
+ backend_args: dict | None = None,
407
+ backend_key: str | None = None,
408
+ ) -> Generator[str | Path, None, None]:
409
+ """Download data from ``filepath`` and write the data to local path.
410
+
411
+ ``get_local_path`` is decorated by :func:`contextlib.contextmanager`. It
412
+ can be called with a ``with`` statement, and when exiting the ``with``
413
+ statement, the temporary path will be released.
414
+
415
+ Note:
416
+ If the ``filepath`` is a local path, just return itself and it will
417
+ not be released (removed).
418
+
419
+ Args:
420
+ filepath (str or Path): Path to be read data.
421
+ backend_args (dict, optional): Arguments to instantiate the
422
+ corresponding backend. Defaults to None.
423
+
424
+ Yields:
425
+ Iterable[str]: Only yield one path.
426
+
427
+ Examples:
428
+ >>> with get_local_path('http://example.com/file.jpg') as path:
429
+ ... # do something here
430
+ """
431
+ backend = get_file_backend(
432
+ filepath,
433
+ backend_args=backend_args,
434
+ enable_singleton=True,
435
+ backend_key=backend_key,
436
+ )
437
+ with backend.get_local_path(str(filepath)) as local_path:
438
+ yield local_path
439
+
440
+
441
+ def copyfile(
442
+ src: str | Path,
443
+ dst: str | Path,
444
+ backend_args: dict | None = None,
445
+ backend_key: str | None = None,
446
+ ) -> str | Path:
447
+ """Copy a file src to dst and return the destination file.
448
+
449
+ src and dst should have the same prefix. If dst specifies a directory,
450
+ the file will be copied into dst using the base filename from src. If
451
+ dst specifies a file that already exists, it will be replaced.
452
+
453
+ Args:
454
+ src (str or Path): A file to be copied.
455
+ dst (str or Path): Copy file to dst.
456
+ backend_args (dict, optional): Arguments to instantiate the
457
+ corresponding backend. Defaults to None.
458
+
459
+ Returns:
460
+ str: The destination file.
461
+
462
+ Raises:
463
+ SameFileError: If src and dst are the same file, a SameFileError will
464
+ be raised.
465
+
466
+ Examples:
467
+ >>> # dst is a file
468
+ >>> src = '/path/of/file'
469
+ >>> dst = '/path1/of/file1'
470
+ >>> # src will be copied to '/path1/of/file1'
471
+ >>> copyfile(src, dst)
472
+ '/path1/of/file1'
473
+
474
+ >>> # dst is a directory
475
+ >>> dst = '/path1/of/dir'
476
+ >>> # src will be copied to '/path1/of/dir/file'
477
+ >>> copyfile(src, dst)
478
+ '/path1/of/dir/file'
479
+ """
480
+ backend = get_file_backend(src, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
481
+ return backend.copyfile(src, dst)
482
+
483
+
484
+ def copytree(
485
+ src: str | Path,
486
+ dst: str | Path,
487
+ backend_args: dict | None = None,
488
+ backend_key: str | None = None,
489
+ ) -> str | Path:
490
+ """Recursively copy an entire directory tree rooted at src to a directory
491
+ named dst and return the destination directory.
492
+
493
+ src and dst should have the same prefix and dst must not already exist.
494
+
495
+ Args:
496
+ src (str or Path): A directory to be copied.
497
+ dst (str or Path): Copy directory to dst.
498
+ backend_args (dict, optional): Arguments to instantiate the
499
+ corresponding backend. Defaults to None.
500
+ backend_key (str, optional): The key to get the backend from register.
501
+
502
+ Returns:
503
+ str: The destination directory.
504
+
505
+ Raises:
506
+ FileExistsError: If dst already exists, a FileExistsError will be
507
+ raised.
508
+
509
+ Examples:
510
+ >>> src = '/path/of/dir1'
511
+ >>> dst = '/path/of/dir2'
512
+ >>> copytree(src, dst)
513
+ '/path/of/dir2'
514
+ """
515
+ backend = get_file_backend(src, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
516
+ return backend.copytree(src, dst)
517
+
518
+
519
+ def copyfile_from_local(
520
+ src: str | Path,
521
+ dst: str | Path,
522
+ backend_args: dict | None = None,
523
+ backend_key: str | None = None,
524
+ ) -> str | Path:
525
+ """Copy a local file src to dst and return the destination file.
526
+
527
+ Note:
528
+ If the backend is the instance of LocalBackend, it does the same
529
+ thing with :func:`copyfile`.
530
+
531
+ Args:
532
+ src (str or Path): A local file to be copied.
533
+ dst (str or Path): Copy file to dst.
534
+ backend_args (dict, optional): Arguments to instantiate the
535
+ corresponding backend. Defaults to None.
536
+
537
+ Returns:
538
+ str: If dst specifies a directory, the file will be copied into dst
539
+ using the base filename from src.
540
+
541
+ Examples:
542
+ >>> # dst is a file
543
+ >>> src = '/path/of/file'
544
+ >>> dst = 'http://example.com/file1'
545
+ >>> # src will be copied to 'http://example.com/file1'
546
+ >>> copyfile_from_local(src, dst)
547
+ http://example.com/file1
548
+
549
+ >>> # dst is a directory
550
+ >>> dst = 'http://example.com/dir'
551
+ >>> # src will be copied to 'http://example.com/dir/file''
552
+ >>> copyfile_from_local(src, dst)
553
+ 'http://example.com/dir/file'
554
+ """
555
+ backend = get_file_backend(dst, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
556
+ return backend.copyfile_from_local(src, dst)
557
+
558
+
559
+ def copytree_from_local(
560
+ src: str | Path,
561
+ dst: str | Path,
562
+ backend_args: dict | None = None,
563
+ backend_key: str | None = None,
564
+ ) -> str | Path:
565
+ """Recursively copy an entire directory tree rooted at src to a directory
566
+ named dst and return the destination directory.
567
+
568
+ Note:
569
+ If the backend is the instance of LocalBackend, it does the same
570
+ thing with :func:`copytree`.
571
+
572
+ Args:
573
+ src (str or Path): A local directory to be copied.
574
+ dst (str or Path): Copy directory to dst.
575
+ backend_args (dict, optional): Arguments to instantiate the
576
+ corresponding backend. Defaults to None.
577
+
578
+ Returns:
579
+ str: The destination directory.
580
+
581
+ Examples:
582
+ >>> src = '/path/of/dir'
583
+ >>> dst = 'http://example.com/dir'
584
+ >>> copyfile_from_local(src, dst)
585
+ 'http://example.com/dir'
586
+ """
587
+ backend = get_file_backend(dst, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
588
+ return backend.copytree_from_local(src, dst)
589
+
590
+
591
+ def copyfile_to_local(
592
+ src: str | Path,
593
+ dst: str | Path,
594
+ dst_type: str, # Choose from ["file", "dir"]
595
+ backend_args: dict | None = None,
596
+ backend_key: str | None = None,
597
+ ) -> str | Path:
598
+ """Copy the file src to local dst and return the destination file.
599
+
600
+ If dst specifies a directory, the file will be copied into dst using
601
+ the base filename from src. If dst specifies a file that already
602
+ exists, it will be replaced.
603
+
604
+ Note:
605
+ If the backend is the instance of LocalBackend, it does the same
606
+ thing with :func:`copyfile`.
607
+
608
+ Args:
609
+ src (str or Path): A file to be copied.
610
+ dst (str or Path): Copy file to local dst.
+ dst_type (str): Whether ``dst`` is a "file" or a "dir".
611
+ backend_args (dict, optional): Arguments to instantiate the
612
+ corresponding backend. Defaults to None.
613
+
614
+ Returns:
615
+ str: If dst specifies a directory, the file will be copied into dst
616
+ using the base filename from src.
617
+
618
+ Examples:
619
+ >>> # dst is a file
620
+ >>> src = 'http://example.com/file'
621
+ >>> dst = '/path/of/file'
622
+ >>> # src will be copied to '/path/of/file'
623
+ >>> copyfile_to_local(src, dst)
624
+ '/path/of/file'
625
+
626
+ >>> # dst is a directory
627
+ >>> dst = '/path/of/dir'
628
+ >>> # src will be copied to '/path/of/dir/file'
629
+ >>> copyfile_to_local(src, dst)
630
+ '/path/of/dir/file'
631
+ """
632
+ assert dst_type in ["file", "dir"]
633
+ Path(dst).parent.mkdir(parents=True, exist_ok=True)
634
+ backend = get_file_backend(src, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
635
+ return backend.copyfile_to_local(src, dst, dst_type=dst_type)
636
+
637
+
638
+ def copytree_to_local(
639
+ src: str | Path,
640
+ dst: str | Path,
641
+ backend_args: dict | None = None,
642
+ backend_key: str | None = None,
643
+ ) -> str | Path:
644
+ """Recursively copy an entire directory tree rooted at src to a local
645
+ directory named dst and return the destination directory.
646
+
647
+ Note:
648
+ If the backend is the instance of LocalBackend, it does the same
649
+ thing with :func:`copytree`.
650
+
651
+ Args:
652
+ src (str or Path): A directory to be copied.
653
+ dst (str or Path): Copy directory to local dst.
654
+ backend_args (dict, optional): Arguments to instantiate the
655
+ corresponding backend. Defaults to None.
656
+
657
+ Returns:
658
+ str: The destination directory.
659
+
660
+ Examples:
661
+ >>> src = 'http://example.com/dir'
662
+ >>> dst = '/path/of/dir'
663
+ >>> copytree_to_local(src, dst)
664
+ '/path/of/dir'
665
+ """
666
+ Path(dst).parent.mkdir(parents=True, exist_ok=True)
667
+ backend = get_file_backend(src, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)  # src determines the (possibly remote) backend, as in copyfile_to_local
668
+ return backend.copytree_to_local(src, dst)
669
+
670
+
671
+ def remove(
672
+ filepath: str | Path,
673
+ backend_args: dict | None = None,
674
+ backend_key: str | None = None,
675
+ ) -> None:
676
+ """Remove a file.
677
+
678
+ Args:
679
+ filepath (str, Path): Path to be removed.
680
+ backend_args (dict, optional): Arguments to instantiate the
681
+ corresponding backend. Defaults to None.
682
+
683
+ Raises:
684
+ FileNotFoundError: If filepath does not exist, a FileNotFoundError
685
+ will be raised.
686
+ IsADirectoryError: If filepath is a directory, an IsADirectoryError
687
+ will be raised.
688
+
689
+ Examples:
690
+ >>> filepath = '/path/of/file'
691
+ >>> remove(filepath)
692
+ """
693
+ backend = get_file_backend(
694
+ filepath,
695
+ backend_args=backend_args,
696
+ enable_singleton=True,
697
+ backend_key=backend_key,
698
+ )
699
+ backend.remove(filepath)
700
+
701
+
702
+ def rmtree(
703
+ dir_path: str | Path,
704
+ backend_args: dict | None = None,
705
+ backend_key: str | None = None,
706
+ ) -> None:
707
+ """Recursively delete a directory tree.
708
+
709
+ Args:
710
+ dir_path (str or Path): A directory to be removed.
711
+ backend_args (dict, optional): Arguments to instantiate the
712
+ corresponding backend. Defaults to None.
713
+
714
+ Examples:
715
+ >>> dir_path = '/path/of/dir'
716
+ >>> rmtree(dir_path)
717
+ """
718
+ backend = get_file_backend(
719
+ dir_path,
720
+ backend_args=backend_args,
721
+ enable_singleton=True,
722
+ backend_key=backend_key,
723
+ )
724
+ backend.rmtree(dir_path)
725
+
726
+
727
+ def copy_if_symlink_fails(
728
+ src: str | Path,
729
+ dst: str | Path,
730
+ backend_args: dict | None = None,
731
+ backend_key: str | None = None,
732
+ ) -> bool:
733
+ """Create a symbolic link pointing to src named dst.
734
+
735
+ If creating a symbolic link pointing to src fails, src is copied directly to
736
+ dst instead.
737
+
738
+ Args:
739
+ src (str or Path): Create a symbolic link pointing to src.
740
+ dst (str or Path): Create a symbolic link named dst.
741
+ backend_args (dict, optional): Arguments to instantiate the
742
+ corresponding backend. Defaults to None.
743
+
744
+ Returns:
745
+ bool: Return True if a symbolic link pointing to src is created
746
+ successfully. Otherwise, return False.
747
+
748
+ Examples:
749
+ >>> src = '/path/of/file'
750
+ >>> dst = '/path1/of/file1'
751
+ >>> copy_if_symlink_fails(src, dst)
752
+ True
753
+ >>> src = '/path/of/dir'
754
+ >>> dst = '/path1/of/dir1'
755
+ >>> copy_if_symlink_fails(src, dst)
756
+ True
757
+ """
758
+ backend = get_file_backend(src, backend_args=backend_args, enable_singleton=True, backend_key=backend_key)
759
+ return backend.copy_if_symlink_fails(src, dst)
760
+
761
+
762
+ def list_dir(
763
+ dir_path: str | Path,
764
+ backend_args: dict | None = None,
765
+ backend_key: str | None = None,
766
+ ):
767
+ """List all folders in a directory with a given path.
768
+
769
+ Args:
770
+ dir_path (str | Path): Path of the directory.
771
+
772
+ Examples:
773
+ >>> dir_path = '/path/of/dir'
774
+ >>> for file_path in list_dir(dir_path):
775
+ ... print(file_path)
776
+ """
777
+ if not dir_path.endswith("/"):
778
+ dir_path += "/"
779
+ backend = get_file_backend(
780
+ dir_path,
781
+ backend_args=backend_args,
782
+ enable_singleton=True,
783
+ backend_key=backend_key,
784
+ )
785
+
786
+ return backend.list_dir(dir_path)
787
+
788
+
789
+ def list_dir_or_file(
790
+ dir_path: str | Path,
791
+ list_dir: bool = True,
792
+ list_file: bool = True,
793
+ suffix: str | tuple[str] | None = None,
794
+ recursive: bool = False,
795
+ backend_args: dict | None = None,
796
+ backend_key: str | None = None,
797
+ ) -> Iterator[str]:
798
+ """Scan a directory to find the interested directories or files in
799
+ arbitrary order.
800
+
801
+ Note:
802
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
803
+
804
+ Args:
805
+ dir_path (str or Path): Path of the directory.
806
+ list_dir (bool): List the directories. Defaults to True.
807
+ list_file (bool): List the path of files. Defaults to True.
808
+ suffix (str or tuple[str], optional): File suffix that we are
809
+ interested in. Defaults to None.
810
+ recursive (bool): If set to True, recursively scan the directory.
811
+ Defaults to False.
812
+ backend_args (dict, optional): Arguments to instantiate the
813
+ corresponding backend. Defaults to None.
814
+
815
+ Yields:
816
+ Iterable[str]: A relative path to ``dir_path``.
817
+
818
+ Examples:
819
+ >>> dir_path = '/path/of/dir'
822
+ >>> # list those files and directories in current directory
823
+ >>> for file_path in list_dir_or_file(dir_path):
824
+ ... print(file_path)
825
+ >>> # only list files
826
+ >>> for file_path in list_dir_or_file(dir_path, list_dir=False):
827
+ ... print(file_path)
828
+ >>> # only list directories
829
+ >>> for file_path in list_dir_or_file(dir_path, list_file=False):
830
+ ... print(file_path)
831
+ >>> # only list files ending with specified suffixes
832
+ >>> for file_path in list_dir_or_file(dir_path, suffix='.txt'):
833
+ ... print(file_path)
834
+ >>> # list all files and directory recursively
835
+ >>> for file_path in list_dir_or_file(dir_path, recursive=True):
836
+ ... print(file_path)
837
+ """
838
+ backend = get_file_backend(
839
+ dir_path,
840
+ backend_args=backend_args,
841
+ enable_singleton=True,
842
+ backend_key=backend_key,
843
+ )
844
+ yield from backend.list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive)
845
+
846
+
847
+ def load(
848
+ file: str | Path | IO[Any],
849
+ file_format: str | None = None,
850
+ file_client_args: dict | None = None,
851
+ fast_backend: bool = False,
852
+ backend_args: dict | None = None,
853
+ backend_key: str | None = None,
854
+ **kwargs,
855
+ ):
856
+ """Load data from json/yaml/pickle files.
857
+
858
+ This method provides a unified api for loading data from serialized files.
859
+
860
+ ``load`` supports loading data from serialized files those can be storaged
861
+ in different backends.
862
+
863
+ Args:
864
+ file (str or :obj:`Path` or file-like object): Filename or a file-like
865
+ object.
866
+ file_format (str, optional): If not specified, the file format will be
867
+ inferred from the file extension, otherwise use the specified one.
868
+ Currently supported formats include "json", "yaml/yml" and
869
+ "pickle/pkl".
870
+ file_client_args (dict, optional): Arguments to instantiate a
871
+ FileClient. See :class:`mmengine.fileio.FileClient` for details.
872
+ Defaults to None. It will be deprecated in future. Please use
873
+ ``backend_args`` instead.
874
+ fast_backend (bool): Whether to use the backend's faster multiprocess path if available. Defaults to False.
875
+ backend_args (dict, optional): Arguments to instantiate the
876
+ prefix of uri corresponding backend. Defaults to None.
877
+ New in v0.2.0.
878
+
879
+ Examples:
880
+ >>> load('/path/of/your/file') # file is stored on disk
881
+ >>> load('https://path/of/your/file') # file is stored on the Internet
882
+
883
+ Returns:
884
+ The content from the file.
885
+ """
886
+ if isinstance(file, Path):
887
+ file = str(file)
888
+ if file_format is None and isinstance(file, str):
889
+ file_format = file.split(".")[-1]
+ elif file_format is None:
+ raise ValueError("file_format must be specified since file is not a filepath str")
890
+ # convert file_format to lower case
891
+ file_format = file_format.lower()
892
+ if file_format not in file_handlers:
893
+ raise TypeError(f"Unsupported format: {file_format}")
894
+
895
+ if file_client_args is not None:
896
+ warnings.warn( # noqa: B028
897
+ '"file_client_args" will be deprecated in future. Please use "backend_args" instead',
898
+ DeprecationWarning,
899
+ )
900
+ if backend_args is not None:
901
+ raise ValueError('"file_client_args and "backend_args" cannot be set at the same time.')
902
+
903
+ handler = file_handlers[file_format]
904
+ if isinstance(file, str):
905
+ if file_client_args is not None:
906
+ file_client = FileClient.infer_client(file_client_args, file)
907
+ file_backend = file_client
908
+ else:
909
+ file_backend = get_file_backend(
910
+ file,
911
+ backend_args=backend_args,
912
+ backend_key=backend_key,
913
+ enable_singleton=True,
914
+ )
915
+
916
+ if handler.str_like:
917
+ with StringIO(file_backend.get_text(file)) as f:
918
+ obj = handler.load_from_fileobj(f, **kwargs)
919
+ else:
920
+ if fast_backend:
921
+ if hasattr(file_backend, "fast_get"):
922
+ with BytesIO(file_backend.fast_get(file)) as f:
923
+ obj = handler.load_from_fileobj(f, **kwargs)
924
+ else:
925
+ warnings.warn( # noqa: B028
926
+ f"fast_backend is not supported by the backend, type {type(file_backend)} fallback to normal get"
927
+ )
928
+ with BytesIO(file_backend.get(file)) as f:
929
+ obj = handler.load_from_fileobj(f, **kwargs)
930
+ else:
931
+ with BytesIO(file_backend.get(file)) as f:
932
+ obj = handler.load_from_fileobj(f, **kwargs)
933
+ elif hasattr(file, "read"):
934
+ obj = handler.load_from_fileobj(file, **kwargs)
935
+ else:
936
+ raise TypeError('"file" must be a filepath str or a file-object')
937
+ return obj
938
+
939
+
940
+ def dump(
941
+ obj: Any,
942
+ file: str | Path | IO[Any] | None = None,
943
+ file_format: str | None = None,
944
+ file_client_args: dict | None = None,
945
+ fast_backend: bool = False,
946
+ backend_args: dict | None = None,
947
+ backend_key: str | None = None,
948
+ **kwargs,
949
+ ):
950
+ """Dump data to json/yaml/pickle strings or files.
951
+
952
+ This method provides a unified API for dumping data as strings or to files,
953
+ and also supports custom arguments for each file format.
954
+
955
+ ``dump`` supports dumping data as strings or to files which can be saved to
956
+ different backends.
957
+
958
+ Args:
959
+ obj (any): The python object to be dumped.
960
+ file (str or :obj:`Path` or file-like object, optional): If not
961
+ specified, then the object is dumped to a str, otherwise to a file
962
+ specified by the filename or file-like object.
963
+ file_format (str, optional): Same as :func:`load`.
964
+ file_client_args (dict, optional): Arguments to instantiate a
965
+ FileClient. See :class:`mmengine.fileio.FileClient` for details.
966
+ Defaults to None. It will be deprecated in future. Please use
967
+ ``backend_args`` instead.
968
+ fast_backend (bool): Whether to use the backend's faster multiprocess path if available. Defaults to False.
969
+ backend_args (dict, optional): Arguments to instantiate the
970
+ prefix of uri corresponding backend. Defaults to None.
971
+ New in v0.2.0.
972
+ backend_key (str, optional): The key to register the backend. Defaults to None.
973
+
974
+ Examples:
975
+ >>> dump('hello world', '/path/of/your/file') # disk
976
+ >>> dump('hello world', 'http://path/of/your/file') # http
977
+
978
+ Returns:
979
+ str or None: The dumped string when ``file`` is None; otherwise None.
980
+ """
981
+ if isinstance(file, Path):
982
+ file = str(file)
983
+ if file_format is None:
984
+ if isinstance(file, str):
985
+ file_format = file.split(".")[-1]
986
+ elif file is None:
987
+ raise ValueError("file_format must be specified since file is None")
988
+ # convert file_format to lower case
989
+ file_format = file_format.lower()
990
+ if file_format not in file_handlers:
991
+ raise TypeError(f"Unsupported format: {file_format}")
992
+
993
+ if file_client_args is not None:
994
+ warnings.warn( # noqa: B028
995
+ '"file_client_args" will be deprecated in future. Please use "backend_args" instead',
996
+ DeprecationWarning,
997
+ )
998
+ if backend_args is not None:
999
+ raise ValueError('"file_client_args" and "backend_args" cannot be set at the same time.')
1000
+
1001
+ handler = file_handlers[file_format]
1002
+ if file is None:
1003
+ return handler.dump_to_str(obj, **kwargs)
1004
+ elif isinstance(file, str):
1005
+ if file_client_args is not None:
1006
+ file_client = FileClient.infer_client(file_client_args, file)
1007
+ file_backend = file_client
1008
+ else:
1009
+ file_backend = get_file_backend(
1010
+ file,
1011
+ backend_args=backend_args,
1012
+ backend_key=backend_key,
1013
+ enable_singleton=True,
1014
+ )
1015
+
1016
+ if handler.str_like:
1017
+ with StringIO() as f:
1018
+ handler.dump_to_fileobj(obj, f, **kwargs)
1019
+ file_backend.put_text(f.getvalue(), file)
1020
+ else:
1021
+ with BytesIO() as f:
1022
+ handler.dump_to_fileobj(obj, f, **kwargs)
1023
+ if fast_backend:
1024
+ if hasattr(file_backend, "fast_put"):
1025
+ file_backend.fast_put(f, file)
1026
+ else:
1027
+ warnings.warn("fast_backend is not supported by the backend, fallback to normal put") # noqa: B028
1028
+ file_backend.put(f.getvalue(), file)  # put() expects bytes, not the BytesIO object
1029
+ else:
1030
+ file_backend.put(f.getvalue(), file)
1031
+ elif hasattr(file, "write"):
1032
+ handler.dump_to_fileobj(obj, file, **kwargs)
1033
+ else:
1034
+ raise TypeError('"file" must be a filename str or a file-object')
imaginaire/utils/easy_io/file_client.py ADDED
@@ -0,0 +1,448 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from collections.abc import Generator, Iterator
18
+ from contextlib import contextmanager
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from imaginaire.utils.easy_io.backends import BaseStorageBackend, HTTPBackend, LocalBackend
23
+
24
+
25
+ def is_filepath(filepath):
26
+ return isinstance(filepath, (str, Path))
27
+
28
+
29
+ class HardDiskBackend(LocalBackend):
30
+ """Raw hard disks storage backend."""
31
+
32
+ @property
33
+ def name(self):
34
+ return self.__class__.__name__
35
+
36
+
37
+ class FileClient:
38
+ """A general file client to access files in different backends.
39
+
40
+ The client loads a file or text in a specified backend from its path
41
+ and returns it as a binary or text file. There are two ways to choose a
42
+ backend: the name of the backend and the prefix of the path. Although both
43
+ can be used to choose a storage backend, ``backend`` has a higher priority;
44
+ that is, if both are set, the storage backend will be chosen by the
45
+ ``backend`` argument. If both are ``None``, the disk backend will be chosen.
46
+ Note that other backend accessors can also be registered with a given name,
47
+ prefixes, and backend class. In addition, the singleton pattern is used to
48
+ avoid repeated object creation: if the arguments are the same, the same
49
+ object will be returned.
50
+
51
+ Warning:
52
+ `FileClient` will be deprecated in future. Please use io functions
53
+ in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
54
+
55
+ Args:
56
+ backend (str, optional): The storage backend type. Options are "disk"
57
+ and "http". Defaults to None.
58
+ prefix (str, optional): The prefix of the registered storage backend.
59
+ Options are "http", "https". Defaults to None.
60
+
61
+ Examples:
62
+ >>> # only set backend
63
+ >>> file_client = FileClient(backend='disk')
64
+ >>> # only set prefix
65
+ >>> file_client = FileClient(prefix='http')
66
+ >>> # set both backend and prefix but use backend to choose client
67
+ >>> file_client = FileClient(backend='http', prefix='http')
68
+ >>> # if the arguments are the same, the same object is returned
69
+ >>> file_client1 = FileClient(backend='disk')
70
+ >>> file_client1 is file_client
71
+ True
72
+
73
+ Attributes:
74
+ client (:obj:`BaseStorageBackend`): The backend object.
75
+ """
76
+
77
+ _backends = { # noqa: RUF012
78
+ "disk": HardDiskBackend,
79
+ "http": HTTPBackend,
80
+ }
81
+
82
+ _prefix_to_backends: dict = { # noqa: RUF012
83
+ "http": HTTPBackend,
84
+ "https": HTTPBackend,
85
+ }
86
+
87
+ _instances: dict = {} # noqa: RUF012
88
+
89
+ client: Any
90
+
91
+ def __new__(cls, backend=None, prefix=None, **kwargs):
92
+ if backend is None and prefix is None:
93
+ backend = "disk"
94
+ if backend is not None and backend not in cls._backends:
95
+ raise ValueError(
96
+ f"Backend {backend} is not supported. Currently supported ones are {list(cls._backends.keys())}"
97
+ )
98
+ if prefix is not None and prefix not in cls._prefix_to_backends:
99
+ raise ValueError(
100
+ f"prefix {prefix} is not supported. Currently supported ones are {list(cls._prefix_to_backends.keys())}"
101
+ )
102
+
103
+ # concatenate the arguments to a unique key for determining whether
104
+ # objects with the same arguments were created
105
+ arg_key = f"{backend}:{prefix}"
106
+ for key, value in kwargs.items():
107
+ arg_key += f":{key}:{value}"
108
+
109
+ # if a backend was overridden, it will create a new object
110
+ if arg_key in cls._instances:
111
+             _instance = cls._instances[arg_key]
+         else:
+             # create a new object and put it into cls._instances
+             _instance = super().__new__(cls)
+             if backend is not None:
+                 _instance.client = cls._backends[backend](**kwargs)
+             else:
+                 _instance.client = cls._prefix_to_backends[prefix](**kwargs)
+
+             cls._instances[arg_key] = _instance
+
+         return _instance
+
+     @property
+     def name(self):
+         return self.client.name
+
+     @property
+     def allow_symlink(self):
+         return self.client.allow_symlink
+
+     @staticmethod
+     def parse_uri_prefix(uri: str | Path) -> str | None:
+         """Parse the prefix of a uri.
+
+         Args:
+             uri (str | Path): Uri to be parsed that contains the file prefix.
+
+         Examples:
+             >>> FileClient.parse_uri_prefix('http://path/of/your/file')
+             'http'
+
+         Returns:
+             str | None: Return the prefix of uri if the uri contains '://' else
+             ``None``.
+         """
+         assert is_filepath(uri)
+         uri = str(uri)
+         if "://" not in uri:
+             return None
+         else:
+             prefix, _ = uri.split("://")
+             return prefix
+
+     @classmethod
+     def infer_client(
+         cls,
+         file_client_args: dict | None = None,
+         uri: str | Path | None = None,
+     ) -> "FileClient":
+         """Infer a suitable file client based on the URI and arguments.
+
+         Args:
+             file_client_args (dict, optional): Arguments to instantiate a
+                 FileClient. Defaults to None.
+             uri (str | Path, optional): Uri to be parsed that contains the file
+                 prefix. Defaults to None.
+
+         Examples:
+             >>> uri = 'http://path/of/your/file'
+             >>> file_client = FileClient.infer_client(uri=uri)
+             >>> file_client_args = {'backend': 'disk'}
+             >>> file_client = FileClient.infer_client(file_client_args)
+
+         Returns:
+             FileClient: Instantiated FileClient object.
+         """
+         assert file_client_args is not None or uri is not None
+         if file_client_args is None:
+             file_prefix = cls.parse_uri_prefix(uri)  # type: ignore
+             return cls(prefix=file_prefix)
+         else:
+             return cls(**file_client_args)
+
+     @classmethod
+     def _register_backend(cls, name, backend, force=False, prefixes=None):
+         if not isinstance(name, str):
+             raise TypeError(f"the backend name should be a string, but got {type(name)}")
+         if not inspect.isclass(backend):
+             raise TypeError(f"backend should be a class but got {type(backend)}")
+         if not issubclass(backend, BaseStorageBackend):
+             raise TypeError(f"backend {backend} is not a subclass of BaseStorageBackend")
+         if not force and name in cls._backends:
+             raise KeyError(
+                 f'{name} is already registered as a storage backend, add "force=True" if you want to override it'
+             )
+
+         if name in cls._backends and force:
+             for arg_key, instance in list(cls._instances.items()):
+                 if isinstance(instance.client, cls._backends[name]):
+                     cls._instances.pop(arg_key)
+         cls._backends[name] = backend
+
+         if prefixes is not None:
+             if isinstance(prefixes, str):
+                 prefixes = [prefixes]
+             else:
+                 assert isinstance(prefixes, (list, tuple))
+             for prefix in prefixes:
+                 if prefix not in cls._prefix_to_backends:
+                     cls._prefix_to_backends[prefix] = backend
+                 elif force:
+                     # Evict cached instances built on the overridden backend,
+                     # then point the prefix at the new backend.
+                     overridden_backend = cls._prefix_to_backends[prefix]
+                     for arg_key, instance in list(cls._instances.items()):
+                         if isinstance(instance.client, overridden_backend):
+                             cls._instances.pop(arg_key)
+                     cls._prefix_to_backends[prefix] = backend
+                 else:
+                     raise KeyError(
+                         f"{prefix} is already registered as a storage backend,"
+                         ' add "force=True" if you want to override it'
+                     )
+
+     @classmethod
+     def register_backend(cls, name, backend=None, force=False, prefixes=None):
+         """Register a backend to FileClient.
+
+         This method can be used as a normal class method or a decorator.
+
+         .. code-block:: python
+
+             class NewBackend(BaseStorageBackend):
+
+                 def get(self, filepath):
+                     return filepath
+
+                 def get_text(self, filepath):
+                     return filepath
+
+             FileClient.register_backend('new', NewBackend)
+
+         or
+
+         .. code-block:: python
+
+             @FileClient.register_backend('new')
+             class NewBackend(BaseStorageBackend):
+
+                 def get(self, filepath):
+                     return filepath
+
+                 def get_text(self, filepath):
+                     return filepath
+
+         Args:
+             name (str): The name of the registered backend.
+             backend (class, optional): The backend class to be registered,
+                 which must be a subclass of :class:`BaseStorageBackend`.
+                 When this method is used as a decorator, backend is None.
+                 Defaults to None.
+             force (bool, optional): Whether to override the backend if the name
+                 has already been registered. Defaults to False.
+             prefixes (str or list[str] or tuple[str], optional): The prefixes
+                 of the registered storage backend. Defaults to None.
+         """
+         if backend is not None:
+             cls._register_backend(name, backend, force=force, prefixes=prefixes)
+             return
+
+         def _register(backend_cls):
+             cls._register_backend(name, backend_cls, force=force, prefixes=prefixes)
+             return backend_cls
+
+         return _register
+
+     def get(self, filepath: str | Path) -> bytes | memoryview:
+         """Read data from a given ``filepath`` with 'rb' mode.
+
+         Note:
+             There are two types of return values for ``get``, one is ``bytes``
+             and the other is ``memoryview``. The advantage of using memoryview
+             is that you can avoid copying, and if you want to convert it to
+             ``bytes``, you can use ``.tobytes()``.
+
+         Args:
+             filepath (str or Path): Path to read data.
+
+         Returns:
+             bytes | memoryview: Expected bytes object or a memory view of the
+             bytes object.
+         """
+         return self.client.get(filepath)
+
+     def get_text(self, filepath: str | Path, encoding="utf-8") -> str:
+         """Read data from a given ``filepath`` with 'r' mode.
+
+         Args:
+             filepath (str or Path): Path to read data.
+             encoding (str): The encoding format used to open the ``filepath``.
+                 Defaults to 'utf-8'.
+
+         Returns:
+             str: Expected text reading from ``filepath``.
+         """
+         return self.client.get_text(filepath, encoding)
+
+     def put(self, obj: bytes, filepath: str | Path) -> None:
+         """Write data to a given ``filepath`` with 'wb' mode.
+
+         Note:
+             ``put`` should create a directory if the directory of ``filepath``
+             does not exist.
+
+         Args:
+             obj (bytes): Data to be written.
+             filepath (str or Path): Path to write data.
+         """
+         self.client.put(obj, filepath)
+
+     def put_text(self, obj: str, filepath: str | Path) -> None:
+         """Write data to a given ``filepath`` with 'w' mode.
+
+         Note:
+             ``put_text`` should create a directory if the directory of
+             ``filepath`` does not exist.
+
+         Args:
+             obj (str): Data to be written.
+             filepath (str or Path): Path to write data.
+         """
+         self.client.put_text(obj, filepath)
+
+     def remove(self, filepath: str | Path) -> None:
+         """Remove a file.
+
+         Args:
+             filepath (str, Path): Path to be removed.
+         """
+         self.client.remove(filepath)
+
+     def exists(self, filepath: str | Path) -> bool:
+         """Check whether a file path exists.
+
+         Args:
+             filepath (str or Path): Path to be checked whether exists.
+
+         Returns:
+             bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+         """
+         return self.client.exists(filepath)
+
+     def isdir(self, filepath: str | Path) -> bool:
+         """Check whether a file path is a directory.
+
+         Args:
+             filepath (str or Path): Path to be checked whether it is a
+                 directory.
+
+         Returns:
+             bool: Return ``True`` if ``filepath`` points to a directory,
+             ``False`` otherwise.
+         """
+         return self.client.isdir(filepath)
+
+     def isfile(self, filepath: str | Path) -> bool:
+         """Check whether a file path is a file.
+
+         Args:
+             filepath (str or Path): Path to be checked whether it is a file.
+
+         Returns:
+             bool: Return ``True`` if ``filepath`` points to a file, ``False``
+             otherwise.
+         """
+         return self.client.isfile(filepath)
+
+     def join_path(self, filepath: str | Path, *filepaths: str | Path) -> str:
+         r"""Concatenate all file paths.
+
+         Join one or more filepath components intelligently. The return value
+         is the concatenation of filepath and any members of \*filepaths.
+
+         Args:
+             filepath (str or Path): Path to be concatenated.
+
+         Returns:
+             str: The result of concatenation.
+         """
+         return self.client.join_path(filepath, *filepaths)
+
+     @contextmanager
+     def get_local_path(self, filepath: str | Path) -> Generator[str | Path, None, None]:
+         """Download data from ``filepath`` and write the data to a local path.
+
+         ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+         It can be called with a ``with`` statement, and when exiting the
+         ``with`` statement, the temporary path will be released.
+
+         Note:
+             If the ``filepath`` is a local path, just return itself.
+
+         .. warning::
+             ``get_local_path`` is an experimental interface that may change in
+             the future.
+
+         Args:
+             filepath (str or Path): Path to be read data.
+
+         Examples:
+             >>> file_client = FileClient(prefix='http')
+             >>> with file_client.get_local_path('http://example.com/abc.jpg') as path:
+             ...     # do something here
+
+         Yields:
+             Iterable[str]: Only yield one path.
+         """
+         with self.client.get_local_path(str(filepath)) as local_path:
+             yield local_path
+
+     def list_dir_or_file(  # pylint: disable=too-many-arguments
+         self,
+         dir_path: str | Path,
+         list_dir: bool = True,
+         list_file: bool = True,
+         suffix: str | tuple[str] | None = None,
+         recursive: bool = False,
+     ) -> Iterator[str]:
+         """Scan a directory to find the interested directories or files in
+         arbitrary order.
+
+         Note:
+             :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+         Args:
+             dir_path (str | Path): Path of the directory.
+             list_dir (bool): List the directories. Defaults to True.
+             list_file (bool): List the path of files. Defaults to True.
+             suffix (str or tuple[str], optional): File suffix
+                 that we are interested in. Defaults to None.
+             recursive (bool): If set to True, recursively scan the
+                 directory. Defaults to False.
+
+         Yields:
+             Iterable[str]: A relative path to ``dir_path``.
+         """
+         yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, suffix, recursive)
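A minimal sketch (not part of this commit) of how the registry and prefix inference above combine; `MemoryBackend` is hypothetical and exists only for illustration:

# Hypothetical in-memory backend, registered under the 'mem://' prefix.
class MemoryBackend(BaseStorageBackend):
    _store = {"mem://bucket/key": b"hello"}

    def get(self, filepath):
        return self._store[str(filepath)]

    def get_text(self, filepath, encoding="utf-8"):
        return self.get(filepath).decode(encoding)

FileClient.register_backend("memory", MemoryBackend, prefixes="mem")
client = FileClient.infer_client(uri="mem://bucket/key")  # resolved via the 'mem' prefix
assert client.get("mem://bucket/key") == b"hello"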
imaginaire/utils/easy_io/handlers/__init__.py ADDED
@@ -0,0 +1,29 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+ from imaginaire.utils.easy_io.handlers.json_handler import JsonHandler
+ from imaginaire.utils.easy_io.handlers.pickle_handler import PickleHandler
+ from imaginaire.utils.easy_io.handlers.registry_utils import file_handlers, register_handler
+ from imaginaire.utils.easy_io.handlers.yaml_handler import YamlHandler
+
+ __all__ = [
+     "BaseFileHandler",
+     "JsonHandler",
+     "PickleHandler",
+     "YamlHandler",
+     "file_handlers",
+     "register_handler",
+ ]
imaginaire/utils/easy_io/handlers/base.py ADDED
@@ -0,0 +1,44 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from abc import ABCMeta, abstractmethod
+
+
+ class BaseFileHandler(metaclass=ABCMeta):
+     # `str_like` is a flag indicating whether the handler's file object is
+     # str-like or bytes-like. Pickle only processes bytes-like objects,
+     # while json only processes str-like objects. If it is a str-like
+     # object, `StringIO` will be used to process the buffer.
+     str_like = True
+
+     @abstractmethod
+     def load_from_fileobj(self, file, **kwargs):
+         pass
+
+     @abstractmethod
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         pass
+
+     @abstractmethod
+     def dump_to_str(self, obj, **kwargs):
+         pass
+
+     def load_from_path(self, filepath, mode="r", **kwargs):
+         with open(filepath, mode) as f:
+             return self.load_from_fileobj(f, **kwargs)
+
+     def dump_to_path(self, obj, filepath, mode="w", **kwargs):
+         with open(filepath, mode) as f:
+             self.dump_to_fileobj(obj, f, **kwargs)
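For orientation, a minimal (hypothetical) subclass showing which methods are abstract and which come from the base:

class ReprHandler(BaseFileHandler):
    # str_like stays True, so the easy_io layer buffers through StringIO.
    def load_from_fileobj(self, file, **kwargs):
        return file.read()

    def dump_to_fileobj(self, obj, file, **kwargs):
        file.write(repr(obj))

    def dump_to_str(self, obj, **kwargs):
        return repr(obj)

# load_from_path / dump_to_path are inherited and open the file for you.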
imaginaire/utils/easy_io/handlers/byte_handler.py ADDED
@@ -0,0 +1,39 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import IO
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class ByteHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file: IO[bytes], **kwargs):
+         file.seek(0)
+         # extract all bytes and return
+         return file.read()
+
+     def dump_to_fileobj(
+         self,
+         obj: bytes,
+         file: IO[bytes],
+         **kwargs,
+     ):
+         # write all bytes to file
+         file.write(obj)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
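A quick round-trip sketch through an in-memory buffer:

from io import BytesIO

handler = ByteHandler()
buf = BytesIO()
handler.dump_to_fileobj(b"\x00\x01\x02", buf)
assert handler.load_from_fileobj(buf) == b"\x00\x01\x02"  # load seeks back to 0 itself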
imaginaire/utils/easy_io/handlers/csv_handler.py ADDED
@@ -0,0 +1,42 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import csv
+ from io import StringIO
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class CsvHandler(BaseFileHandler):
+     def load_from_fileobj(self, file, **kwargs):
+         del kwargs
+         reader = csv.reader(file)
+         return list(reader)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         del kwargs
+         writer = csv.writer(file)
+         if not all(isinstance(row, list) for row in obj):
+             raise ValueError("Each row must be a list")
+         writer.writerows(obj)
+
+     def dump_to_str(self, obj, **kwargs):
+         del kwargs
+         output = StringIO()
+         writer = csv.writer(output)
+         if not all(isinstance(row, list) for row in obj):
+             raise ValueError("Each row must be a list")
+         writer.writerows(obj)
+         return output.getvalue()
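A round-trip sketch; each row is a list of strings on both sides:

from io import StringIO

handler = CsvHandler()
text = handler.dump_to_str([["name", "score"], ["a", "1"]])
assert handler.load_from_fileobj(StringIO(text)) == [["name", "score"], ["a", "1"]]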
imaginaire/utils/easy_io/handlers/gzip_handler.py ADDED
@@ -0,0 +1,33 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import gzip
+ import pickle
+ from io import BytesIO
+ from typing import Any
+
+ from imaginaire.utils.easy_io.handlers.pickle_handler import PickleHandler
+
+
+ class GzipHandler(PickleHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file: BytesIO, **kwargs):
+         with gzip.GzipFile(fileobj=file, mode="rb") as f:
+             return pickle.load(f)
+
+     def dump_to_fileobj(self, obj: Any, file: BytesIO, **kwargs):
+         with gzip.GzipFile(fileobj=file, mode="wb") as f:
+             pickle.dump(obj, f)
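Since this subclasses PickleHandler, a round trip is just gzip-compressed pickle (sketch):

from io import BytesIO

handler = GzipHandler()
buf = BytesIO()
handler.dump_to_fileobj({"step": 1}, buf)
buf.seek(0)
assert handler.load_from_fileobj(buf) == {"step": 1}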
imaginaire/utils/easy_io/handlers/imageio_video_handler.py ADDED
@@ -0,0 +1,168 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import IO, Any
+
+ import imageio
+ import imageio.v3 as iio_v3
+ import numpy as np
+ import torch
+
+ from imaginaire.utils import log
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class ImageioVideoHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(
+         self, file: IO[bytes], format: str = "mp4", mode: str = "rgb", **kwargs
+     ) -> tuple[np.ndarray, dict[str, Any]]:
+         """
+         Load video from a file-like object using imageio.v3 with the specified format and color mode.
+
+         Parameters:
+             file (IO[bytes]): A file-like object containing video data.
+             format (str): Format of the video file (default 'mp4').
+             mode (str): Color mode of the video, 'rgb' or 'gray' (default 'rgb').
+
+         Returns:
+             tuple: A tuple containing an array of video frames and metadata about the video.
+         """
+         file.seek(0)
+
+         # The plugin argument in v3 replaces the format argument in v2
+         plugin = kwargs.pop("plugin", "pyav")
+
+         # Load all frames at once using the v3 API
+         video_frames = iio_v3.imread(file, plugin=plugin, **kwargs)
+
+         # Handle grayscale conversion if needed
+         if mode == "gray":
+             import cv2
+
+             if len(video_frames.shape) == 4:  # (frames, height, width, channels)
+                 gray_frames = []
+                 for frame in video_frames:
+                     gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+                     gray_frame = np.expand_dims(gray_frame, axis=2)  # Keep dimensions consistent
+                     gray_frames.append(gray_frame)
+                 video_frames = np.array(gray_frames)
+
+         # Extract metadata.
+         # Note: iio_v3.imread doesn't return metadata directly like v2 did,
+         # so we need to extract it separately.
+         file.seek(0)
+         metadata = self._extract_metadata(file, plugin=plugin)
+
+         return video_frames, metadata
+
+     def _extract_metadata(self, file: IO[bytes], plugin: str = "pyav") -> dict[str, Any]:
+         """
+         Extract metadata from a video file.
+
+         Parameters:
+             file (IO[bytes]): File-like object containing video data.
+             plugin (str): Plugin to use for reading.
+
+         Returns:
+             dict: Video metadata.
+         """
+         try:
+             # Read container-level metadata
+             metadata = iio_v3.immeta(file, plugin=plugin)
+
+             # Add some standard fields similar to the v2 metadata format
+             if "fps" not in metadata and "duration" in metadata:
+                 # Read the first frame to get shape information
+                 file.seek(0)
+                 first_frame = iio_v3.imread(file, plugin=plugin, index=0)
+                 metadata["size"] = first_frame.shape[1::-1]  # (width, height)
+                 metadata["source_size"] = metadata["size"]
+
+             # Create a metadata structure consistent with v2
+             metadata["plugin"] = plugin
+             if "codec" not in metadata:
+                 metadata["codec"] = "unknown"
+             if "pix_fmt" not in metadata:
+                 metadata["pix_fmt"] = "unknown"
+
+             # Calculate nframes if possible
+             if "fps" in metadata and "duration" in metadata:
+                 metadata["nframes"] = int(metadata["fps"] * metadata["duration"])
+             else:
+                 metadata["nframes"] = float("inf")
+
+             return metadata
+
+         except Exception:
+             # Fallback to basic metadata
+             return {
+                 "plugin": plugin,
+                 "nframes": float("inf"),
+                 "codec": "unknown",
+                 "fps": 30.0,  # Default values
+                 "duration": 0,
+                 "size": (0, 0),
+             }
+
+     def dump_to_fileobj(
+         self,
+         obj: np.ndarray | torch.Tensor,
+         file: IO[bytes],
+         format: str = "mp4",  # pylint: disable=redefined-builtin
+         fps: int = 17,
+         quality: int = 7,
+         ffmpeg_params=None,
+         **kwargs,
+     ):
+         """
+         Save an array of video frames to a file-like object using imageio.
+
+         Parameters:
+             obj (Union[np.ndarray, torch.Tensor]): An array of frames to be saved as video.
+             file (IO[bytes]): A file-like object to which the video data will be written.
+             format (str): Format of the video file (default 'mp4').
+             fps (int): Frames per second of the output video (default 17).
+             quality (int): Quality of the video (0-10, default 7).
+             ffmpeg_params (list): Additional parameters to pass to ffmpeg.
+         """
+         if isinstance(obj, torch.Tensor):
+             assert obj.dtype == torch.uint8, "Tensor must be of type uint8"
+             obj = obj.cpu().numpy()
+         h, w = obj.shape[1:-1]
+
+         # Default ffmpeg params that ensure width and height are set
+         default_ffmpeg_params = ["-s", f"{w}x{h}"]
+
+         # Use provided ffmpeg_params if any, otherwise use defaults
+         final_ffmpeg_params = ffmpeg_params if ffmpeg_params is not None else default_ffmpeg_params
+
+         mimsave_kwargs = {
+             "fps": fps,
+             "quality": quality,
+             "macro_block_size": 1,
+             "ffmpeg_params": final_ffmpeg_params,
+             "output_params": ["-f", "mp4"],
+         }
+         # Update with any other kwargs
+         mimsave_kwargs.update(kwargs)
+         log.debug(f"mimsave_kwargs: {mimsave_kwargs}")
+
+         imageio.mimsave(file, obj, format, **mimsave_kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
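A dump/load sketch; this assumes the pyav and ffmpeg imageio plugins are installed:

import numpy as np
from io import BytesIO

handler = ImageioVideoHandler()
frames = np.zeros((17, 64, 64, 3), dtype=np.uint8)  # (T, H, W, C) uint8
buf = BytesIO()
handler.dump_to_fileobj(frames, buf, fps=17)
frames_back, meta = handler.load_from_fileobj(BytesIO(buf.getvalue()))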
imaginaire/utils/easy_io/handlers/json_handler.py ADDED
@@ -0,0 +1,49 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+
+ import numpy as np
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ def set_default(obj):
+     """Set default json values for non-serializable values.
+
+     It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
+     It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
+     etc.) into plain Python built-in numbers.
+     """
+     if isinstance(obj, (set, range)):
+         return list(obj)
+     elif isinstance(obj, np.ndarray):
+         return obj.tolist()
+     elif isinstance(obj, np.generic):
+         return obj.item()
+     raise TypeError(f"{type(obj)} is unsupported for json dump")
+
+
+ class JsonHandler(BaseFileHandler):
+     def load_from_fileobj(self, file):
+         return json.load(file)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         kwargs.setdefault("default", set_default)
+         json.dump(obj, file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         kwargs.setdefault("default", set_default)
+         return json.dumps(obj, **kwargs)
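With `set_default` wired in, numpy scalars/arrays and sets serialize without extra work (sketch; set ordering is not guaranteed):

import numpy as np

handler = JsonHandler()
payload = {"ids": np.arange(3), "loss": np.float32(0.25), "tags": {"a"}}
print(handler.dump_to_str(payload))  # {"ids": [0, 1, 2], "loss": 0.25, "tags": ["a"]}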
imaginaire/utils/easy_io/handlers/jsonl_handler.py ADDED
@@ -0,0 +1,80 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+ from typing import IO
+
+ import numpy as np
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ def set_default(obj):
+     """Set default json values for non-serializable values.
+
+     It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
+     It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
+     etc.) into plain Python built-in numbers.
+     """
+     if isinstance(obj, (set, range)):
+         return list(obj)
+     elif isinstance(obj, np.ndarray):
+         return obj.tolist()
+     elif isinstance(obj, np.generic):
+         return obj.item()
+     raise TypeError(f"{type(obj)} is unsupported for json dump")
+
+
+ class JsonlHandler(BaseFileHandler):
+     """Handler for JSON lines (JSONL) files."""
+
+     def load_from_fileobj(self, file: IO[bytes]):
+         """Load JSON objects from a newline-delimited JSON (JSONL) file object.
+
+         Returns:
+             A list of Python objects loaded from each JSON line.
+         """
+         data = []
+         for line in file:
+             line = line.strip()
+             if not line:
+                 continue  # skip empty lines if any
+             data.append(json.loads(line))
+         return data
+
+     def dump_to_fileobj(self, obj, file: IO[str], **kwargs):
+         """Dump a list of objects to a newline-delimited JSON (JSONL) file object.
+
+         Args:
+             obj: A list (or iterable) of objects to dump line by line.
+         """
+         kwargs.setdefault("default", set_default)
+         for item in obj:
+             file.write(json.dumps(item, **kwargs) + "\n")
+
+     def dump_to_str(self, obj, **kwargs):
+         """Dump a list of objects to a newline-delimited JSON (JSONL) string."""
+         kwargs.setdefault("default", set_default)
+         lines = [json.dumps(item, **kwargs) for item in obj]
+         return "\n".join(lines)
+
+
+ if __name__ == "__main__":
+     from imaginaire.utils.easy_io import easy_io
+
+     easy_io.dump([1, 2, 3], "test.jsonl", file_format="jsonl")
+     print(easy_io.load("test.jsonl"))
+     easy_io.dump([{"key1": 1, "key2": 2}, {"key1": 3, "key2": 4}], "test.jsonl", file_format="jsonl")
+     print(easy_io.load("test.jsonl"))
imaginaire/utils/easy_io/handlers/np_handler.py ADDED
@@ -0,0 +1,89 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from io import BytesIO
+ from typing import IO, Any
+
+ import numpy as np
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class NumpyHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file: IO[bytes], **kwargs) -> Any:
+         """
+         Load a NumPy array from a file-like object.
+
+         Parameters:
+             file (IO[bytes]): The file-like object containing the NumPy array data.
+             **kwargs: Additional keyword arguments passed to `np.load`.
+
+         Returns:
+             numpy.ndarray: The loaded NumPy array.
+         """
+         return np.load(file, **kwargs)
+
+     def load_from_path(self, filepath: str, **kwargs) -> Any:
+         """
+         Load a NumPy array from a file path.
+
+         Parameters:
+             filepath (str): The path to the file to load.
+             **kwargs: Additional keyword arguments passed to `np.load`.
+
+         Returns:
+             numpy.ndarray: The loaded NumPy array.
+         """
+         return super().load_from_path(filepath, mode="rb", **kwargs)
+
+     def dump_to_str(self, obj: np.ndarray, **kwargs) -> bytes:
+         """
+         Serialize a NumPy array to bytes in NumPy's binary ``.npy`` format.
+
+         Parameters:
+             obj (np.ndarray): The NumPy array to serialize.
+             **kwargs: Additional keyword arguments passed to `np.save`.
+
+         Returns:
+             bytes: The serialized array (binary data, despite the method name).
+         """
+         with BytesIO() as f:
+             np.save(f, obj, **kwargs)
+             return f.getvalue()
+
+     def dump_to_fileobj(self, obj: np.ndarray, file: IO[bytes], **kwargs):
+         """
+         Dump a NumPy array to a file-like object.
+
+         Parameters:
+             obj (np.ndarray): The NumPy array to dump.
+             file (IO[bytes]): The file-like object to which the array is dumped.
+             **kwargs: Additional keyword arguments passed to `np.save`.
+         """
+         np.save(file, obj, **kwargs)
+
+     def dump_to_path(self, obj: np.ndarray, filepath: str, **kwargs):
+         """
+         Dump a NumPy array to a file path.
+
+         Parameters:
+             obj (np.ndarray): The NumPy array to dump.
+             filepath (str): The file path where the array should be saved.
+             **kwargs: Additional keyword arguments passed to `np.save`.
+         """
+         with open(filepath, "wb") as f:
+             np.save(f, obj, **kwargs)
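A buffer round trip in `.npy` format (sketch):

import numpy as np
from io import BytesIO

handler = NumpyHandler()
buf = BytesIO()
handler.dump_to_fileobj(np.eye(2), buf)
buf.seek(0)
assert (handler.load_from_fileobj(buf) == np.eye(2)).all()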
imaginaire/utils/easy_io/handlers/pandas_handler.py ADDED
@@ -0,0 +1,31 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import pandas as pd
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler  # isort:skip
+
+
+ class PandasHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file, **kwargs):
+         return pd.read_csv(file, **kwargs)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         obj.to_csv(file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError("PandasHandler does not support dumping to str")
imaginaire/utils/easy_io/handlers/pickle_handler.py ADDED
@@ -0,0 +1,42 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import pickle
+ from io import BytesIO
+ from typing import Any
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class PickleHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file: BytesIO, **kwargs):
+         return pickle.load(file, **kwargs)
+
+     def load_from_path(self, filepath, **kwargs):
+         return super().load_from_path(filepath, mode="rb", **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         kwargs.setdefault("protocol", 2)
+         return pickle.dumps(obj, **kwargs)
+
+     def dump_to_fileobj(self, obj: Any, file: BytesIO, **kwargs):
+         kwargs.setdefault("protocol", 2)
+         pickle.dump(obj, file, **kwargs)
+
+     def dump_to_path(self, obj, filepath, **kwargs):
+         with open(filepath, "wb") as f:
+             pickle.dump(obj, f, **kwargs)
imaginaire/utils/easy_io/handlers/pil_handler.py ADDED
@@ -0,0 +1,96 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import IO
+
+ import numpy as np
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+ try:
+     from PIL import Image
+ except ImportError:
+     Image = None
+
+
+ class PILHandler(BaseFileHandler):
+     format: str
+     str_like = False
+
+     def load_from_fileobj(
+         self,
+         file: IO[bytes],
+         fmt: str = "pil",
+         size: int | tuple[int, int] | None = None,
+         **kwargs,
+     ):
+         """
+         Load an image from a file-like object and return it in a specified format.
+
+         Args:
+             file (IO[bytes]): A file-like object containing the image data.
+             fmt (str): The format to convert the image into. Options are \
+                 'numpy', 'np', 'npy' (all return numpy arrays), \
+                 'pil' (returns PIL Image), 'th', 'torch' (returns a torch tensor).
+             size (Optional[Union[int, Tuple[int, int]]]): The new size of the image as a single integer \
+                 or a tuple of (width, height). If specified, the image is resized accordingly.
+             **kwargs: Additional keyword arguments that can be passed to conversion functions.
+
+         Returns:
+             Image data in the format specified by `fmt`.
+
+         Raises:
+             IOError: If the image cannot be loaded or processed.
+             ValueError: If the specified format is unsupported.
+         """
+         try:
+             img = Image.open(file)
+             img.load()  # Explicitly load the image data
+             if size is not None:
+                 if isinstance(size, int):
+                     size = (
+                         size,
+                         size,
+                     )  # create a tuple if only one integer is provided
+                 # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter
+                 img = img.resize(size, Image.LANCZOS)
+
+             # Return the image in the requested format
+             if fmt in ["numpy", "np", "npy"]:
+                 return np.array(img, **kwargs)
+             if fmt == "pil":
+                 return img
+             if fmt in ["th", "torch"]:
+                 import torch
+
+                 # Convert to tensor
+                 img_tensor = torch.from_numpy(np.array(img, **kwargs))
+                 # Convert image from HxWxC to CxHxW
+                 if img_tensor.ndim == 3:
+                     img_tensor = img_tensor.permute(2, 0, 1)
+                 return img_tensor
+             raise ValueError(
+                 "Unsupported format. Supported formats are 'numpy', 'np', 'npy', 'pil', 'th', and 'torch'."
+             )
+         except Exception as e:
+             raise OSError(f"Unable to load image: {e}") from e
+
+     def dump_to_fileobj(self, obj, file: IO[bytes], **kwargs):
+         if "format" not in kwargs:
+             # PIL expects "JPEG" rather than "JPG" and upper-case format names;
+             # only fill in the default so an explicit caller-provided format is honored.
+             kwargs["format"] = "JPEG" if self.format.lower() == "jpg" else self.format.upper()
+         obj.save(file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
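A loading sketch (hypothetical path); `fmt='torch'` yields a CxHxW uint8 tensor:

handler = PILHandler()
handler.format = "png"  # normally set by the registry below
with open("example.png", "rb") as f:  # hypothetical file
    tensor = handler.load_from_fileobj(f, fmt="torch", size=224)  # resized to 224x224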
imaginaire/utils/easy_io/handlers/registry_utils.py ADDED
@@ -0,0 +1,82 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+ from imaginaire.utils.easy_io.handlers.byte_handler import ByteHandler
+ from imaginaire.utils.easy_io.handlers.csv_handler import CsvHandler
+ from imaginaire.utils.easy_io.handlers.gzip_handler import GzipHandler
+ from imaginaire.utils.easy_io.handlers.imageio_video_handler import ImageioVideoHandler
+ from imaginaire.utils.easy_io.handlers.json_handler import JsonHandler
+ from imaginaire.utils.easy_io.handlers.jsonl_handler import JsonlHandler
+ from imaginaire.utils.easy_io.handlers.np_handler import NumpyHandler
+ from imaginaire.utils.easy_io.handlers.pandas_handler import PandasHandler
+ from imaginaire.utils.easy_io.handlers.pickle_handler import PickleHandler
+ from imaginaire.utils.easy_io.handlers.pil_handler import PILHandler
+ from imaginaire.utils.easy_io.handlers.tarfile_handler import TarHandler
+ from imaginaire.utils.easy_io.handlers.torch_handler import TorchHandler
+ from imaginaire.utils.easy_io.handlers.torchjit_handler import TorchJitHandler
+ from imaginaire.utils.easy_io.handlers.txt_handler import TxtHandler
+ from imaginaire.utils.easy_io.handlers.yaml_handler import YamlHandler
+
+ file_handlers = {
+     "json": JsonHandler(),
+     "yaml": YamlHandler(),
+     "yml": YamlHandler(),
+     "pickle": PickleHandler(),
+     "pkl": PickleHandler(),
+     "tar": TarHandler(),
+     "jit": TorchJitHandler(),
+     "npy": NumpyHandler(),
+     "txt": TxtHandler(),
+     "csv": CsvHandler(),
+     "pandas": PandasHandler(),
+     "gz": GzipHandler(),
+     "jsonl": JsonlHandler(),
+     "byte": ByteHandler(),
+ }
+
+ for torch_type in ["pt", "pth", "ckpt"]:
+     file_handlers[torch_type] = TorchHandler()
+ for img_type in ["jpg", "jpeg", "png", "bmp", "gif"]:
+     file_handlers[img_type] = PILHandler()
+     file_handlers[img_type].format = img_type
+ for video_type in ["mp4", "avi", "mov", "webm", "flv", "wmv"]:
+     file_handlers[video_type] = ImageioVideoHandler()
+
+
+ def _register_handler(handler, file_formats):
+     """Register a handler for some file extensions.
+
+     Args:
+         handler (:obj:`BaseFileHandler`): Handler to be registered.
+         file_formats (str or list[str]): File formats to be handled by this
+             handler.
+     """
+     if not isinstance(handler, BaseFileHandler):
+         raise TypeError(f"handler must be a child of BaseFileHandler, not {type(handler)}")
+     if isinstance(file_formats, str):
+         file_formats = [file_formats]
+     if not all(isinstance(item, str) for item in file_formats):
+         raise TypeError("file_formats must be a str or a list of str")
+     for ext in file_formats:
+         file_handlers[ext] = handler
+
+
+ def register_handler(file_formats, **kwargs):
+     def wrap(cls):
+         _register_handler(cls(**kwargs), file_formats)
+         return cls
+
+     return wrap
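A decorator sketch for wiring a new extension into `file_handlers`; `TomlHandler` is hypothetical, and `tomllib` requires Python 3.11+:

import tomllib  # stdlib on Python >= 3.11

@register_handler("toml")
class TomlHandler(BaseFileHandler):
    def load_from_fileobj(self, file, **kwargs):
        return tomllib.loads(file.read())  # str_like handlers receive text

    def dump_to_fileobj(self, obj, file, **kwargs):
        raise NotImplementedError

    def dump_to_str(self, obj, **kwargs):
        raise NotImplementedError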
imaginaire/utils/easy_io/handlers/tarfile_handler.py ADDED
@@ -0,0 +1,39 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import tarfile
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class TarHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file, mode="r|*", **kwargs):
+         return tarfile.open(fileobj=file, mode=mode, **kwargs)
+
+     def load_from_path(self, filepath, mode="r|*", **kwargs):
+         return tarfile.open(filepath, mode=mode, **kwargs)
+
+     def dump_to_fileobj(self, obj, file, mode="w", **kwargs):
+         with tarfile.open(fileobj=file, mode=mode) as tar:
+             tar.add(obj, **kwargs)
+
+     def dump_to_path(self, obj, filepath, mode="w", **kwargs):
+         with tarfile.open(filepath, mode=mode) as tar:
+             tar.add(obj, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
imaginaire/utils/easy_io/handlers/torch_handler.py ADDED
@@ -0,0 +1,34 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ try:
+     import torch
+ except ImportError:
+     torch = None
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class TorchHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file, **kwargs):
+         return torch.load(file, **kwargs)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         torch.save(obj, file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
imaginaire/utils/easy_io/handlers/torchjit_handler.py ADDED
@@ -0,0 +1,34 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ try:
+     import torch
+ except ImportError:
+     torch = None
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class TorchJitHandler(BaseFileHandler):
+     str_like = False
+
+     def load_from_fileobj(self, file, **kwargs):
+         return torch.jit.load(file, **kwargs)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         torch.jit.save(obj, file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         raise NotImplementedError
imaginaire/utils/easy_io/handlers/txt_handler.py ADDED
@@ -0,0 +1,34 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler
+
+
+ class TxtHandler(BaseFileHandler):
+     def load_from_fileobj(self, file, **kwargs):
+         del kwargs
+         return file.read()
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         del kwargs
+         if not isinstance(obj, str):
+             obj = str(obj)
+         file.write(obj)
+
+     def dump_to_str(self, obj, **kwargs):
+         del kwargs
+         if not isinstance(obj, str):
+             obj = str(obj)
+         return obj
imaginaire/utils/easy_io/handlers/yaml_handler.py ADDED
@@ -0,0 +1,38 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import yaml
+
+ try:
+     from yaml import CDumper as Dumper  # type: ignore
+     from yaml import CLoader as Loader  # type: ignore
+ except ImportError:
+     from yaml import Dumper, Loader  # type: ignore
+
+ from imaginaire.utils.easy_io.handlers.base import BaseFileHandler  # isort:skip
+
+
+ class YamlHandler(BaseFileHandler):
+     def load_from_fileobj(self, file, **kwargs):
+         kwargs.setdefault("Loader", Loader)
+         return yaml.load(file, **kwargs)
+
+     def dump_to_fileobj(self, obj, file, **kwargs):
+         kwargs.setdefault("Dumper", Dumper)
+         yaml.dump(obj, file, **kwargs)
+
+     def dump_to_str(self, obj, **kwargs):
+         kwargs.setdefault("Dumper", Dumper)
+         return yaml.dump(obj, **kwargs)
imaginaire/utils/ema.py ADDED
@@ -0,0 +1,315 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import TYPE_CHECKING, Any
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ try:
24
+ from megatron.core import parallel_state
25
+
26
+ USE_MEGATRON = True
27
+ except ImportError:
28
+ USE_MEGATRON = False
29
+
30
+ from imaginaire.utils import distributed, log
31
+
32
+ if TYPE_CHECKING:
33
+ from imaginaire.model import ImaginaireModel
34
+
35
+
36
+ class FastEmaModelUpdater:
37
+ """
38
+ This class is used to update target model~(EMA) given source model~(regular model) and beta.
39
+ The method interaface mimic :class:`EMAModelTracker` and :class:`PowerEMATracker`.
40
+ Different from two classes, this class does not maintain the EMA model weights as buffers. It expects the user to have two module with same architecture and weights shape.
41
+ The class is proposed to work with FSDP model where above two classes are not working as expected. Besides, it is strange to claim model weights as buffers and do unnecessary name changing in :class:`EMAModelTracker` and :class:`PowerEMATracker`. Moeving forward, we should use this class instead of above two classes.
42
+ """
43
+
44
+ def __init__(self):
45
+ # Flag to indicate whether the cache is taken or not. Useful to avoid cache overwrite
46
+ self.is_cached = False
47
+
48
+ @torch.no_grad()
49
+ def copy_to(self, src_model: torch.nn.Module, tgt_model: torch.nn.Module) -> None:
50
+ for tgt_params, src_params in zip(tgt_model.parameters(), src_model.parameters(), strict=False):
51
+ tgt_params.data.copy_(src_params.data)
52
+
53
+ @torch.no_grad()
54
+ def update_average(self, src_model: torch.nn.Module, tgt_model: torch.nn.Module, beta: float = 0.9999) -> None:
55
+ target_list = []
56
+ source_list = []
57
+ for tgt_params, src_params in zip(tgt_model.parameters(), src_model.parameters(), strict=False):
58
+ assert tgt_params.dtype == torch.float32, (
59
+ f"EMA model only works in FP32 dtype, got {tgt_params.dtype} instead."
60
+ )
61
+ target_list.append(tgt_params)
62
+ source_list.append(src_params.data)
63
+ torch._foreach_mul_(target_list, beta)
64
+ torch._foreach_add_(target_list, source_list, alpha=1.0 - beta)
65
+
66
+ @torch.no_grad()
67
+ def cache(self, parameters: Any, is_cpu: bool = False) -> None:
68
+ """Save the current parameters for restoring later.
69
+
70
+ Args:
71
+ parameters (iterable): Iterable of torch.nn.Parameter to be temporarily stored.
72
+ """
73
+ assert self.is_cached is False, "EMA cache is already taken. Did you forget to restore it?"
74
+ device = "cpu" if is_cpu else "cuda"
75
+ self.collected_params = [param.clone().to(device) for param in parameters]
76
+ self.is_cached = True
77
+
78
+ @torch.no_grad()
79
+ def restore(self, parameters: Any) -> None:
80
+ """Restore the parameters in self.collected_params.
81
+
82
+ Useful to validate the model with EMA parameters without affecting the
83
+ original optimization process. Store the parameters before copy_to().
84
+ After validation (or model saving), use this to restore the former parameters.
85
+
86
+ Args:
87
+ parameters (iterable): Iterable of torch.nn.Parameter to be updated with the stored parameters.
88
+ """
89
+ assert self.is_cached, "EMA cache is not taken yet."
90
+ for c_param, param in zip(self.collected_params, parameters, strict=False):
91
+ param.data.copy_(c_param.data.type_as(param.data))
92
+ self.collected_params = []
93
+ # Release the cache after we call restore
94
+ self.is_cached = False
95
+
96
+
97
+ def get_buffer_name(param_name: str, torch_compile_buffer_renaming: bool = False) -> str:
98
+ """
99
+ This function creates buffer name used by EMA from parameter's name
100
+
101
+ Args:
102
+ param_name (str): Model's parameter name
103
+ Returns:
104
+ buffer_name (str): buffer name to be used for given parameter name
105
+ """
106
+
107
+ buffer_name = param_name.replace(".", "-")
108
+
109
+ if torch_compile_buffer_renaming:
110
+ # torch.compile() adds _orig_mod to state dict names, this way we get original name
111
+ buffer_name = buffer_name.replace("_orig_mod-", "")
112
+
113
+ return buffer_name
114
+
115
+
116
+ class EMAModelTracker(torch.nn.Module):
117
+ """This is a class to track the EMA model weights.
118
+
119
+ The EMA weights are registered as buffers, which are extractable as state dicts. The names follow those of the
120
+ regular weights, except all "." are replaced with "-" (limitation of register_buffer()). This is similar to SDXL's
121
+ implementation of EMA. There are no optimizable parameters.
122
+
123
+ Attributes:
124
+ collected_params (list): temporarily stores the regular weights while in EMA mode.
125
+ beta (float): EMA decay rate. (default: 0.9999).
126
+ torch_compile_buffer_renaming (bool): whether to remove '_orig_mod-' from buffer names when torch.compile is used
127
+ """
128
+
129
+ def __init__(self, model: ImaginaireModel, beta: float = 0.9999, torch_compile_buffer_renaming: bool = False):
130
+ """Constructor of the EMA model weight tracker.
131
+
132
+ Args:
133
+ model (ImaginaireModel): The PyTorch model.
134
+ beta (float): EMA decay rate. (default: 0.9999).
135
+ """
136
+ super().__init__()
137
+ self.torch_compile_buffer_renaming: bool = torch_compile_buffer_renaming
138
+ if not 0.0 <= beta <= 1.0:
139
+ raise ValueError("Decay must be between 0 and 1")
140
+ self.beta = beta
141
+ for name, param in model.named_parameters():
142
+ if param.requires_grad:
143
+ buffer_name = get_buffer_name(name, self.torch_compile_buffer_renaming)
144
+ self.register_buffer(buffer_name, param.clone().detach().data)
145
+ self.collected_params = []
146
+ # Flag to indicate whether the cache is taken or not. Useful to avoid cache overwrite
147
+ self.is_cached = False
148
+
149
+ @torch.no_grad()
150
+ def update_average(self, model: ImaginaireModel, iteration: int | None = None) -> None:
151
+ del iteration
152
+ target_list = []
153
+ source_list = []
154
+ ema_buffers = self.state_dict()
155
+ for name, param in model.named_parameters():
156
+ if param.requires_grad:
157
+ buffer_name = get_buffer_name(name, self.torch_compile_buffer_renaming)
158
+ buffer = ema_buffers[buffer_name]
159
+ assert buffer.dtype == torch.float32, f"EMA model only works in FP32 dtype, got {buffer.dtype} instead."
160
+ target_list.append(buffer)
161
+ source_list.append(param.data)
162
+ torch._foreach_mul_(target_list, self.beta)
163
+ torch._foreach_add_(target_list, source_list, alpha=1.0 - self.beta)
164
+
165
+ def copy_to(self, model: ImaginaireModel) -> None:
166
+ ema_buffers = self.state_dict()
167
+ for name, param in model.named_parameters():
168
+ if param.requires_grad:
169
+ buffer_name = get_buffer_name(name, self.torch_compile_buffer_renaming)
170
+ buffer = ema_buffers[buffer_name]
171
+ param.data.copy_(buffer.data)
172
+
173
+ def cache(self, parameters: Any, is_cpu: bool = False) -> None:
174
+ """Save the current parameters for restoring later.
175
+
176
+ Args:
177
+ parameters (iterable): Iterable of torch.nn.Parameter to be temporarily stored.
178
+ """
179
+ assert self.is_cached is False, "EMA cache is already taken. Did you forget to restore it?"
180
+ device = "cpu" if is_cpu else "cuda"
181
+ self.collected_params = [param.clone().to(device) for param in parameters]
182
+ self.is_cached = True
183
+
184
+ def restore(self, parameters: Any) -> None:
185
+ """Restore the parameters in self.collected_params.
186
+
187
+ Useful to validate the model with EMA parameters without affecting the
188
+ original optimization process. Store the parameters before copy_to().
189
+ After validation (or model saving), use this to restore the former parameters.
190
+
191
+ Args:
192
+ parameters (iterable): Iterable of torch.nn.Parameter to be updated with the stored parameters.
193
+ """
194
+ assert self.is_cached, "EMA cache is not taken yet."
195
+ for c_param, param in zip(self.collected_params, parameters, strict=False):
196
+ param.data.copy_(c_param.data.type_as(param.data))
197
+ self.collected_params = []
198
+ # Release the cache after we call restore
199
+ self.is_cached = False
200
+
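Taken together, `cache()`, `copy_to()`, and `restore()` support the usual evaluate-with-EMA-weights pattern. A hedged usage sketch; `ema`, `model`, and `validate` are assumed to exist:

```python
# Hypothetical usage: validate with EMA weights, then put the training weights back.
ema.cache(model.parameters())        # stash the current (regular) weights
ema.copy_to(model)                   # load the EMA weights into the model
validate(model)                      # e.g. run the eval loop
ema.restore(model.parameters())      # restore the regular weights; clears the cache
```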
201
+ @classmethod
202
+ def initialize_multi_rank_ema(
203
+ cls, model: torch.nn.Module, rate: float | list[float], num: int = 1, enabled: bool = True
204
+ ) -> EMAModelTracker | None:
205
+ """
206
+ Class method to initialize a per-rank EMA model tracker with a different rate.
207
+ Each rank will have a different rate based on the given configuration, resulting in different EMA weights.
208
+
209
+ Args:
210
+ model (torch.nn.Module): The neural network model to be tracked.
211
+ rate (Union[float, List[float]]): The decay rate(s) for the EMA. If a list is provided,
212
+ it corresponds to rates for different ranks.
213
+ num (int, optional): The number of leading ranks to consider for different rates.
214
+ Defaults to 1.
215
+ enabled (bool, optional): Flag to enable or disable the creation of the tracker.
216
+ If False, returns None. Defaults to True.
217
+
218
+ Returns:
219
+ Optional[EMAModelTracker]: An instance of EMAModelTracker if enabled, otherwise None.
220
+
221
+ Example:
222
+ >>> model = torch.nn.Linear(10, 2)
223
+ >>> tracker = EMAModelTracker.initialize_multi_rank_ema(model, rate=[0.1, 0.2], num=2)
224
+ >>> print(tracker)
225
+
226
+ Notes:
227
+ If `rate` is a list and the current rank is less than `num`, the rate for the current rank
228
+ is used. If the current rank exceeds `num`, the first rate in the list is used by default.
229
+ """
230
+ if not enabled:
231
+ return None
232
+ if USE_MEGATRON and parallel_state.is_initialized():
233
+ cur_dp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
234
+ log.critical(f"using MCore parallel_state for EMA initialization. DP RANK: {cur_dp_rank}", rank0_only=False)
235
+ log.warning("It should not used together with FSDP!")
236
+ else:
237
+ cur_dp_rank = distributed.get_rank()
238
+ log.critical(f"using torch.distributed for EMA initialization. DP RANK: {cur_dp_rank}", rank0_only=False)
239
+ rate = rate if isinstance(rate, list) else [rate]
240
+ num = min(num, len(rate))
241
+ rate = rate[cur_dp_rank] if cur_dp_rank < num else rate[0]
242
+ if cur_dp_rank < num:
243
+ print(f"EMAModelTracker: rank {cur_dp_rank}, rate {rate}")
244
+ return cls(model, rate)
245
+
246
+
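To make the rate selection above concrete, a small sketch of which rate each data-parallel rank would receive under the docstring's example settings (values are illustrative):

```python
# Illustrative rate assignment for rate=[0.1, 0.2], num=2 across four DP ranks.
rates, num = [0.1, 0.2], 2
for cur_dp_rank in range(4):
    rate = rates[cur_dp_rank] if cur_dp_rank < num else rates[0]
    print(cur_dp_rank, rate)   # ranks 0,1 -> 0.1, 0.2; ranks 2,3 fall back to 0.1
```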
247
+ class PowerEMATracker(EMAModelTracker):
248
+ def __init__(self, model: ImaginaireModel, s: float = 0.1, torch_compile_buffer_renaming: bool = False):
249
+ """Constructor of the EMA model weight tracker.
250
+
251
+ Args:
252
+ model (ImaginaireModel): The PyTorch model.
253
+ s (float): EMA decay rate; see the EDM2 paper.
254
+ torch_compile_buffer_renaming (bool): whether to remove '_orig_mod-' from buffer names when torch.compile is used
255
+ """
256
+ super().__init__(model=model, beta=0.0, torch_compile_buffer_renaming=torch_compile_buffer_renaming)
257
+ self.exp = np.roots([1, 7, 16 - s**-2, 12 - s**-2]).real.max()
258
+
259
+ @torch.no_grad()
260
+ def update_average(self, model: ImaginaireModel, iteration: int | None = None) -> None:
261
+ if iteration == 0:
262
+ beta = 0.0
263
+ else:
264
+ i = iteration + 1
265
+ beta = (1 - 1 / i) ** (self.exp + 1)
266
+ self.beta = beta
267
+
268
+ super().update_average(model, iteration)
269
+
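This schedule matches the power-function EMA from the EDM2 paper: `self.exp` is the largest real root gamma of the cubic `x**3 + 7*x**2 + (16 - s**-2)*x + (12 - s**-2)`, and the per-step decay is `beta_i = (1 - 1/i) ** (gamma + 1)` with `i = iteration + 1`, so beta grows toward 1 as training progresses. A minimal sketch of the schedule:

```python
import numpy as np

# Power-function EMA schedule (EDM2): beta starts at 0 and approaches 1 over training.
s = 0.1
gamma = np.roots([1, 7, 16 - s**-2, 12 - s**-2]).real.max()
for iteration in [0, 1, 10, 1000]:
    beta = 0.0 if iteration == 0 else (1 - 1 / (iteration + 1)) ** (gamma + 1)
    print(iteration, round(float(beta), 6))
```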
270
+ @classmethod
271
+ def initialize_multi_rank_ema(
272
+ cls, model: torch.nn.Module, rate: float, num: int, enabled: bool = True
273
+ ) -> PowerEMATracker | None:
274
+ """
275
+ Class method to initialize a per-rank EMA model tracker with a different rate.
276
+ Each rank will have a different rate based on the given configuration, resulting in different EMA weights.
277
+
278
+ Args:
279
+ model (torch.nn.Module): The neural network model for which the EMA tracker is being set up.
280
+ num (int): The number of ranks for which the rate adjustment is applied. Beyond this, the rate remains unchanged.
281
+ rate (float): The base decay rate for the EMA calculation.
282
+ enabled (bool, optional): Flag to enable or disable the initialization of the tracker. If False, returns None.
283
+ Defaults to True.
284
+
285
+ Returns:
286
+ Optional[PowerEMATracker]: An instance of PowerEMATracker with adjusted rate if enabled, otherwise None.
287
+
288
+ Raises:
289
+ None
290
+
291
+ Example:
292
+ >>> model = torch.nn.Linear(10, 2)
293
+ >>> tracker = PowerEMATracker.initialize_multi_rank_ema(model, num=3, rate=0.99)
294
+ >>> print(tracker)
295
+
296
+ Notes:
297
+ The decay rate is modified by dividing it by 2 raised to the power of the rank for each rank less than `num`.
298
+ If the rank is greater than or equal to `num`, the base rate is used without modification. This approach
299
+ allows higher-ranked processes to use a less aggressive decay, potentially reflecting their delayed synchronization
300
+ in a distributed training scenario.
301
+ """
302
+ if not enabled:
303
+ return None
304
+ if USE_MEGATRON and parallel_state.is_initialized():
305
+ cur_dp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
306
+ log.critical(f"using MCore parallel_state for EMA initialization. DP RANK: {cur_dp_rank}", rank0_only=False)
307
+ log.warning("It should not used together with FSDP!")
308
+ else:
309
+ cur_dp_rank = distributed.get_rank()
310
+ log.critical(f"using torch.distributed for EMA initialization. DP RANK: {cur_dp_rank}", rank0_only=False)
311
+
312
+ divider = 2**cur_dp_rank if cur_dp_rank < num else 1
313
+ if cur_dp_rank < num:
314
+ print(f"PowerEMATracker: rank {cur_dp_rank}, rate {rate / divider}")
315
+ return cls(model, rate / divider)
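Unlike the base class's list-indexed rates, this variant halves a single base rate once per rank; an illustrative sketch of the assignment:

```python
# Illustrative per-rank rate adjustment for rate=0.99, num=3 across four DP ranks.
rate, num = 0.99, 3
for cur_dp_rank in range(4):
    divider = 2**cur_dp_rank if cur_dp_rank < num else 1
    print(cur_dp_rank, rate / divider)   # 0.99, 0.495, 0.2475, then 0.99 unchanged
```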
imaginaire/utils/fused_adam.py ADDED
@@ -0,0 +1,398 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from apex.multi_tensor_apply import multi_tensor_applier
18
+
19
+ from imaginaire.utils import distributed, log
20
+
21
+
22
+ class FusedAdam(torch.optim.Optimizer):
23
+ """Implements Adam algorithm.
24
+
25
+ Currently GPU-only. Requires Apex to be installed via
26
+ ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./``.
27
+
28
+ This version of fused Adam implements 2 fusions.
29
+
30
+ * Fusion of the Adam update's elementwise operations
31
+ * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters
32
+ into one or a few kernel launches.
33
+
34
+ :class:`apex.optimizers.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
35
+ or ``torch.optim.Adam`` with ``adam_w_mode=False``::
36
+
37
+ opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
38
+ ...
39
+ opt.step()
40
+
41
+ :class:`apex.optimizers.FusedAdam` may be used with or without Amp. If you wish to use :class:`FusedAdam` with Amp,
42
+ you may choose any ``opt_level``::
43
+
44
+ opt = apex.optimizers.FusedAdam(model.parameters(), lr = ....)
45
+ model, opt = amp.initialize(model, opt, opt_level="O0" or "O1" or "O2")
46
+ ...
47
+ opt.step()
48
+
49
+ In general, ``opt_level="O1"`` is recommended.
50
+
51
+
52
+ .. warning::
53
+ A previous version of :class:`FusedAdam` allowed a number of additional arguments to ``step``.
54
+ These additional arguments are now deprecated and unnecessary.
55
+
56
+ Adam was proposed in `Adam: A Method for Stochastic Optimization`_.
57
+
58
+ Arguments:
59
+ params (iterable): iterable of parameters to optimize or dicts defining
60
+ parameter groups.
61
+ lr (float, optional): learning rate. (default: 1e-3)
62
+ betas (Tuple[float, float], optional): coefficients used for computing
63
+ running averages of gradient and its square. (default: (0.9, 0.999))
64
+ eps (float, optional): term added to the denominator to improve
65
+ numerical stability. (default: 1e-8)
66
+ weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
67
+ amsgrad (boolean, optional): whether to use the AMSGrad variant of this
68
+ algorithm from the paper `On the Convergence of Adam and Beyond`_
69
+ (default: False) NOT SUPPORTED in FusedAdam!
70
+ adam_w_mode (boolean, optional): whether to apply decoupled weight decay
71
+ (also known as AdamW) instead of L2 regularization. (default: True)
72
+ capturable (bool, optional): whether to use the version of the optimizer
73
+ that can be used with CUDA Graphs. (default: False)
74
+ master_weights (bool, optional): whether to maintain FP32 master weights
75
+ in the optimizer with FP16 mixed precision training, currently can
76
+ only be used with capturable set to True. (default: False)
77
+
78
+ .. _Adam - A Method for Stochastic Optimization:
79
+ https://arxiv.org/abs/1412.6980
80
+ .. _On the Convergence of Adam and Beyond:
81
+ https://openreview.net/forum?id=ryQu7f-RZ
82
+ """
83
+
84
+ def __init__(
85
+ self,
86
+ params,
87
+ lr=1e-3,
88
+ bias_correction=True,
89
+ betas=(0.9, 0.999),
90
+ eps=1e-8,
91
+ adam_w_mode=True,
92
+ weight_decay=0.0,
93
+ amsgrad=False,
94
+ capturable=False,
95
+ master_weights=False,
96
+ ):
97
+ if amsgrad:
98
+ raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
99
+ if master_weights and not capturable:
100
+ raise RuntimeError("Master weights is currently only supported with the capturable version.")
101
+ # If the optimizer is capturable then LR should be a tensor (on GPU)
102
+ log.warning(f"FusedAdam master_weights: {master_weights} capturable: {capturable}")
103
+ lr = torch.tensor(lr, dtype=torch.float32) if capturable else lr
104
+ defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay)
105
+ super(FusedAdam, self).__init__(params, defaults) # noqa: UP008
106
+ self.adam_w_mode = 1 if adam_w_mode else 0
107
+
108
+ self.capturable = capturable
109
+ self.master_weights = master_weights
110
+
111
+ self.param_groups_master = None
112
+
113
+ if capturable:
114
+ for idx, group in enumerate(self.param_groups):
115
+ if len(group["params"]) == 0:
116
+ continue
117
+ device = group["params"][0].device
118
+ for item in ["lr"]:
119
+ if isinstance(group[item], float):
120
+ group[item] = torch.tensor(group[item], dtype=torch.float32)
121
+ self.param_groups[idx][item] = group[item].to(device=device)
122
+
123
+ self._step_supports_amp_scaling = True
124
+
125
+ if multi_tensor_applier.available:
126
+ import amp_C
127
+
128
+ # Skip buffer
129
+ self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device="cuda")
130
+ self.multi_tensor_adam = amp_C.multi_tensor_adam
131
+ self.multi_tensor_adam_capturable = amp_C.multi_tensor_adam_capturable
132
+ self.multi_tensor_adam_capturable_master = amp_C.multi_tensor_adam_capturable_master
133
+ else:
134
+ raise RuntimeError("apex.optimizers.FusedAdam requires cuda extensions")
135
+
136
+ def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None, grad_scaler=None):
137
+ """Performs a single optimization step.
138
+
139
+ Arguments:
140
+ closure (callable, optional): A closure that reevaluates the model
141
+ and returns the loss.
142
+
143
+ The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes.
144
+ """
145
+ if any(p is not None for p in [grads, output_params, scale, grad_norms]):
146
+ raise RuntimeError(
147
+ "FusedAdam has been updated. "
148
+ "Simply initialize it identically to torch.optim.Adam, and call step() with no arguments."
149
+ )
150
+ loss = None
151
+ if closure is not None:
152
+ loss = closure()
153
+
154
+ if self.param_groups_master is None:
155
+ # Create full precision master weights
156
+ self.param_groups_master = []
157
+ for i, pg in enumerate(self.param_groups): # noqa: B007
158
+ param_list = pg["params"]
159
+ self.param_groups_master.append(
160
+ {
161
+ "params": [p.clone().detach().float() if self.master_weights else None for p in param_list],
162
+ }
163
+ )
164
+
165
+ for group, group_master in zip(self.param_groups, self.param_groups_master, strict=False):
166
+ if len(group["params"]) == 0:
167
+ continue
168
+ device = group["params"][0].device
169
+ bias_correction = 1 if group.get("bias_correction") else 0
170
+ beta1, beta2 = group["betas"]
171
+
172
+ # Assume the same step across the whole group to keep things simple;
173
+ # per-parameter steps could be supported by making "step" a tensor or passing a list into the kernel.
174
+ if "step" in group:
175
+ if self.capturable:
176
+ group["step"] = (
177
+ group["step"].to(device=device)
178
+ if isinstance(group["step"], torch.Tensor)
179
+ else torch.tensor(group["step"], dtype=torch.int32, device=device)
180
+ )
181
+ group["step"] += (self._dummy_overflow_buf != 1).to(torch.int)
182
+ else:
183
+ group["step"] += 1
184
+ else:
185
+ group["step"] = 1 if not self.capturable else torch.tensor([1], dtype=torch.int, device=device)
186
+
187
+ if self.capturable:
188
+ group["lr"] = (
189
+ group["lr"].to(device=device)
190
+ if isinstance(group["lr"], torch.Tensor)
191
+ else torch.tensor(group["lr"], dtype=torch.float32, device=device)
192
+ )
193
+
194
+ # create lists for multi-tensor apply
195
+ g_16, p_16, m_16, v_16 = [], [], [], []
196
+ g_bf, p_bf, m_bf, v_bf = [], [], [], []
197
+ g_32, p_32, m_32, v_32 = [], [], [], []
198
+ p_16_master = []
199
+ p_32_master = []
200
+ bf16_master = []
201
+
202
+ for p, p_master in zip(group["params"], group_master["params"], strict=False):
203
+ if p.grad is None:
204
+ continue
205
+ if p.grad.data.is_sparse:
206
+ raise RuntimeError(
207
+ "FusedAdam does not support sparse gradients, please consider SparseAdam instead"
208
+ )
209
+
210
+ state = self.state[p]
211
+ # State initialization
212
+ if len(state) == 0:
213
+ # Exponential moving average of gradient values
214
+ state["exp_avg"] = torch.zeros_like(p.data).float()
215
+ # Exponential moving average of squared gradient values
216
+ state["exp_avg_sq"] = torch.zeros_like(p.data).float()
217
+
218
+ if p.dtype == torch.float16:
219
+ if self.master_weights:
220
+ p_16_master.append(p_master.data)
221
+ g_16.append(p.grad.data)
222
+ p_16.append(p.data)
223
+ m_16.append(state["exp_avg"])
224
+ v_16.append(state["exp_avg_sq"])
225
+ elif p.dtype == torch.bfloat16:
226
+ if self.master_weights:
227
+ bf16_master.append(p_master.data)
228
+ g_bf.append(p.grad)
229
+ p_bf.append(p)
230
+ m_bf.append(state["exp_avg"])
231
+ v_bf.append(state["exp_avg_sq"])
232
+ elif p.dtype == torch.float32:
233
+ if self.master_weights:
234
+ p_32_master.append(p_master.data)
235
+ g_32.append(p.grad.data)
236
+ p_32.append(p.data)
237
+ m_32.append(state["exp_avg"])
238
+ v_32.append(state["exp_avg_sq"])
239
+ else:
240
+ raise RuntimeError("FusedAdam only support fp16 and fp32.")
241
+
242
+ # In the capturable case the grad scaler (if any) operates on the GPU,
243
+ # and a different multi_tensor_applier kernel must be called.
244
+ if self.capturable:
245
+ # overflow check of gradients
246
+ found_inf = (
247
+ grad_scaler._check_inf_per_device(self)[device]
248
+ if grad_scaler is not None
249
+ else torch.zeros((1,), device=device)
250
+ )
251
+ self._dummy_overflow_buf.copy_(found_inf)
252
+
253
+ # get unscale scale factor
254
+ scale, inv_scale = None, None
255
+ if grad_scaler:
256
+ scale = grad_scaler._get_scale_async()
257
+ inv_scale = scale.double().reciprocal().float()
258
+ else:
259
+ scale = torch.ones((1,), device=device, dtype=torch.float32)
260
+ inv_scale = torch.ones((1,), device=device, dtype=torch.float32)
261
+
262
+ if len(g_16) > 0:
263
+ multi_tensor_applier(
264
+ (
265
+ self.multi_tensor_adam_capturable_master
266
+ if self.master_weights
267
+ else self.multi_tensor_adam_capturable
268
+ ),
269
+ self._dummy_overflow_buf,
270
+ [g_16, p_16, m_16, v_16, p_16_master] if self.master_weights else [g_16, p_16, m_16, v_16],
271
+ group["lr"],
272
+ beta1,
273
+ beta2,
274
+ group["eps"],
275
+ group["step"],
276
+ self.adam_w_mode,
277
+ bias_correction,
278
+ group["weight_decay"],
279
+ inv_scale,
280
+ )
281
+
282
+ if len(g_bf) > 0:
283
+ multi_tensor_applier(
284
+ (
285
+ self.multi_tensor_adam_capturable_master
286
+ if self.master_weights
287
+ else self.multi_tensor_adam_capturable
288
+ ),
289
+ self._dummy_overflow_buf,
290
+ [g_bf, p_bf, m_bf, v_bf, bf16_master] if self.master_weights else [g_bf, p_bf, m_bf, v_bf],
291
+ group["lr"],
292
+ beta1,
293
+ beta2,
294
+ group["eps"],
295
+ group["step"],
296
+ self.adam_w_mode,
297
+ bias_correction,
298
+ group["weight_decay"],
299
+ inv_scale,
300
+ )
301
+
302
+ if len(g_32) > 0:
303
+ multi_tensor_applier(
304
+ (
305
+ self.multi_tensor_adam_capturable_master
306
+ if self.master_weights
307
+ else self.multi_tensor_adam_capturable
308
+ ),
309
+ self._dummy_overflow_buf,
310
+ [g_32, p_32, m_32, v_32, p_32_master] if self.master_weights else [g_32, p_32, m_32, v_32],
311
+ group["lr"],
312
+ beta1,
313
+ beta2,
314
+ group["eps"],
315
+ group["step"],
316
+ self.adam_w_mode,
317
+ bias_correction,
318
+ group["weight_decay"],
319
+ inv_scale,
320
+ )
321
+ else:
322
+ if len(g_16) > 0:
323
+ multi_tensor_applier(
324
+ self.multi_tensor_adam,
325
+ self._dummy_overflow_buf,
326
+ [g_16, p_16, m_16, v_16],
327
+ group["lr"],
328
+ beta1,
329
+ beta2,
330
+ group["eps"],
331
+ group["step"],
332
+ self.adam_w_mode,
333
+ bias_correction,
334
+ group["weight_decay"],
335
+ )
336
+
337
+ if len(g_bf) > 0:
338
+ multi_tensor_applier(
339
+ self.multi_tensor_adam,
340
+ self._dummy_overflow_buf,
341
+ [g_bf, p_bf, m_bf, v_bf],
342
+ group["lr"],
343
+ beta1,
344
+ beta2,
345
+ group["eps"],
346
+ group["step"],
347
+ self.adam_w_mode,
348
+ bias_correction,
349
+ group["weight_decay"],
350
+ )
351
+
352
+ if len(g_32) > 0:
353
+ multi_tensor_applier(
354
+ self.multi_tensor_adam,
355
+ self._dummy_overflow_buf,
356
+ [g_32, p_32, m_32, v_32],
357
+ group["lr"],
358
+ beta1,
359
+ beta2,
360
+ group["eps"],
361
+ group["step"],
362
+ self.adam_w_mode,
363
+ bias_correction,
364
+ group["weight_decay"],
365
+ )
366
+
367
+ return loss
368
+
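For orientation, the fused kernels batch the standard Adam/AdamW elementwise update across all parameters into one or a few launches. A minimal unfused reference of one AdamW-style step (`adam_w_mode=True`, bias correction enabled) on illustrative tensors; the fused path computes the same arithmetic, though its exact operation ordering may differ:

```python
import torch

# Unfused reference for one AdamW-style step (decoupled weight decay).
lr, beta1, beta2, eps, weight_decay, step = 1e-3, 0.9, 0.999, 1e-8, 0.01, 1
p = torch.randn(4)                     # parameter
g = torch.randn(4)                     # gradient
m = torch.zeros(4)                     # exp_avg
v = torch.zeros(4)                     # exp_avg_sq

m.mul_(beta1).add_(g, alpha=1 - beta1)
v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
m_hat = m / (1 - beta1**step)          # bias correction
v_hat = v / (1 - beta2**step)
p.mul_(1 - lr * weight_decay)          # decoupled weight decay (AdamW)
p.add_(m_hat / (v_hat.sqrt() + eps), alpha=-lr)
```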
369
+ def load_state_dict(self, state_dict):
370
+ super().load_state_dict(state_dict)
371
+ for group in self.param_groups:
372
+ if self.capturable:
373
+ group["lr"] = (
374
+ group["lr"].cuda()
375
+ if isinstance(group["lr"], torch.Tensor)
376
+ else torch.tensor(group["lr"], dtype=torch.float32).cuda()
377
+ )
378
+
379
+ if "step" in group:
380
+ if self.capturable:
381
+ if distributed.get_rank() == 0:
382
+ step = (
383
+ group["step"].cuda()
384
+ if isinstance(group["step"], torch.Tensor)
385
+ else torch.tensor([group["step"]], dtype=torch.int32).cuda()
386
+ )
387
+ else:
388
+ step = torch.zeros(1, dtype=torch.int32).cuda()
389
+ # make it compatible with FSDP optimizer
390
+ distributed.broadcast(step, 0)
391
+ group["step"] = step
392
+ elif isinstance(group["step"], torch.Tensor):
393
+ group["step"] = group["step"].item()
394
+ for p in group["params"]:
395
+ state = self.state[p]
396
+ if "exp_avg" in state:
397
+ state["exp_avg"] = state["exp_avg"].float()
398
+ state["exp_avg_sq"] = state["exp_avg_sq"].float()