ml-flashpoint/src/ml_flashpoint/core/checkpoint_loader.py at d68b4eac87b99d6cb4cc504d8a49eabeaa5e7989 · google/ml-flashpoint · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
import fcntl
import io
import logging
import os
import pickle
import re
import struct
from collections import defaultdict
from pathlib import Path
from typing import IO, List, Optional, Set, Tuple, TypeVar, cast

import torch
import torch.distributed as dist
from torch.distributed._shard._utils import narrow_tensor_by_index
from torch.distributed.checkpoint import Metadata
from torch.distributed.checkpoint.filesystem import _StorageInfo
from torch.distributed.checkpoint.planner import (
    LoadItemType,
    LoadPlanner,
    ReadItem,
)
from torch.distributed.checkpoint.utils import _create_file_view
from typing_extensions import override

from ml_flashpoint.checkpoint_object_manager.checkpoint_object_manager import CheckpointObjectManager
from ml_flashpoint.core.checkpoint_id_types import CheckpointContainerId, CheckpointObjectId
from ml_flashpoint.core.defaults import (
    COMMON_STATE_FNAME,
    DIRTY_MARKER_SUFFIX,
    GLOBAL_RANK_PATTERN,
    CheckpointFormat,
    default_metadata_object_name,
)
from ml_flashpoint.core.mlf_logging import get_logger
from ml_flashpoint.core.utils import get_num_of_nodes, log_execution_time
from ml_flashpoint.replication.replication_manager import ReplicationManager

M = TypeVar("M")

_LOGGER = get_logger(__name__)


# Allowlist of (module, name) pairs for safe metadata deserialization.
# Replaces insecure pickle.load() to prevent arbitrary code execution when loading
# .metadata files from untrusted sources (e.g., peer nodes, shared storage).
# See CWE-502: Deserialization of Untrusted Data.
_SAFE_METADATA_UNPICKLE_ALLOWLIST: frozenset[Tuple[str, str]] = frozenset(
    [
        # torch.distributed.checkpoint.metadata classes
        ("torch.distributed.checkpoint.metadata", "Metadata"),
        ("torch.distributed.checkpoint.metadata", "MetadataIndex"),
        ("torch.distributed.checkpoint.metadata", "TensorStorageMetadata"),
        ("torch.distributed.checkpoint.metadata", "BytesStorageMetadata"),
        ("torch.distributed.checkpoint.metadata", "StorageMeta"),
        ("torch.distributed.checkpoint.metadata", "TensorProperties"),
        ("torch.distributed.checkpoint.metadata", "ChunkStorageMetadata"),
        ("torch.distributed.checkpoint.metadata", "_MEM_FORMAT_ENCODING"),
        # torch.serialization
        ("torch.serialization", "_get_layout"),
        # torch types
        ("torch", "Size"),
        ("torch", "float32"),
        ("torch", "float16"),
        ("torch", "float64"),
        ("torch", "bfloat16"),
        ("torch", "int8"),
        ("torch", "uint8"),
        ("torch", "int16"),
        ("torch", "int32"),
        ("torch", "int64"),
        ("torch", "bool"),
        ("torch", "complex64"),
        ("torch", "complex128"),
        ("torch", "strided"),
        ("torch", "sparse_coo"),
        ("torch", "sparse_csr"),
        ("torch", "sparse_bsr"),
        ("torch", "sparse_csc"),
        ("torch", "sparse_bsc"),
        ("torch", "jagged"),
    ]
)


class _RestrictedMetadataUnpickler(pickle.Unpickler):
    """Unpickler that only allows deserializing PyTorch checkpoint Metadata classes.

    Prevents arbitrary code execution from malicious pickle payloads (CWE-502).
    """

    def find_class(self, module: str, name: str) -> type:
        key = (module, name)
        if key not in _SAFE_METADATA_UNPICKLE_ALLOWLIST:
            raise pickle.UnpicklingError(
                f"Unsafe deserialization blocked: ({module!r}, {name!r}) is not in the allowlist. "
                "Metadata files must only contain PyTorch checkpoint metadata structures."
            )
        return super().find_class(module, name)

class MLFlashpointCheckpointLoader(abc.ABC):
    """
    This is the main interface for loading checkpoints, providing functionality for the different
    stages in the load flow.

    See `DefaultMLFlashpointCheckpointLoader` as the canonical implementation, and the `ml_flashpoint.adapter`
    package for out-of-the-box framework integrations.
    """

    @abc.abstractmethod
    def read_metadata(
        self,
        checkpoint_id: CheckpointContainerId,
        object_name: str = default_metadata_object_name(),
    ) -> Metadata:
        """
        Reads and returns Metadata for the given checkpoint_id. The CheckpointObjectId to read and return is formed as
        `{checkpoint_id}/{object_name}`, where filename is optional and uses the default value when unspecified.

        Args:
            checkpoint_id: The CheckpointContainerId to read metadata for.
            object_name: The metadata object name. Optional, defaults to default_metadata_object_name().

        Returns:
            Metadata read from CheckpointObjectId(f"{checkpoint_id}/{object_name}").
        """
        pass

    @abc.abstractmethod
    def read_data(
        self,
        checkpoint_object_id: CheckpointObjectId,
        read_items: List[ReadItem],
        planner: LoadPlanner,
        storage_data: dict[int, _StorageInfo],
    ) -> None:
        """Reads data from the checkpoint object and loads it into the application state.

        Args:
            checkpoint_object_id: The ID of the checkpoint object to read from.
            read_items: A list of items to read from the checkpoint.
            planner: The load planner that coordinates the load process.
            storage_data: A dictionary containing storage information for the checkpoint.
        """
        pass

    @abc.abstractmethod
    def get_latest_complete_checkpoint(
        self, checkpoint_base_container: CheckpointContainerId
    ) -> Optional[CheckpointContainerId]:
        """Get the latest complete `CheckpointContainerId`.

        A checkpoint is considered complete if a directory `step-{step}_ckpt` exists
        and no corresponding `step-{step}_ckpt.*_unfinished` marker exists.

        Args:
            checkpoint_base_container: The base directory to search for checkpoints.

        Returns:
            A CheckpointContainerId for the latest complete checkpoint, or None if none are found.
        """
        pass


class DefaultMLFlashpointCheckpointLoader(MLFlashpointCheckpointLoader):
    """
    The default implementation of the MLFlashpointCheckpointLoader interface.
    """

    def __init__(
        self,
        checkpoint_object_manager: CheckpointObjectManager,
        replication_manager: ReplicationManager,
    ):
        """Initializes the DefaultMLFlashpointCheckpointLoader.

        Args:
            checkpoint_object_manager: The checkpoint object manager to use for
                reading data.
            replication_manager: The replication manager to use for retrieving
                missing checkpoint objects from peer nodes.
        """
        self._checkpoint_object_manager = checkpoint_object_manager
        self._replication_manager = replication_manager
        # Cache for available objects: CheckpointContainerId -> dict[object_path, list[rank]]
        self._available_objects_cache: dict[CheckpointContainerId, dict[str, List[int]]] = {}

    @override
    @log_execution_time(logger=_LOGGER, name="read_metadata")
    def read_metadata(
        self,
        checkpoint_id: CheckpointContainerId,
        object_name: str = default_metadata_object_name(),
    ) -> Metadata:
        metadata_path = Path(checkpoint_id.data) / object_name
        try:
            with open(metadata_path, "rb") as f:
                return _RestrictedMetadataUnpickler(f).load()
        except Exception:
            _LOGGER.exception("Error reading metadata from '%s'", metadata_path)
            raise

    def read_tensor(self, buffer_slice: IO[bytes], req: ReadItem, use_optimized_loader: bool = False) -> torch.Tensor:
        """Read tensor from file slice.

        Args:
            buffer_slice (IO[bytes]): file slice to read from.
            req (ReadItem): read item.
            use_optimized_loader (bool): whether to use optimized loader.

        Returns:
            torch.Tensor: read tensor.
        """
        pos = buffer_slice.tell()
        tensor: Optional[torch.Tensor] = None

        if use_optimized_loader:
            # Read as optimized format (TensorHeader)
            # First read 4 bytes for length
            len_bytes = buffer_slice.read(4)
            if len(len_bytes) == 4:
                header_len = struct.unpack("<I", len_bytes)[0]
                # stored header length should be reasonable, if it's too large, it might be legacy format
                if header_len < 1024 * 1024:
                    pickle_bytes = buffer_slice.read(header_len)

                    try:
                        tensor_header = pickle.loads(pickle_bytes)

                        tensor_dtype = tensor_header.dtype
                        tensor_shape = tensor_header.shape
                        data_bytes = buffer_slice.read()
                        tensor = torch.frombuffer(data_bytes, dtype=tensor_dtype)
                        tensor = tensor.reshape(tensor_shape)
                    except Exception:
                        _LOGGER.exception("Failed to parse tensor header")
                        raise
        # Fallback to torch.load if optimized loader fails.
        if tensor is None:
            buffer_slice.seek(pos)
            tensor = cast(
                torch.Tensor,
                torch.load(cast(IO[bytes], buffer_slice), map_location="cpu", weights_only=True),
            )
        return narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths)

    def _try_retrieve_object_if_missing(self, checkpoint_object_id: CheckpointObjectId) -> bool:
        """Attempts to retrieve a checkpoint object from peer nodes if it is missing locally.
        This method assume we have _available_objects_cache saved
        as this should be called after get_latest_complete_checkpoint, which populates _available_objects_cache.

        Args:
            checkpoint_object_id: The ID of the missing checkpoint object.

        Returns:
            True if the object was successfully retrieved or already exists, False otherwise.
        """
        if os.path.exists(checkpoint_object_id.data):
            _LOGGER.debug("Object '%s' already exists locally.", checkpoint_object_id.data)
            return True

        if not self._replication_manager:
            _LOGGER.error("ReplicationManager is not initialized.")
            return False

        current_container_id = checkpoint_object_id.get_parent()

        source_rank = -1
        found_source = False

        if current_container_id in self._available_objects_cache:
            locations = self._available_objects_cache[current_container_id]
            if checkpoint_object_id.data in locations:
                _LOGGER.debug(
                    "checkpoint_object_id.data: '%s' found in container '%s'",
                    checkpoint_object_id.data,
                    current_container_id,
                )
                sources = locations[checkpoint_object_id.data]
                if sources:
                    # TODO: Optimize the source rank selection to be more efficient,
                    # currently we just pick the first one.
                    source_rank = sources[0]
                    found_source = True

        if not found_source:
            return False

        _LOGGER.debug(
            "Object '%s' missing locally but found on rank %s. Attempting retrieval.",
            checkpoint_object_id.data,
            source_rank,
        )

        # TODO: use filelock lib.
        lock_path = f"{checkpoint_object_id.data}.lock"
        try:
            with open(lock_path, "w") as lock_file:
                fcntl.flock(lock_file, fcntl.LOCK_EX)
                try:
                    # Double check existence after acquiring lock
                    if os.path.exists(checkpoint_object_id.data):
                        _LOGGER.debug("Object '%s' appeared after acquiring lock.", checkpoint_object_id.data)
                        return True

                    _LOGGER.debug("Retrieving '%s' from rank %s", checkpoint_object_id.data, source_rank)
                    success = self._replication_manager.sync_bulk_retrieve(
                        source_global_rank=source_rank,
                        object_ids_to_retrieve=[checkpoint_object_id],
                        container_ids_to_retrieve=[],
                    )
                    if success:
                        _LOGGER.debug("Successfully retrieved '%s'", checkpoint_object_id.data)
                        return True
                    else:
                        _LOGGER.error("Failed to retrieve '%s'", checkpoint_object_id.data)
                        return False
                finally:
                    fcntl.flock(lock_file, fcntl.LOCK_UN)
        except Exception:
            _LOGGER.exception("Error during retrieval/locking for '%s'", checkpoint_object_id.data)
            return False

    @override
    @log_execution_time(logger=_LOGGER, name="read_data", level=logging.INFO)
    def read_data(
        self,
        checkpoint_object_id: CheckpointObjectId,
        read_items: List[ReadItem],
        planner: LoadPlanner,
        storage_data: dict[int, _StorageInfo],
    ) -> None:
        """
        If the checkpoint object is missing locally, attempt to retrieve
        it from a peer node with _try_retrieve_object_if_missing.
        """
        if not self._try_retrieve_object_if_missing(checkpoint_object_id):
            error_msg = f"Checkpoint object '{checkpoint_object_id.data}' does not exist on any node in the cluster"
            _LOGGER.error(error_msg)
            raise FileNotFoundError(error_msg)

        with self._checkpoint_object_manager.get_buffer(checkpoint_object_id) as stream:
            use_optimized_loader = False
            if stream.format_signature == CheckpointFormat.MLF_FORMAT:
                use_optimized_loader = True
                _LOGGER.debug("Using optimized loader for '%s'", checkpoint_object_id.data)

            for req in read_items:
                item_md = storage_data[req.storage_index]
                buffer_slice = cast(IO[bytes], _create_file_view(stream, item_md.offset, item_md.length))
                if req.type == LoadItemType.BYTE_IO:
                    read_bytes = io.BytesIO(buffer_slice.read(item_md.length))
                    read_bytes.seek(0)
                    planner.load_bytes(req, read_bytes)
                else:
                    tensor = self.read_tensor(buffer_slice, req, use_optimized_loader=use_optimized_loader)
                    target_tensor = planner.resolve_tensor(req).detach()
                    assert target_tensor.size() == tensor.size(), (
                        f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}"
                    )
                    target_tensor.copy_(tensor)
                    planner.commit_tensor(req, target_tensor)

    @override
    @log_execution_time(logger=_LOGGER, name="get_latest_complete_checkpoint", level=logging.INFO)
    def get_latest_complete_checkpoint(
        self, checkpoint_base_container: CheckpointContainerId
    ) -> Optional[CheckpointContainerId]:
        """
        Step 1: call get_candidate_checkpoints to get all existing checkpoint containers across
            all ranks as candidates and sorted in a descending order by step
        Step 2: traverse the candidate checkpoints and for each checkpoint, for each candidate:
            - call get_checkpoint_objects_by_rank to get all existing checkpoint objects cross
                all ranks of a candidate checkpoint
            - get retrieve plan for all ranks
                - make sure at least one rank has .metadata file
                - call _compute_retrieval_plan on that rank to get the retrieve plan for all ranks
                - broadcast the retrieve plan to all ranks
            - if it's retrievable, call retrieve_checkpoint for all ranks,
            else continue to the next candidate checkpoint
            - return the checkpoint container id of the latest complete checkpoint
        """
        # TODO: use global_rank_getter and local_rank_getter.
        rank = dist.get_rank()
        _LOGGER.debug(
            "Rank %s: Getting latest complete checkpoint for '%s'",
            rank,
            checkpoint_base_container,
        )

        candidate_checkpoints = self.get_candidate_checkpoints(checkpoint_base_container)
        _LOGGER.debug("Rank %s: Candidate checkpoints: '%s'", rank, candidate_checkpoints)
        if not candidate_checkpoints:
            _LOGGER.warning("No candidate checkpoints found.")
            return None

        for i, checkpoint in enumerate(candidate_checkpoints):
            available_objects_by_rank = self.get_checkpoint_objects_by_rank(checkpoint)
            _LOGGER.debug("Rank %s: Available objects by rank: '%s'", rank, available_objects_by_rank)

            planner_rank = -1
            found_metadata = False

            # Iterate in sorted order of ranks to ensure determinism
            for r in sorted(available_objects_by_rank.keys()):
                objs = available_objects_by_rank[r]
                for obj in objs:
                    if os.path.basename(obj.data) == default_metadata_object_name():
                        planner_rank = r
                        found_metadata = True
                        break
                if found_metadata:
                    break

            if found_metadata:
                _LOGGER.debug("Rank %s: Selected planner rank %s for '%s'", rank, planner_rank, checkpoint)
            else:
                _LOGGER.warning("Rank %s: No metadata found for '%s'", rank, checkpoint)
                continue

            retrieval_plan = None
            # Only the designated planner rank (the lowest rank id that has the metadata file)
            # computes the retrieval plan.
            if rank == planner_rank:
                retrieval_plan = self._compute_retrieval_plan(checkpoint, available_objects_by_rank)
            # Broadcast the retrieval plan to all ranks.
            plan_container = [retrieval_plan]
            dist.broadcast_object_list(plan_container, src=planner_rank)
            retrieval_plan = plan_container[0]

            if retrieval_plan is None:
                # If retrieval plan is None, it means the checkpoint is not viable.
                _LOGGER.warning("Rank %s: retrieval plan is None for '%s'. Not viable.", rank, checkpoint)
                continue

            if not retrieval_plan:
                # Empty dict means no retrieval needed, choose this checkpoint as the latest valid checkpoint.
                _LOGGER.debug("Rank %s: No retrieval needed for '%s'", rank, checkpoint)
                return checkpoint

            _LOGGER.debug("Rank %s: Retrieval plan: '%s'", rank, retrieval_plan)

            if self.retrieve_checkpoint(retrieval_plan):
                _LOGGER.debug("Successfully prepared checkpoint '%s' on all ranks.", checkpoint)
                return checkpoint
            else:
                _LOGGER.warning("Failed to retrieve all necessary objects for checkpoint '%s'.", checkpoint)
                continue

        _LOGGER.warning("Rank %s: No complete checkpoint found.", rank)
        return None

    def _compute_retrieval_plan(
        self,
        checkpoint: CheckpointContainerId,
        available_objects_by_rank: dict[int, List[CheckpointObjectId]],
    ) -> Optional[dict[int, List[Tuple[int, str]]]]:
        """Computes the retrieval plan.

        The plan assumes an even number of ranks (accelerator processes) on each node in the training cluster.

        Args:
            checkpoint: The checkpoint container ID.
            available_objects_by_rank: Map of rank to available objects on that rank.

        Returns:
            A retrieval plan or None if the checkpoint is not viable. Empty dict means no retrieval needed.
        """
        try:
            metadata = self.read_metadata(checkpoint)
        except Exception:
            _LOGGER.warning("Failed to read metadata for '%s'", checkpoint)
            return None

        storage_data = metadata.storage_data
        if storage_data is None:
            return None

        all_needed_checkpoint_objects_by_rank: dict[int, set[str]] = defaultdict(set)
        for _, storage_info in storage_data.items():
            match = GLOBAL_RANK_PATTERN.search(storage_info.relative_path)
            if match:
                global_rank = int(match.group(1))
                full_path = CheckpointObjectId.from_container(checkpoint, storage_info.relative_path)
                all_needed_checkpoint_objects_by_rank[global_rank].add(str(full_path))

        # Identify objects needed by the first rank of each node (local_rank 0).
        # This includes common state files, metadata, and optionally context files.
        objects_needed_by_local_rank_0 = set()
        objects_needed_by_local_rank_0.add(str(CheckpointObjectId.from_container(checkpoint, COMMON_STATE_FNAME)))
        objects_needed_by_local_rank_0.add(
            str(CheckpointObjectId.from_container(checkpoint, default_metadata_object_name()))
        )

        objects_needed_by_local_rank_0.update(self._get_extra_needed_objects(checkpoint, available_objects_by_rank))

        world_size = dist.get_world_size()
        num_nodes = get_num_of_nodes()
        ranks_per_node = world_size // num_nodes

        for rank in range(world_size):
            # Only local_node 0 needs to retrieve these common objects
            if rank % ranks_per_node == 0:
                all_needed_checkpoint_objects_by_rank[rank].update(objects_needed_by_local_rank_0)

        # Build reverse map for object locations
        object_locations = defaultdict(list)
        for rank, objects in available_objects_by_rank.items():
            for obj in objects:
                obj_str = str(obj)
                object_locations[obj_str].append(rank)

        retrieval_plan = {}
        for target_rank, needed_objs in all_needed_checkpoint_objects_by_rank.items():
            already_has = {str(o) for o in available_objects_by_rank.get(target_rank, [])}
            missing = needed_objs - already_has

            if missing:
                retrieval_plan[target_rank] = []
                for missing_obj in missing:
                    sources = object_locations.get(missing_obj)
                    if not sources:
                        # If it's missing globally, we can't retrieve it.
                        # This will result in returning None for the plan, effectively skipping this checkpoint.
                        _LOGGER.warning("Object '%s' is missing globally (needed by rank %s)", missing_obj, target_rank)
                        return None
                    # TODO: Optimize the source rank selection to be more efficient
                    # default to the first source rank for now.
                    retrieval_plan[target_rank].append((sources[0], missing_obj))

        return retrieval_plan

    @log_execution_time(logger=_LOGGER, name="get_candidate_checkpoints")
    def get_candidate_checkpoints(
        self, checkpoint_base_container: CheckpointContainerId
    ) -> List[CheckpointContainerId]:
        """Gathers all apparently finished checkpoint containers from all nodes and returns the sorted union.

        This method first finds all locally available checkpoints, then gathers the lists from all nodes
        in the distributed environment, computes the intersection of these lists, and returns a single sorted
        list of candidate checkpoints.

        Args:
            checkpoint_base_container: The base container ID to search for checkpoints.

        Returns:
            A sorted list of candidate checkpoint container IDs in descending order.
        """
        _LOGGER.debug("Getting candidate checkpoints for '%s'", checkpoint_base_container)

        # Scan locally only on the first rank of each node
        base_path = Path(checkpoint_base_container.data)
        rank = dist.get_rank()
        local_rank = dist.get_node_local_rank()

        local_candidate_ckpt_ids = []

        _LOGGER.debug("Rank %s (Local Rank %s): Checking base path: '%s'", rank, local_rank, base_path)
        if base_path.is_dir():
            ckpt_pattern = re.compile(r"(step-\d+_ckpt)")
            potential_ckpts = set()
            unfinished_ckpts = set()

            for entry in os.listdir(base_path):
                ckpt_match = ckpt_pattern.match(entry)
                if ckpt_match:
                    if not entry.endswith(DIRTY_MARKER_SUFFIX):
                        potential_ckpts.add(entry)
                    else:
                        ckpt_id = ckpt_match.group(1)
                        _LOGGER.debug("Found unfinished marker: '%s'", entry)
                        unfinished_ckpts.add(ckpt_id)

            local_candidate_ckpt_ids = [str(base_path / p) for p in (potential_ckpts - unfinished_ckpts)]
        else:
            _LOGGER.debug("Rank %s: Base path '%s' is not a directory or does not exist.", rank, base_path)

        all_checkpoint_container_path_lists = [None for _ in range(dist.get_world_size())]
        dist.all_gather_object(all_checkpoint_container_path_lists, local_candidate_ckpt_ids)
        _LOGGER.debug(
            "Rank %s: Gathered checkpoint container paths from all ranks: '%s'",
            rank,
            all_checkpoint_container_path_lists,
        )

        # Filter out None values and use set.intersection to gather all candidates
        valid_path_lists = [set(paths) for paths in all_checkpoint_container_path_lists if paths]
        if not valid_path_lists:
            _LOGGER.debug("No valid checkpoint container paths found.")
            return []
        else:
            intersection_of_checkpoint_containers = set.intersection(*valid_path_lists)

        _LOGGER.debug(
            "Rank %s: Intersection of checkpoint containers: '%s'", rank, intersection_of_checkpoint_containers
        )

        candidate_ckpt_ids = [CheckpointContainerId(p) for p in intersection_of_checkpoint_containers]

        return sorted(
            candidate_ckpt_ids,
            key=lambda cid: CheckpointContainerId.parse_version_container_step(os.path.basename(cid.data)) or -1,
            reverse=True,
        )

    @log_execution_time(logger=_LOGGER, name="get_checkpoint_objects_by_rank")
    def get_checkpoint_objects_by_rank(
        self, checkpoint_container_id: CheckpointContainerId
    ) -> dict[int, List[CheckpointObjectId]]:
        """Gathers all available checkpoint objects from all nodes for a given checkpoint.

        This method directly inspects the checkpoint container directory on each node
        to identify checkpoint object files.

        Args:
            checkpoint_container_id: The ID of the checkpoint container to inspect.

        Returns:
            A dictionary mapping each rank to a list of
            `CheckpointObjectId`s available on that rank.
        """
        container_path = Path(checkpoint_container_id.data)
        local_objects: List[CheckpointObjectId] = []
        if not container_path.is_dir():
            _LOGGER.debug(
                "Checkpoint container path '%s' is not a directory. Returning empty list.",
                container_path,
            )
        else:
            for entry in os.listdir(container_path):
                local_objects.append(CheckpointObjectId.from_container(checkpoint_container_id, entry))

            local_objects.extend(self._get_extra_local_objects(container_path))

        all_objects_by_rank_paths = [None for _ in range(dist.get_world_size())]
        dist.all_gather_object(all_objects_by_rank_paths, local_objects)

        result = {}
        object_locations = defaultdict(list)
        if all_objects_by_rank_paths:
            for rank, objects in enumerate(all_objects_by_rank_paths):
                if objects:
                    result[rank] = objects
                    for obj in objects:
                        object_locations[obj.data].append(rank)
                else:
                    result[rank] = []

        self._available_objects_cache[checkpoint_container_id] = object_locations
        _LOGGER.debug("Available objects for '%s': '%s'", checkpoint_container_id, object_locations)
        _LOGGER.debug("_available_objects_cache '%s'", self._available_objects_cache)
        return result

    @log_execution_time(logger=_LOGGER, name="retrieve_checkpoint")
    def retrieve_checkpoint(
        self,
        retrieval_plan: dict[int, List[Tuple[int, str]]],
    ) -> bool:
        """Retrieves missing checkpoint objects based on the retrieval plan.

        Args:
            retrieval_plan: A dictionary mapping Rank -> List of (SourceRank, ObjectPath).
                            If empty for this rank, no retrieval is needed.
        """

        rank = dist.get_rank()
        all_success = True

        # Only proceed with retrieval if we have items to retrieve
        if retrieval_plan and rank in retrieval_plan:
            items_to_retrieve = retrieval_plan[rank]
            if items_to_retrieve:
                _LOGGER.debug("Retrieving %d objects for rank %s", len(items_to_retrieve), rank)

                # Group by source rank
                objects_to_retrieve_by_source: dict[int, List[CheckpointObjectId]] = {}
                for source_rank, obj_path in items_to_retrieve:
                    if source_rank not in objects_to_retrieve_by_source:
                        objects_to_retrieve_by_source[source_rank] = []
                    objects_to_retrieve_by_source[source_rank].append(CheckpointObjectId(obj_path))

                _LOGGER.debug("Retrieving %d objects for rank %s", len(objects_to_retrieve_by_source), rank)

                for source_rank, objects_to_retrieve in objects_to_retrieve_by_source.items():
                    _LOGGER.debug(
                        "Retrieving %d objects for rank %s from source rank %s",
                        len(objects_to_retrieve),
                        rank,
                        source_rank,
                    )
                    success = self._replication_manager.sync_bulk_retrieve(
                        source_global_rank=source_rank,
                        object_ids_to_retrieve=objects_to_retrieve,
                        container_ids_to_retrieve=[],  # Not used yet
                    )
                    if not success:
                        _LOGGER.error("Failed to retrieve objects from rank %s", source_rank)
                        all_success = False

        # Gather success status from all ranks
        _LOGGER.debug("Gathering success status from all ranks")
        all_success_list = [None for _ in range(dist.get_world_size())]
        dist.all_gather_object(all_success_list, all_success)
        _LOGGER.debug("All success list: '%s'", all_success_list)
        return all(all_success_list)

    def _get_extra_local_objects(self, container_path: Path) -> List[CheckpointObjectId]:
        """Hook for subclasses to provide extra local objects that are available on the current rank,
        which may be needed by other ranks. This would be called on every rank.

        This can be used when additional objects beyond the standard checkpoint data are needed,
        such as framework-specific context data.

        This should always be implemented alongside `_get_extra_needed_objects`.

        Returns:
            List of additional locally available objects.
        """
        return []

    def _get_extra_needed_objects(
        self,
        checkpoint: CheckpointContainerId,
        available_objects_by_rank: dict[int, List[CheckpointObjectId]],
    ) -> Set[str]:
        """Hook for subclasses to provide extra needed objects for each node.

        The objects returned by this method are considered necessary for the first rank
        (local rank 0) of every node.

        This can leverage `available_objects_by_rank` to determine the set of additional objects
        needed.

        This should always be implemented alongside `_get_extra_local_objects`.

        Returns:
            Set of extra needed objects on each node (specifically local rank 0).
        """
        return set()