
Commit d44ef31

[train v2][doc] Update user guides for metrics, checkpoints, results, and experiment tracking (#51204)
Updates a few user guides, mostly around the reporting of free-floating metrics, which Ray Train no longer persists. Ray Train only keeps metrics that are attached to reported checkpoints.

Signed-off-by: Justin Yu <[email protected]>
1 parent 2a1add3 commit d44ef31
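
The pattern this change standardizes on, attaching metrics to a reported checkpoint, looks roughly like the following sketch. This is a minimal illustration rather than code from the diff; the placeholder model, metric values, and "model.pt" file name are assumptions.

import os
import tempfile

import torch
import ray.train
from ray.train.torch import TorchTrainer


def train_func(config):
    model = torch.nn.Linear(4, 1)  # placeholder model for illustration
    for epoch in range(2):
        with tempfile.TemporaryDirectory() as tmpdir:
            # Save the training state to a local temporary directory.
            torch.save(model.state_dict(), os.path.join(tmpdir, "model.pt"))
            # Metrics attached to a reported checkpoint are kept by Ray Train;
            # metrics reported without a checkpoint are no longer persisted.
            ray.train.report(
                {"epoch": epoch, "loss": 0.0},
                checkpoint=ray.train.Checkpoint.from_directory(tmpdir),
            )


trainer = TorchTrainer(
    train_func, scaling_config=ray.train.ScalingConfig(num_workers=2)
)
result = trainer.fit()
print(result.metrics)  # metrics attached to the latest reported checkpoint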

File tree

8 files changed, +138 -134 lines changed


doc/source/train/doc_code/checkpoints.py

Lines changed: 5 additions & 16 deletions
@@ -155,14 +155,6 @@ def train_func(config):
     run_config=train.RunConfig(failure_config=train.FailureConfig(max_failures=1)),
 )
 result = trainer.fit()
-
-# Seed a training run with a checkpoint using `resume_from_checkpoint`
-trainer = TorchTrainer(
-    train_func,
-    train_loop_config={"num_epochs": 5},
-    scaling_config=ScalingConfig(num_workers=2),
-    resume_from_checkpoint=result.checkpoint,
-)
 # __pytorch_restore_end__

 # __checkpoint_from_single_worker_start__
@@ -249,7 +241,7 @@ def on_train_epoch_end(self, trainer, pl_module):
         should_checkpoint = trainer.current_epoch % 3 == 0

         with TemporaryDirectory() as tmpdir:
-            # Fetch metrics
+            # Fetch metrics from `self.log(..)` in the LightningModule
             metrics = trainer.callback_metrics
             metrics = {k: v.item() for k, v in metrics.items()}

@@ -289,21 +281,18 @@ def train_func():
     checkpoint = train.get_checkpoint()
     if checkpoint:
         with checkpoint.as_directory() as ckpt_dir:
-            ckpt_path = os.path.join(ckpt_dir, "checkpoint.ckpt")
+            ckpt_path = os.path.join(ckpt_dir, RayTrainReportCallback.CHECKPOINT_NAME)
             trainer.fit(model, datamodule=datamodule, ckpt_path=ckpt_path)
     else:
         trainer.fit(model, datamodule=datamodule)


-# Build a Ray Train Checkpoint
-# Suppose we have a Lightning checkpoint at `s3://bucket/ckpt_dir/checkpoint.ckpt`
-checkpoint = Checkpoint("s3://bucket/ckpt_dir")
-
-# Resume training from checkpoint file
 ray_trainer = TorchTrainer(
     train_func,
     scaling_config=train.ScalingConfig(num_workers=2),
-    resume_from_checkpoint=checkpoint,
+    run_config=train.RunConfig(
+        checkpoint_config=train.CheckpointConfig(num_to_keep=2),
+    ),
 )
 # __lightning_restore_example_end__


doc/source/train/doc_code/key_concepts.py

Lines changed: 4 additions & 3 deletions
@@ -122,11 +122,12 @@ def train_fn(config):
 # __result_path_end__


+# TODO(justinvyu): Result.from_path is not supported in Train V2 yet.
 # __result_restore_start__
-from ray.train import Result
+# from ray.train import Result

-restored_result = Result.from_path(result_path)
-print("Restored loss", result.metrics["loss"])
+# restored_result = Result.from_path(result_path)
+# print("Restored loss", result.metrics["loss"])
 # __result_restore_end__


doc/source/train/doc_code/torchmetrics_example.py renamed to doc/source/train/doc_code/metric_logging.py

Lines changed: 61 additions & 8 deletions
@@ -1,11 +1,19 @@
 # flake8: noqa
 # isort: skip_file

-# __start__
+import os
+
+os.environ["RAY_TRAIN_V2_ENABLED"] = "1"
+
+
+# __torchmetrics_start__

 # First, pip install torchmetrics
 # This code is tested with torchmetrics==0.7.3 and torch==1.12.1

+import os
+import tempfile
+
 import ray.train.torch
 from ray import train
 from ray.train import ScalingConfig
@@ -62,13 +70,19 @@ def train_func(config):
         mape_collected = mape.compute().item()
         mean_valid_loss_collected = mean_valid_loss.compute().item()

-        train.report(
-            {
-                "mape_collected": mape_collected,
-                "valid_loss": valid_loss,
-                "mean_valid_loss_collected": mean_valid_loss_collected,
-            }
-        )
+        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
+            torch.save(
+                model.state_dict(), os.path.join(temp_checkpoint_dir, "model.pt")
+            )
+
+            train.report(
+                {
+                    "mape_collected": mape_collected,
+                    "valid_loss": valid_loss,
+                    "mean_valid_loss_collected": mean_valid_loss_collected,
+                },
+                checkpoint=train.Checkpoint.from_directory(temp_checkpoint_dir),
+            )

         # reset for next epoch
         mape.reset()
@@ -83,3 +97,42 @@ def train_func(config):
 result = trainer.fit()
 print(result.metrics["valid_loss"], result.metrics["mean_valid_loss_collected"])
 # 0.5109779238700867 0.5512474775314331
+
+# __torchmetrics_end__
+
+# __report_callback_start__
+import os
+
+assert os.environ["RAY_TRAIN_V2_ENABLED"] == "1"
+
+from typing import Any, Dict, List, Optional
+
+import ray.train
+import ray.train.torch
+
+
+def train_fn_per_worker(config):
+    # Free-floating metrics can be accessed from the callback below.
+    ray.train.report({"rank": ray.train.get_context().get_world_rank()})
+
+
+class CustomMetricsCallback(ray.train.UserCallback):
+    def after_report(
+        self,
+        run_context,
+        metrics: List[Dict[str, Any]],
+        checkpoint: Optional[ray.train.Checkpoint],
+    ):
+        rank_0_metrics = metrics[0]
+        print(rank_0_metrics)
+        # Ex: Write metrics to a file...
+
+
+trainer = ray.train.torch.TorchTrainer(
+    train_fn_per_worker,
+    scaling_config=ray.train.ScalingConfig(num_workers=2),
+    run_config=ray.train.RunConfig(callbacks=[CustomMetricsCallback()]),
+)
+trainer.fit()
+
+# __report_callback_end__

doc/source/train/user-guides/checkpoints.rst

Lines changed: 9 additions & 14 deletions
@@ -7,13 +7,10 @@ Ray Train provides a way to snapshot training progress with :class:`Checkpoints

 This is useful for:

-1. **Storing the best-performing model weights:** Save your model to persistent storage, and use it for downstream serving/inference.
-2. **Fault tolerance:** Handle node failures in a long-running training job on a cluster of pre-emptible machines/pods.
-3. **Distributed checkpointing:** When doing *model-parallel training*, Ray Train checkpointing provides an easy way to
-   :ref:`upload model shards from each worker in parallel <train-distributed-checkpointing>`,
-   without needing to gather the full model to a single node.
-4. **Integration with Ray Tune:** Checkpoint saving and loading is required by certain :ref:`Ray Tune schedulers <tune-schedulers>`.
-
+1. **Storing the best-performing model weights:** Save your model to persistent storage, and use it for downstream serving or inference.
+2. **Fault tolerance:** Handle worker process and node failures in a long-running training job and leverage pre-emptible machines.
+3. **Distributed checkpointing:** Ray Train checkpointing can be used to
+   :ref:`upload model shards from multiple workers in parallel. <train-distributed-checkpointing>`

 .. _train-dl-saving-checkpoints:

@@ -69,8 +66,8 @@ Then, the local temporary directory can be safely cleaned up to free up disk spa
     :start-after: __checkpoint_from_single_worker_start__
     :end-before: __checkpoint_from_single_worker_end__

-If using parallel training strategies such as DeepSpeed Zero-3 and FSDP, where
-each worker only has a shard of the full-model, you should save and report a checkpoint
+If using parallel training strategies such as DeepSpeed Zero and FSDP, where
+each worker only has a shard of the full training state, you can save and report a checkpoint
 from each worker. See :ref:`train-distributed-checkpointing` for an example.


@@ -310,12 +307,10 @@ training state from a :class:`~ray.train.Checkpoint`.
 The :class:`Checkpoint <ray.train.Checkpoint>` to restore from can be accessed in the
 training function with :func:`ray.train.get_checkpoint <ray.train.get_checkpoint>`.

-The checkpoint returned by :func:`ray.train.get_checkpoint <ray.train.get_checkpoint>` is populated in two ways:
-
-1. It can be auto-populated as the latest reported checkpoint, e.g. during :ref:`automatic failure recovery <train-fault-tolerance>` or :ref:`on manual restoration <train-restore-guide>`.
-2. It can be manually populated by passing a checkpoint to the ``resume_from_checkpoint`` argument of a Ray :class:`Trainer <ray.train.trainer.BaseTrainer>`.
-   This is useful for initializing a new training run with a previous run's checkpoint.
+The checkpoint returned by :func:`ray.train.get_checkpoint <ray.train.get_checkpoint>` is populated
+as the latest reported checkpoint during :ref:`automatic failure recovery <train-fault-tolerance>`.

+See :ref:`train-fault-tolerance` for more details on restoration and fault tolerance.

 .. tab-set::

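The distributed checkpointing mentioned in the changes above refers to each worker saving and reporting its own checkpoint shard. A minimal sketch of that pattern follows, not taken from this diff; the rank-specific shard file name is chosen for illustration:

import os
import tempfile

import torch
import ray.train


def train_func(config):
    model = torch.nn.Linear(8, 1)  # stand-in for a sharded model state
    rank = ray.train.get_context().get_world_rank()
    with tempfile.TemporaryDirectory() as tmpdir:
        # Each worker saves only its own shard under a rank-specific name...
        torch.save(model.state_dict(), os.path.join(tmpdir, f"model-rank={rank}.pt"))
        # ...and every worker reports a checkpoint, so shards upload in parallel.
        ray.train.report(
            {"epoch": 0},
            checkpoint=ray.train.Checkpoint.from_directory(tmpdir),
        )
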
doc/source/train/user-guides/experiment-tracking.rst

Lines changed: 0 additions & 26 deletions
@@ -4,11 +4,6 @@
 Experiment Tracking
 ===================

-.. note::
-    This guide is relevant for all trainers in which you define a custom training loop.
-    This includes :class:`TorchTrainer <ray.train.torch.TorchTrainer>` and
-    :class:`TensorflowTrainer <ray.train.tensorflow.TensorflowTrainer>`.
-
 Most experiment tracking libraries work out-of-the-box with Ray Train.
 This guide provides instructions on how to set up the code so that your favorite experiment tracking libraries
 can work for distributed training with Ray Train. The end of the guide has common errors to aid in debugging
@@ -253,27 +248,6 @@ Refer to the tracking libraries' documentation for semantics.

 When performing **fault-tolerant training** with auto-restoration, use a
 consistent ID to configure all tracking runs that logically belong to the same training run.
-One way to acquire an unique ID is with the following method:
-:meth:`ray.train.get_context().get_trial_id() <ray.train.context.TrainContext.get_trial_id>`.
-
-.. testcode::
-    :skipif: True
-
-    import ray
-    from ray.train import ScalingConfig, RunConfig, FailureConfig
-    from ray.train.torch import TorchTrainer
-
-    def train_func():
-        if ray.train.get_context().get_world_rank() == 0:
-            wandb.init(id=ray.train.get_context().get_trial_id())
-        ...
-
-    trainer = TorchTrainer(
-        train_func,
-        run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
-    )
-
-    trainer.fit()


 Step 3: Log metrics

doc/source/train/user-guides/hyperparameter-optimization.rst

Lines changed: 2 additions & 0 deletions
@@ -140,6 +140,8 @@ Fault tolerance on the Ray Train side is configured and handled separately. See
     :end-before: __fault_tolerance_end__


+.. _train-with-tune-callbacks:
+
 Advanced: Using Ray Tune callbacks
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/train/user-guides/monitoring-logging.rst

Lines changed: 40 additions & 53 deletions
@@ -3,60 +3,15 @@
 Monitoring and Logging Metrics
 ==============================

-Ray Train provides an API for reporting intermediate
-results and checkpoints from the training function (run on distributed workers) up to the
-``Trainer`` (where your python script is executed) by calling ``train.report(metrics)``.
-The results will be collected from the distributed workers and passed to the driver to
-be logged and displayed.
+Ray Train provides an API for attaching metrics to :ref:`checkpoints <train-checkpointing>` from the training function by calling :func:`ray.train.report(metrics, checkpoint) <ray.train.report>`.
+The results will be collected from the distributed workers and passed to the Ray Train driver process for book-keeping.

-.. warning::
+The primary use-case for reporting is for metrics (accuracy, loss, etc.) at the end of each training epoch. See :ref:`train-dl-saving-checkpoints` for usage examples.

-    Only the results from rank 0 worker will be used. However, in order to ensure
-    consistency, ``train.report()`` has to be called on each worker. If you
-    want to aggregate results from multiple workers, see :ref:`train-aggregating-results`.
+Only the result reported by the rank 0 worker will be attached to the checkpoint.
+However, in order to ensure consistency, ``train.report()`` acts as a barrier and must be called on each worker.
+To aggregate results from multiple workers, see :ref:`train-aggregating-results`.

-The primary use-case for reporting is for metrics (accuracy, loss, etc.) at
-the end of each training epoch.
-
-.. tab-set::
-
-    .. tab-item:: PyTorch
-
-        .. testcode::
-
-            from ray import train
-
-            def train_func():
-                ...
-                for i in range(num_epochs):
-                    result = model.train(...)
-                    train.report({"result": result})
-
-    .. tab-item:: PyTorch Lightning
-
-        In PyTorch Lightning, we use a callback to call ``train.report()``.
-
-        .. testcode::
-            :skipif: True
-
-            from ray import train
-            import pytorch_lightning as pl
-            from pytorch_lightning.callbacks import Callback
-
-            class MyRayTrainReportCallback(Callback):
-                def on_train_epoch_end(self, trainer, pl_module):
-                    metrics = trainer.callback_metrics
-                    metrics = {k: v.item() for k, v in metrics.items()}
-
-                    train.report(metrics=metrics)
-
-            def train_func_per_worker():
-                ...
-                trainer = pl.Trainer(
-                    # ...
-                    callbacks=[MyRayTrainReportCallback()]
-                )
-                trainer.fit()

 .. _train-aggregating-results:

@@ -77,6 +32,38 @@ metrics from multiple workers.

 Here is an example of reporting both the aggregated R2 score and mean train and validation loss from all workers.

-.. literalinclude:: ../doc_code/torchmetrics_example.py
+.. literalinclude:: ../doc_code/metric_logging.py
     :language: python
-    :start-after: __start__
+    :start-after: __torchmetrics_start__
+    :end-before: __torchmetrics_end__
+
+
+.. _train-metric-only-reporting-deprecation:
+
+(Deprecated) Reporting free-floating metrics
+--------------------------------------------
+
+Reporting metrics with ``ray.train.report(metrics, checkpoint=None)`` from every worker writes the metrics to a Ray Tune log file (``progress.csv``, ``result.json``)
+and is accessible via the ``Result.metrics_dataframe`` on the :class:`~ray.train.Result` returned by ``trainer.fit()``.
+
+As of Ray 2.43, this behavior is deprecated and will not be supported in Ray Train V2,
+which is an overhaul of Ray Train's implementation and select APIs.
+
+Ray Train V2 only keeps a slim set of experiment tracking features that are necessary for fault tolerance, so it does not support reporting free-floating metrics that are not attached to checkpoints.
+The recommendation for metric tracking is to report metrics directly from the workers to experiment tracking tools such as MLFlow and WandB.
+See :ref:`train-experiment-tracking-native` for examples.
+
+In Ray Train V2, reporting only metrics from all workers is a no-op. However, it is still possible to access the results reported by all workers to implement custom metric-handling logic.
+
+.. literalinclude:: ../doc_code/metric_logging.py
+    :language: python
+    :start-after: __report_callback_start__
+    :end-before: __report_callback_end__
+
+
+To use Ray Tune :class:`Callbacks <ray.tune.Callback>` that depend on free-floating metrics reported by workers, :ref:`run Ray Train as a single Ray Tune trial. <train-with-tune-callbacks>`
+
+See the following resources for more information:
+
+* `Train V2 REP <https://github.com/ray-project/enhancements/blob/main/reps/2024-10-18-train-tune-api-revamp/2024-10-18-train-tune-api-revamp.md>`_: Technical details about the API changes in Train V2
+* `Train V2 Migration Guide <https://github.com/ray-project/ray/issues/49454>`_: Full migration guide for Train V2
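
For the direct-to-tracking-tool approach that the new text recommends, a minimal sketch looks like the following, assuming Weights & Biases; the project name and metric keys are illustrative:

import ray.train
import wandb


def train_func(config):
    # Log metrics straight from the rank 0 worker to the tracking tool,
    # instead of relying on Ray Train to persist free-floating metrics.
    if ray.train.get_context().get_world_rank() == 0:
        wandb.init(project="my-project")
    for epoch in range(config.get("num_epochs", 1)):
        loss = 0.0  # placeholder for the real training loss
        if ray.train.get_context().get_world_rank() == 0:
            wandb.log({"epoch": epoch, "loss": loss})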

0 commit comments
