
Commit d875c8c

aparna-aketi authored and facebook-github-bot committed
FSDP2 with Ghost Clipping and Fast Gradient Clipping prototyping (#761)
Summary:
Integrating FSDP2 with Opacus. First prototype:
1. FSDP is supported only if all layers with trainable parameters are supported by ghost clipping or fast gradient clipping.
2. No freezing/unfreezing of parameters in between training.

Design Doc: [Opacus Ghost Clipping and FSDP2](https://docs.google.com/document/d/1MHqIMKBAXhkUZYQ9kkHCmUs3iLq_G5Q25uS1u3g7Asw/edit?tab=t.0#heading=h.eqambyjwzqsu)

Differential Revision: D70533184
1 parent cb6284f commit d875c8c
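As a quick orientation before the file-by-file diff, here is a minimal usage sketch, not part of the commit, showing how the new "ghost_fsdp" grad sample mode is resolved through the helpers touched below. It assumes an Opacus build that includes this commit; the toy nn.Linear model and hyperparameter values are placeholders, and real FSDP2 training would additionally need an initialized process group with FSDP2-sharded submodules, which the snippet omits.

import torch.nn as nn

from opacus.grad_sample.utils import get_gsm_class
from opacus.optimizers import get_optimizer_class

# Resolve the classes registered for the new mode (see the utils.py and
# optimizers/__init__.py changes below).
gsm_cls = get_gsm_class("ghost_fsdp")  # GradSampleModuleFastGradientClippingFSDP
opt_cls = get_optimizer_class(
    clipping="flat", distributed=True, grad_sample_mode="ghost_fsdp"
)  # FSDPOptimizerFastGradientClipping

# Wrapping a (placeholder) model: the wrapper computes per-sample gradient norms
# via ghost clipping instead of materializing per-sample gradients. Signature
# taken from the new GradSampleModuleFastGradientClippingFSDP below.
dp_model = gsm_cls(
    nn.Linear(16, 4), batch_first=True, loss_reduction="mean", max_grad_norm=1.0
)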

File tree

9 files changed, +498 -11 lines changed


opacus/grad_sample/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,9 @@
 from .grad_sample_module_fast_gradient_clipping import (  # noqa
     GradSampleModuleFastGradientClipping,
 )
+from .grad_sample_module_fast_gradient_clipping_fsdp import (  # noqa
+    GradSampleModuleFastGradientClippingFSDP,
+)
 from .group_norm import compute_group_norm_grad_sample  # noqa
 from .gsm_base import AbstractGradSampleModule
 from .gsm_exp_weights import GradSampleModuleExpandedWeights
@@ -41,6 +44,7 @@
 __all__ = [
     "GradSampleModule",
     "GradSampleModuleFastGradientClipping",
+    "GradSampleModuleFastGradientClippingFSDP",
     "GradSampleModuleExpandedWeights",
     "GradSampleModuleNoOp",
     "AbstractGradSampleModule",

opacus/grad_sample/grad_sample_module.py

Lines changed: 8 additions & 3 deletions
@@ -20,8 +20,6 @@
 from functools import partial
 from typing import Iterable, List, Tuple
 
-import torch
-import torch.nn as nn
 from opacus.grad_sample.functorch import ft_compute_per_sample_gradient, prepare_layer
 from opacus.grad_sample.gsm_base import AbstractGradSampleModule
 from opacus.layers.dp_rnn import DPGRU, DPLSTM, DPRNN, RNNLinear
@@ -32,6 +30,9 @@
     trainable_parameters,
 )
 
+import torch
+import torch.nn as nn
+
 
 logger = logging.getLogger(__name__)
 logger.disabled = True
@@ -199,7 +200,11 @@ def add_hooks(
             if type(module) in [DPRNN, DPLSTM, DPGRU]:
                 continue
 
-            if force_functorch or not type(module) in self.GRAD_SAMPLERS:
+            module_type = next(
+                (i for i in self.GRAD_SAMPLERS.keys() if isinstance(module, i)),
+                type(module),
+            )
+            if force_functorch or not (module_type in self.GRAD_SAMPLERS):
                 prepare_layer(module, batch_first=batch_first)
 
             self.autograd_grad_sample_hooks.append(
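The change above swaps the exact type(module) lookup for an isinstance-based one, so a module whose class is subclassed or swapped out by a wrapper still resolves to a registered grad sampler. A small self-contained illustration of that lookup pattern; the sampler dict and ShardedLinear class below are stand-ins, not Opacus code:

import torch.nn as nn

# Stand-in registry mapping layer types to (hypothetical) sampler names.
GRAD_SAMPLERS = {
    nn.Linear: "compute_linear_grad_sample",
    nn.LayerNorm: "compute_layer_norm_grad_sample",
}

class ShardedLinear(nn.Linear):
    """Stand-in for a Linear whose class was subclassed by a sharding wrapper."""
    pass

module = ShardedLinear(8, 4)

# Exact-type lookup misses the subclass...
assert type(module) not in GRAD_SAMPLERS

# ...while the isinstance-based fallback used in add_hooks() resolves it to nn.Linear.
module_type = next(
    (cls for cls in GRAD_SAMPLERS if isinstance(module, cls)),
    type(module),
)
assert module_type is nn.Linear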
opacus/grad_sample/grad_sample_module_fast_gradient_clipping_fsdp.py (new file)

Lines changed: 219 additions & 0 deletions

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
from typing import List

from opacus.grad_sample.functorch import ft_compute_per_sample_gradient
from opacus.grad_sample.grad_sample_module_fast_gradient_clipping import (
    GradSampleModuleFastGradientClipping,
)
from opacus.utils.module_utils import requires_grad, trainable_parameters

import torch
import torch.nn as nn


logger = logging.getLogger(__name__)
logger.disabled = True


class GradSampleModuleFastGradientClippingFSDP(GradSampleModuleFastGradientClipping):
    """
    Hooks-based implementation of GradSampleModule with Fast Gradient and Ghost Clipping

    Computes norms of gradients without gradient instantiation
    """

    def __init__(
        self,
        m: nn.Module,
        *,
        batch_first=True,
        loss_reduction="mean",
        strict: bool = True,
        max_grad_norm=1,
    ):
        """

        Args:
            m: nn.Module to be wrapped
            batch_first: Flag to indicate if the input tensor to the corresponding module
                has the first dimension representing the batch. If set to True, dimensions on
                input tensor are expected be ``[batch_size, ...]``, otherwise
                ``[K, batch_size, ...]``
            loss_reduction: Indicates if the loss reduction (for aggregating the gradients)
                is a sum or a mean operation. Can take values "sum" or "mean"
            max_grad_norm: The value at which gradients are to be clipped.
            strict: If set to True, the input module will be validated to make sure that
                it does not have buffers in all its submodules.

        Raises:
            NotImplementedError
                If ``strict`` is set to ``True`` and module ``m`` (or any of its
                submodules) includes a buffer.
        """

        super().__init__(
            m,
            batch_first=batch_first,
            loss_reduction=loss_reduction,
            strict=strict,
            force_functorch=False,
            max_grad_norm=max_grad_norm,
            use_ghost_clipping=True,
        )

        self.sampler_classes = list(self.GRAD_SAMPLERS.keys()) + list(
            self.NORM_SAMPLERS.keys()
        )
        self.module_types = {}
        for m in self.iterate_submodules(m):
            if type(m) not in self.module_types:
                module_type = next(
                    (i for i in self.sampler_classes if isinstance(m, i)),
                    type(m),
                )
                self.module_types[type(m)] = module_type

    def get_clipping_coef(self) -> torch.Tensor:
        """Get per-example gradient scaling factor for clipping."""
        norm_sample = self.get_norm_sample()
        return (self.max_grad_norm / (norm_sample + 1e-6)).clamp(max=1.0)

    def get_norm_sample(self) -> torch.Tensor:
        """Get per-example gradient norms."""
        norm_sample = torch.stack(
            [
                per_param_norm
                for module in self.iterate_submodules(self._module)
                for per_param_norm in module.norm_sample
            ],
            dim=0,
        ).norm(2, dim=0)

        self.per_sample_gradient_norms = norm_sample
        return norm_sample

    def capture_activations_hook(
        self,
        module: nn.Module,
        forward_input: List[torch.Tensor],
        _forward_output: torch.Tensor,
    ):
        if (
            not requires_grad(module)
            or not module.training
            or not torch.is_grad_enabled()
            or not self.hooks_enabled
        ):
            return

        if not hasattr(module, "activations"):
            module.activations = []
        module.activations.append([t.detach() for t in forward_input])  # pyre-ignore

        if not hasattr(module, "forward_counter"):
            module.forward_counter = 0

        module.forward_counter += 1
        if self.use_ghost_clipping and module.forward_counter > 1:
            raise NotImplementedError("Parameter tying is not supported with FSDP")

    def capture_backprops_hook(
        self,
        module: nn.Module,
        _forward_input: torch.Tensor,
        forward_output: torch.Tensor,
        loss_reduction: str,
        batch_first: bool,
    ):
        """
        Computes norms of per sample gradient given the current backprops and activations
        stored by the associated forward hook. Computed per sample gradient norms are
        stored in ``norm_sample`` field in each parameter.

        Args:
            module: nn.Module,
            _forward_input: torch.Tensor,
            forward_output: torch.Tensor,
            loss_reduction: str,
            batch_first: bool,
        """
        if not self.hooks_enabled:
            return

        backprops = forward_output[0].detach()

        activations, backprops = self.rearrange_grad_samples(
            module=module,
            backprops=backprops,
            loss_reduction=loss_reduction,
            batch_first=batch_first,
        )

        if not hasattr(module, "norm_sample"):
            # currently, we don't support freezing and unfreezing params in between training. Making this a dictionary and mapping with param names might fix this.
            module.norm_sample = []
            for _, param in trainable_parameters(module):
                module.norm_sample.append(
                    torch.zeros(
                        torch.Size([module.max_batch_len, 1]),
                        device=param.device,
                        dtype=param.dtype,
                    )
                )

        module_type = self.module_types[type(module)]
        module.forward_counter -= 1
        if self.use_ghost_clipping and module_type in self.NORM_SAMPLERS:
            norm_sampler_fn = self.NORM_SAMPLERS[module_type]
            norm_samples = norm_sampler_fn(module, activations, backprops)

            for idx, (_, ns) in enumerate(
                (item for item in norm_samples.items() if item[0].requires_grad)
            ):
                module.norm_sample[idx] = ns
        else:
            if not self.force_functorch and module_type in self.GRAD_SAMPLERS:
                grad_sampler_fn = self.GRAD_SAMPLERS[module_type]
            else:
                grad_sampler_fn = ft_compute_per_sample_gradient

            grad_samples = grad_sampler_fn(module, activations, backprops)

            for idx, (_, gs) in enumerate((item for item in grad_samples.items())):
                module.norm_sample[idx] = gs.reshape(len(gs), -1).norm(2, dim=-1)
            del grad_samples

        if len(module.activations) == 0:
            if hasattr(module, "max_batch_len"):
                del module.max_batch_len

    @property
    def per_sample_gradient_norms(self) -> torch.Tensor:
        """Returns per sample gradient norms. Note that these are not privatized and should only be used for debugging purposes or in non-private settings"""
        if self._per_sample_gradient_norms is not None:
            return self._per_sample_gradient_norms
        else:
            raise AttributeError(
                "per_sample_gradient_norms is not set. Please call forward and backward on the model before accessing this property."
            )

    @per_sample_gradient_norms.setter
    def per_sample_gradient_norms(self, value):
        self._per_sample_gradient_norms = value
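To make the aggregation above concrete, here is a small numeric sketch (illustrative values, not part of the commit) of what get_norm_sample() and get_clipping_coef() compute from per-parameter, per-sample norms:

import torch

max_grad_norm = 1.0

# Per-parameter, per-sample gradient norms for two parameters and a batch of three.
per_param_norms = [torch.tensor([0.6, 3.0, 0.1]), torch.tensor([0.8, 4.0, 0.2])]

# get_norm_sample(): stack per-parameter norms and take the 2-norm across parameters.
norm_sample = torch.stack(per_param_norms, dim=0).norm(2, dim=0)
# approximately tensor([1.0000, 5.0000, 0.2236])

# get_clipping_coef(): per-sample scale factor, capped at 1 so small gradients stay untouched.
clip_coef = (max_grad_norm / (norm_sample + 1e-6)).clamp(max=1.0)
# approximately tensor([1.0000, 0.2000, 1.0000])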

opacus/grad_sample/utils.py

Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,9 @@
 from .grad_sample_module_fast_gradient_clipping import (
     GradSampleModuleFastGradientClipping,
 )
+from .grad_sample_module_fast_gradient_clipping_fsdp import (
+    GradSampleModuleFastGradientClippingFSDP,
+)
 from .gsm_base import AbstractGradSampleModule
 from .gsm_exp_weights import GradSampleModuleExpandedWeights
 from .gsm_no_op import GradSampleModuleNoOp
@@ -102,6 +105,8 @@ def get_gsm_class(grad_sample_mode: str) -> Type[AbstractGradSampleModule]:
         return GradSampleModuleExpandedWeights
     elif grad_sample_mode == "ghost":
         return GradSampleModuleFastGradientClipping
+    elif grad_sample_mode == "ghost_fsdp":
+        return GradSampleModuleFastGradientClippingFSDP
     elif grad_sample_mode == "no_op":
         return GradSampleModuleNoOp
     else:

opacus/optimizers/__init__.py

Lines changed: 9 additions & 1 deletion
@@ -18,6 +18,7 @@
 from .ddpoptimizer_fast_gradient_clipping import (
     DistributedDPOptimizerFastGradientClipping,
 )
+from .fsdpoptimizer_fast_gradient_clipping import FSDPOptimizerFastGradientClipping
 from .optimizer import DPOptimizer
 from .optimizer_fast_gradient_clipping import DPOptimizerFastGradientClipping
 from .perlayeroptimizer import DPPerLayerOptimizer
@@ -29,7 +30,8 @@
     "DPOptimizer",
     "DPOptimizerFastGradientClipping",
     "DistributedDPOptimizerFastGradientClipping",
-    "DPPerLayerOptimizer",
+    "FSDPOptimizerFastGradientClipping",
+    "DPPerLayerOptimizer",
     "SimpleDistributedPerLayerOptimizer",
 ]
 
@@ -44,6 +45,13 @@ def get_optimizer_class(clipping: str, distributed: bool, grad_sample_mode: str
             raise ValueError(
                 f"Unsupported combination of parameters. Clipping: {clipping} and grad_sample_mode: {grad_sample_mode}"
             )
+    elif grad_sample_mode == "ghost_fsdp":
+        if clipping == "flat" and distributed is True:
+            return FSDPOptimizerFastGradientClipping
+        else:
+            raise ValueError(
+                f"Unsupported combination of parameters. Clipping: {clipping}, distributed: {distributed}, and grad_sample_mode: {grad_sample_mode}"
+            )
     elif clipping == "flat" and distributed is False:
         return DPOptimizer
     elif clipping == "flat" and distributed is True:
