Skip to content

Commit b23bcb0

Browse files
committed
Add PMTUD blocking NFT and OF rules for remote nodes
Create nftables rules to block sending ICMP needs frag/packet too big for known Kubernetes node IPs. PMTUD between nodes can be detrimental to the cluster. Note, this does not affect PMTUD messages received from an intermediary router. For shared gateway mode, also install openflow rules to drop needs frag packets from OVN GR that are destined to known Kubernetes nodes. Signed-off-by: Tim Rozet <[email protected]>
1 parent f8bf30b commit b23bcb0

File tree

8 files changed

+979
-12
lines changed

8 files changed

+979
-12
lines changed

go-controller/pkg/node/default_node_network_controller.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"k8s.io/client-go/tools/record"
2626
"k8s.io/klog/v2"
2727
utilnet "k8s.io/utils/net"
28+
"sigs.k8s.io/knftables"
2829

2930
"github.com/ovn-org/libovsdb/client"
3031

@@ -40,6 +41,7 @@ import (
4041
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/controllers/egressservice"
4142
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/linkmanager"
4243
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport"
44+
nodenft "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/nftables"
4345
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/ovspinning"
4446
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/routemanager"
4547
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/apbroute"
@@ -117,6 +119,9 @@ type DefaultNodeNetworkController struct {
117119
// retry framework for endpoint slices, used for the removal of stale conntrack entries for services
118120
retryEndpointSlices *retry.RetryFramework
119121

122+
// retry framework for nodes, used for updating routes/nftables rules for node PMTUD guarding
123+
retryNodes *retry.RetryFramework
124+
120125
apbExternalRouteNodeController *apbroute.ExternalGatewayNodeController
121126

122127
networkManager networkmanager.Interface
@@ -181,12 +186,23 @@ func NewDefaultNodeNetworkController(cnnci *CommonNodeNetworkControllerInfo, net
181186

182187
nc.initRetryFrameworkForNode()
183188

189+
err = setupPMTUDNFTSets()
190+
if err != nil {
191+
return nil, fmt.Errorf("failed to setup PMTUD nftables sets: %w", err)
192+
}
193+
194+
err = setupPMTUDNFTChain()
195+
if err != nil {
196+
return nil, fmt.Errorf("failed to setup PMTUD nftables chain: %w", err)
197+
}
198+
184199
return nc, nil
185200
}
186201

187202
// initRetryFrameworkForNode builds the retry frameworks owned by this
// controller and stores them on the receiver: one for namespaces, one for
// endpoint slices and one for nodes.
func (nc *DefaultNodeNetworkController) initRetryFrameworkForNode() {
	// Namespaces (external gateway handling).
	nc.retryNamespaces = nc.newRetryFrameworkNode(factory.NamespaceExGwType)
	// Endpoint slices (stale conntrack entry removal).
	nc.retryEndpointSlices = nc.newRetryFrameworkNode(factory.EndpointSliceForStaleConntrackRemovalType)
	// Nodes (PMTUD guarding).
	nc.retryNodes = nc.newRetryFrameworkNode(factory.NodeType)
}
191207

192208
func (oc *DefaultNodeNetworkController) shouldReconcileNetworkChange(old, new util.NetInfo) bool {
@@ -1238,6 +1254,10 @@ func (nc *DefaultNodeNetworkController) Start(ctx context.Context) error {
12381254
if err != nil {
12391255
return fmt.Errorf("failed to watch endpointSlices: %w", err)
12401256
}
1257+
err = nc.WatchNodes()
1258+
if err != nil {
1259+
return fmt.Errorf("failed to watch nodes: %w", err)
1260+
}
12411261
}
12421262

12431263
if nc.healthzServer != nil {
@@ -1445,6 +1465,144 @@ func (nc *DefaultNodeNetworkController) WatchNamespaces() error {
14451465
return err
14461466
}
14471467

1468+
func (nc *DefaultNodeNetworkController) WatchNodes() error {
1469+
_, err := nc.retryNodes.WatchResource()
1470+
return err
1471+
}
1472+
1473+
// addOrUpdateNode handles creating flows or nftables rules for each node to handle PMTUD
1474+
func (nc *DefaultNodeNetworkController) addOrUpdateNode(node *corev1.Node) error {
1475+
var nftElems []*knftables.Element
1476+
var addrs []string
1477+
for _, address := range node.Status.Addresses {
1478+
if address.Type != corev1.NodeInternalIP {
1479+
continue
1480+
}
1481+
nodeIP := net.ParseIP(address.Address)
1482+
if nodeIP == nil {
1483+
continue
1484+
}
1485+
1486+
addrs = append(addrs, nodeIP.String())
1487+
klog.Infof("Adding remote node %q, IP: %s to PMTUD blocking rules", node.Name, nodeIP)
1488+
if utilnet.IsIPv4(nodeIP) {
1489+
nftElems = append(nftElems, &knftables.Element{
1490+
Set: types.NFTNoPMTUDRemoteNodeIPsv4,
1491+
Key: []string{nodeIP.String()},
1492+
})
1493+
} else {
1494+
nftElems = append(nftElems, &knftables.Element{
1495+
Set: types.NFTNoPMTUDRemoteNodeIPsv6,
1496+
Key: []string{nodeIP.String()},
1497+
})
1498+
}
1499+
}
1500+
1501+
gw := nc.Gateway.(*gateway)
1502+
gw.openflowManager.updateBridgePMTUDFlowCache(getPMTUDKey(node.Name), addrs)
1503+
1504+
if len(nftElems) > 0 {
1505+
if err := nodenft.UpdateNFTElements(nftElems); err != nil {
1506+
return fmt.Errorf("unable to update NFT elements for node %q, error: %w", node.Name, err)
1507+
}
1508+
}
1509+
1510+
return nil
1511+
}
1512+
1513+
func removePMTUDNodeNFTRules(nodeIPs []net.IP) error {
1514+
var nftElems []*knftables.Element
1515+
for _, nodeIP := range nodeIPs {
1516+
// Remove IPs from NFT sets
1517+
if utilnet.IsIPv4(nodeIP) {
1518+
nftElems = append(nftElems, &knftables.Element{
1519+
Set: types.NFTNoPMTUDRemoteNodeIPsv4,
1520+
Key: []string{nodeIP.String()},
1521+
})
1522+
} else {
1523+
nftElems = append(nftElems, &knftables.Element{
1524+
Set: types.NFTNoPMTUDRemoteNodeIPsv6,
1525+
Key: []string{nodeIP.String()},
1526+
})
1527+
}
1528+
}
1529+
if len(nftElems) > 0 {
1530+
if err := nodenft.DeleteNFTElements(nftElems); err != nil {
1531+
return err
1532+
}
1533+
}
1534+
return nil
1535+
}
1536+
1537+
func (nc *DefaultNodeNetworkController) deleteNode(node *corev1.Node) {
1538+
gw := nc.Gateway.(*gateway)
1539+
gw.openflowManager.deleteFlowsByKey(getPMTUDKey(node.Name))
1540+
ipsToRemove := make([]net.IP, 0)
1541+
for _, address := range node.Status.Addresses {
1542+
if address.Type != corev1.NodeInternalIP {
1543+
continue
1544+
}
1545+
nodeIP := net.ParseIP(address.Address)
1546+
if nodeIP == nil {
1547+
continue
1548+
}
1549+
ipsToRemove = append(ipsToRemove, nodeIP)
1550+
}
1551+
1552+
klog.Infof("Deleting NFT elements for node: %s", node.Name)
1553+
if err := removePMTUDNodeNFTRules(ipsToRemove); err != nil {
1554+
klog.Errorf("Failed to delete nftables rules for PMTUD blocking for node %q: %v", node.Name, err)
1555+
}
1556+
}
1557+
1558+
func (nc *DefaultNodeNetworkController) syncNodes(objs []interface{}) error {
1559+
var keepNFTSetElemsV4, keepNFTSetElemsV6 []*knftables.Element
1560+
var errors []error
1561+
klog.Infof("Starting node controller node sync")
1562+
start := time.Now()
1563+
for _, obj := range objs {
1564+
node, ok := obj.(*corev1.Node)
1565+
if !ok {
1566+
klog.Errorf("Spurious object in syncNodes: %v", obj)
1567+
continue
1568+
}
1569+
if node.Name == nc.name {
1570+
continue
1571+
}
1572+
for _, address := range node.Status.Addresses {
1573+
if address.Type != corev1.NodeInternalIP {
1574+
continue
1575+
}
1576+
nodeIP := net.ParseIP(address.Address)
1577+
if nodeIP == nil {
1578+
continue
1579+
}
1580+
1581+
// Remove IPs from NFT sets
1582+
if utilnet.IsIPv4(nodeIP) {
1583+
keepNFTSetElemsV4 = append(keepNFTSetElemsV4, &knftables.Element{
1584+
Set: types.NFTNoPMTUDRemoteNodeIPsv4,
1585+
Key: []string{nodeIP.String()},
1586+
})
1587+
} else {
1588+
keepNFTSetElemsV6 = append(keepNFTSetElemsV6, &knftables.Element{
1589+
Set: types.NFTNoPMTUDRemoteNodeIPsv6,
1590+
Key: []string{nodeIP.String()},
1591+
})
1592+
}
1593+
}
1594+
}
1595+
if err := recreateNFTSet(types.NFTNoPMTUDRemoteNodeIPsv4, keepNFTSetElemsV4); err != nil {
1596+
errors = append(errors, err)
1597+
}
1598+
if err := recreateNFTSet(types.NFTNoPMTUDRemoteNodeIPsv6, keepNFTSetElemsV6); err != nil {
1599+
errors = append(errors, err)
1600+
}
1601+
1602+
klog.Infof("Node controller node sync done. Time taken: %s", time.Since(start))
1603+
return utilerrors.Join(errors...)
1604+
}
1605+
14481606
// validateVTEPInterfaceMTU checks if the MTU of the interface that has ovn-encap-ip is big
14491607
// enough to carry the `config.Default.MTU` and the Geneve header. If the MTU is not big
14501608
// enough, it will return an error
@@ -1485,6 +1643,10 @@ func (nc *DefaultNodeNetworkController) validateVTEPInterfaceMTU() error {
14851643
return nil
14861644
}
14871645

1646+
// getPMTUDKey returns the key used for a node's PMTUD flows in the openflow
// manager, formed by appending "_pmtud" to the node name.
func getPMTUDKey(nodeName string) string {
	const keyFormat = "%s_pmtud"
	return fmt.Sprintf(keyFormat, nodeName)
}
1649+
14881650
// configureSvcRouteViaBridge delegates to configureSvcRouteViaInterface, using
// the bridge name as the interface and the addresses returned by
// DummyNextHopIPs as the next hops.
func configureSvcRouteViaBridge(routeManager *routemanager.Controller, bridge string) error {
	nextHops := DummyNextHopIPs()
	return configureSvcRouteViaInterface(routeManager, bridge, nextHops)
}

0 commit comments

Comments
 (0)