Skip to content

Commit d61ffa1

Browse files
committed
sdn: make pod operation metrics more useful and collectable
The pod operation error metrics were incremented in the wrong place to capture the overall pod setup/teardown operation. Move them so that they count any failure of the whole operation. In addition, the labels on the latency metrics (pod namespace, pod name, and sandbox ID) meant that every observation created a unique time series, so no aggregate statistics could be computed from them. Change the latency metric (and the pod error metrics) to follow the Kubelet dockershim DockerOperations[Latency|Errors] metric pattern, with a label for the operation type instead of the sandbox.
1 parent fe4f498 commit d61ffa1

File tree

3 files changed

+25
-58
lines changed

3 files changed

+25
-58
lines changed

pkg/network/node/metrics.go

Lines changed: 16 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ const (
2323
OVSFlowsKey = "ovs_flows"
2424
ARPCacheAvailableEntriesKey = "arp_cache_entries"
2525
PodIPsKey = "pod_ips"
26-
PodSetupErrorsKey = "pod_setup_errors"
27-
PodSetupLatencyKey = "pod_setup_latency"
28-
PodTeardownErrorsKey = "pod_teardown_errors"
29-
PodTeardownLatencyKey = "pod_teardown_latency"
26+
PodOperationsErrorsKey = "pod_operations_errors"
27+
PodOperationsLatencyKey = "pod_operations_latency"
3028
VnidNotFoundErrorsKey = "vnid_not_found_errors"
29+
30+
// Pod Operation types
31+
PodOperationSetup = "setup"
32+
PodOperationTeardown = "teardown"
3133
)
3234

3335
var (
@@ -58,42 +60,24 @@ var (
5860
},
5961
)
6062

61-
PodSetupErrors = prometheus.NewCounter(
62-
prometheus.CounterOpts{
63-
Namespace: SDNNamespace,
64-
Subsystem: SDNSubsystem,
65-
Name: PodSetupErrorsKey,
66-
Help: "Number pod setup errors",
67-
},
68-
)
69-
70-
PodSetupLatency = prometheus.NewSummaryVec(
71-
prometheus.SummaryOpts{
72-
Namespace: SDNNamespace,
73-
Subsystem: SDNSubsystem,
74-
Name: PodSetupLatencyKey,
75-
Help: "Latency of pod network setup in microseconds",
76-
},
77-
[]string{"pod_namespace", "pod_name", "sandbox_id"},
78-
)
79-
80-
PodTeardownErrors = prometheus.NewCounter(
63+
PodOperationsErrors = prometheus.NewCounterVec(
8164
prometheus.CounterOpts{
8265
Namespace: SDNNamespace,
8366
Subsystem: SDNSubsystem,
84-
Name: PodTeardownErrorsKey,
85-
Help: "Number pod teardown errors",
67+
Name: PodOperationsErrorsKey,
68+
Help: "Cumulative number of SDN operation errors by operation type",
8669
},
70+
[]string{"operation_type"},
8771
)
8872

89-
PodTeardownLatency = prometheus.NewSummaryVec(
73+
PodOperationsLatency = prometheus.NewSummaryVec(
9074
prometheus.SummaryOpts{
9175
Namespace: SDNNamespace,
9276
Subsystem: SDNSubsystem,
93-
Name: PodTeardownLatencyKey,
94-
Help: "Latency of pod network teardown in microseconds",
77+
Name: PodOperationsLatencyKey,
78+
Help: "Latency in microseconds of SDN operations by operation type",
9579
},
96-
[]string{"pod_namespace", "pod_name", "sandbox_id"},
80+
[]string{"operation_type"},
9781
)
9882

9983
VnidNotFoundErrors = prometheus.NewCounter(
@@ -121,10 +105,8 @@ func RegisterMetrics() {
121105
prometheus.MustRegister(OVSFlows)
122106
prometheus.MustRegister(ARPCacheAvailableEntries)
123107
prometheus.MustRegister(PodIPs)
124-
prometheus.MustRegister(PodSetupErrors)
125-
prometheus.MustRegister(PodSetupLatency)
126-
prometheus.MustRegister(PodTeardownErrors)
127-
prometheus.MustRegister(PodTeardownLatency)
108+
prometheus.MustRegister(PodOperationsErrors)
109+
prometheus.MustRegister(PodOperationsLatency)
128110
prometheus.MustRegister(VnidNotFoundErrors)
129111
})
130112
}

pkg/network/node/ovscontroller.go

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -288,22 +288,11 @@ func getPodNote(sandboxID string) (string, error) {
288288
}
289289

290290
func (oc *ovsController) SetUpPod(hostVeth, podIP, podMAC, sandboxID string, vnid uint32) (int, error) {
291-
var (
292-
err error
293-
note string
294-
ofport int
295-
)
296-
defer func() {
297-
if err != nil {
298-
PodSetupErrors.Inc()
299-
}
300-
}()
301-
302-
note, err = getPodNote(sandboxID)
291+
note, err := getPodNote(sandboxID)
303292
if err != nil {
304293
return -1, err
305294
}
306-
ofport, err = oc.ensureOvsPort(hostVeth)
295+
ofport, err := oc.ensureOvsPort(hostVeth)
307296
if err != nil {
308297
return -1, err
309298
}
@@ -422,15 +411,7 @@ func (oc *ovsController) TearDownPod(hostVeth, podIP, sandboxID string) error {
422411
podIP = ip
423412
}
424413

425-
var err error
426-
defer func() {
427-
if err != nil {
428-
PodTeardownErrors.Inc()
429-
}
430-
}()
431-
432-
err = oc.cleanupPodFlows(podIP)
433-
if err != nil {
414+
if err := oc.cleanupPodFlows(podIP); err != nil {
434415
return err
435416
}
436417
_ = oc.SetPodBandwidth(hostVeth, -1, -1)

pkg/network/node/pod.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ func (m *podManager) processRequest(request *cniserver.PodRequest) *cniserver.Po
311311
}
312312
}
313313
if err != nil {
314+
PodOperationsErrors.WithLabelValues(PodOperationSetup).Inc()
314315
result.Err = err
315316
}
316317
case cniserver.CNI_UPDATE:
@@ -329,6 +330,9 @@ func (m *podManager) processRequest(request *cniserver.PodRequest) *cniserver.Po
329330
}
330331
}
331332
result.Err = m.podHandler.teardown(request)
333+
if result.Err != nil {
334+
PodOperationsErrors.WithLabelValues(PodOperationTeardown).Inc()
335+
}
332336
default:
333337
result.Err = fmt.Errorf("unhandled CNI request %v", request.Command)
334338
}
@@ -543,7 +547,7 @@ func podIsExited(p *kcontainer.Pod) bool {
543547

544548
// Set up all networking (host/container veth, OVS flows, IPAM, loopback, etc)
545549
func (m *podManager) setup(req *cniserver.PodRequest) (cnitypes.Result, *runningPod, error) {
546-
defer PodSetupLatency.WithLabelValues(req.PodNamespace, req.PodName, req.SandboxID).Observe(sinceInMicroseconds(time.Now()))
550+
defer PodOperationsLatency.WithLabelValues(PodOperationSetup).Observe(sinceInMicroseconds(time.Now()))
547551

548552
pod, err := m.kClient.Core().Pods(req.PodNamespace).Get(req.PodName, metav1.GetOptions{})
549553
if err != nil {
@@ -672,7 +676,7 @@ func (m *podManager) update(req *cniserver.PodRequest) (uint32, error) {
672676

673677
// Clean up all pod networking (clear OVS flows, release IPAM lease, remove host/container veth)
674678
func (m *podManager) teardown(req *cniserver.PodRequest) error {
675-
defer PodTeardownLatency.WithLabelValues(req.PodNamespace, req.PodName, req.SandboxID).Observe(sinceInMicroseconds(time.Now()))
679+
defer PodOperationsLatency.WithLabelValues(PodOperationTeardown).Observe(sinceInMicroseconds(time.Now()))
676680

677681
netnsValid := true
678682
if err := ns.IsNSorErr(req.Netns); err != nil {

0 commit comments

Comments
 (0)