Skip to content

Commit 9614f2d

Browse files
authored
Merge pull request #2829 from kragniz/prometheus-oom-count
Expose OOM event count to prometheus
2 parents de11763 + fa07332 commit 9614f2d

File tree

8 files changed

+46
-1
lines changed

8 files changed

+46
-1
lines changed

cmd/cadvisor.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ var (
112112
container.CPUTopologyMetrics: struct{}{},
113113
container.ResctrlMetrics: struct{}{},
114114
container.CPUSetMetrics: struct{}{},
115+
container.OOMMetrics: struct{}{},
115116
}
116117
)
117118

cmd/cadvisor_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ func TestToIncludedMetrics(t *testing.T) {
109109
container.CPUTopologyMetrics: struct{}{},
110110
container.ResctrlMetrics: struct{}{},
111111
container.CPUSetMetrics: struct{}{},
112+
container.OOMMetrics: struct{}{},
112113
},
113114
container.AllMetrics,
114115
{},

container/factory.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ const (
6464
CPUTopologyMetrics MetricKind = "cpu_topology"
6565
ResctrlMetrics MetricKind = "resctrl"
6666
CPUSetMetrics MetricKind = "cpuset"
67+
OOMMetrics MetricKind = "oom_event"
6768
)
6869

6970
// AllMetrics represents all kinds of metrics that cAdvisor supports.
@@ -89,6 +90,7 @@ var AllMetrics = MetricSet{
8990
CPUTopologyMetrics: struct{}{},
9091
ResctrlMetrics: struct{}{},
9192
CPUSetMetrics: struct{}{},
93+
OOMMetrics: struct{}{},
9294
}
9395

9496
func (mk MetricKind) String() string {

info/v1/container.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -963,6 +963,8 @@ type ContainerStats struct {
963963
Resctrl ResctrlStats `json:"resctrl,omitempty"`
964964

965965
CpuSet CPUSetStats `json:"cpuset,omitempty"`
966+
967+
OOMEvents uint64 `json:"oom_events,omitempty"`
966968
}
967969

968970
func timeEq(t1, t2 time.Time, tolerance time.Duration) bool {

manager/container.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"strconv"
2828
"strings"
2929
"sync"
30+
"sync/atomic"
3031
"time"
3132

3233
"github.com/google/cadvisor/cache/memory"
@@ -102,6 +103,8 @@ type containerData struct {
102103

103104
// resctrlCollector updates stats for resctrl controller.
104105
resctrlCollector stats.Collector
106+
107+
oomEvents uint64
105108
}
106109

107110
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
@@ -668,6 +671,9 @@ func (cd *containerData) updateStats() error {
668671
klog.V(2).Infof("Failed to add summary stats for %q: %v", cd.info.Name, err)
669672
}
670673
}
674+
675+
stats.OOMEvents = atomic.LoadUint64(&cd.oomEvents)
676+
671677
var customStatsErr error
672678
cm := cd.collectorManager.(*collector.GenericCollectorManager)
673679
if len(cm.Collectors) > 0 {

manager/manager.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"strconv"
2525
"strings"
2626
"sync"
27+
"sync/atomic"
2728
"time"
2829

2930
"github.com/google/cadvisor/accelerators"
@@ -35,7 +36,7 @@ import (
3536
"github.com/google/cadvisor/events"
3637
"github.com/google/cadvisor/fs"
3738
info "github.com/google/cadvisor/info/v1"
38-
"github.com/google/cadvisor/info/v2"
39+
v2 "github.com/google/cadvisor/info/v2"
3940
"github.com/google/cadvisor/machine"
4041
"github.com/google/cadvisor/nvm"
4142
"github.com/google/cadvisor/perf"
@@ -1237,6 +1238,24 @@ func (m *manager) watchForNewOoms() error {
12371238
if err != nil {
12381239
klog.Errorf("failed to add OOM kill event for %q: %v", oomInstance.ContainerName, err)
12391240
}
1241+
1242+
// Count OOM events for later collection by Prometheus
1243+
request := v2.RequestOptions{
1244+
IdType: v2.TypeName,
1245+
Count: 1,
1246+
}
1247+
conts, err := m.getRequestedContainers(oomInstance.ContainerName, request)
1248+
if err != nil {
1249+
klog.V(2).Infof("failed getting container info for %q: %v", oomInstance.ContainerName, err)
1250+
continue
1251+
}
1252+
if len(conts) != 1 {
1253+
klog.V(2).Info("Expected the request to match only one container")
1254+
continue
1255+
}
1256+
for _, cont := range conts {
1257+
atomic.AddUint64(&cont.oomEvents, 1)
1258+
}
12401259
}
12411260
}()
12421261
return nil

metrics/prometheus.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1757,6 +1757,17 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
17571757
},
17581758
}...)
17591759
}
1760+
if includedMetrics.Has(container.OOMMetrics) {
1761+
c.containerMetrics = append(c.containerMetrics, containerMetric{
1762+
name: "container_oom_events_total",
1763+
help: "Count of out of memory events observed for the container",
1764+
valueType: prometheus.CounterValue,
1765+
getValues: func(s *info.ContainerStats) metricValues {
1766+
return metricValues{{value: float64(s.OOMEvents), timestamp: s.Timestamp}}
1767+
},
1768+
})
1769+
}
1770+
17601771
return c
17611772
}
17621773

metrics/testdata/prometheus_metrics

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f
358358
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0 1395066363000
359359
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0 1395066363000
360360
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0 1395066363000
361+
# HELP container_oom_events_total Count of out of memory events observed for the container
362+
# TYPE container_oom_events_total counter
363+
container_oom_events_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0 1395066363000
361364
# HELP container_perf_events_total Perf event metric.
362365
# TYPE container_perf_events_total counter
363366
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="0",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 123 1395066363000

0 commit comments

Comments
 (0)